Refactor dist_block

1. Split dist_block into dist_block_tx_domain and
dist_block_px_domain.
2. Move the calculation of block_sse outside of the
tx_type loop to avoid calling pixel_diff_dist repeatedly.

3. The encoder is about 0.3% faster when encoding
15 frames of city_cif (406840 ms -> 405627 ms).

a) gcc (Ubuntu 5.4.0-6ubuntu1~16.04.9) 5.4.0 20160609
b) CPU: Intel(R) Core(TM) i5-4590 CPU @ 3.30GHz
c) Config cmd:
cmake ../ -DENABLE_CCACHE=1 -DCONFIG_LOWBITDEPTH=1
d) Test cmd:
./aomenc --cpu-used=1 --end-usage=vbr \
--target-bitrate=600 --limit=15

Change-Id: I1d179f4e4cb750c682a012b1a24513f41b0f2875
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index c6884f5..bcc0eb0 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1898,32 +1898,30 @@
 
 // Compute the pixel domain distortion from diff on all visible 4x4s in the
 // transform block.
-static int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
-                               const int16_t *diff, const int diff_stride,
-                               int blk_row, int blk_col,
-                               const BLOCK_SIZE plane_bsize,
-                               const BLOCK_SIZE tx_bsize) {
+static INLINE int64_t pixel_diff_dist(const MACROBLOCK *x, int plane,
+                                      int blk_row, int blk_col,
+                                      const BLOCK_SIZE plane_bsize,
+                                      const BLOCK_SIZE tx_bsize) {
   int visible_rows, visible_cols;
   const MACROBLOCKD *xd = &x->e_mbd;
+  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
+                     NULL, &visible_cols, &visible_rows);
+  const int diff_stride = block_size_wide[plane_bsize];
+  const int16_t *diff = x->plane[plane].src_diff;
 #if CONFIG_DIST_8X8
   int txb_height = block_size_high[tx_bsize];
   int txb_width = block_size_wide[tx_bsize];
-  const int src_stride = x->plane[plane].src.stride;
-  const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
-  const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-#endif
-
-  get_txb_dimensions(xd, plane, plane_bsize, blk_row, blk_col, tx_bsize, NULL,
-                     NULL, &visible_cols, &visible_rows);
-
-#if CONFIG_DIST_8X8
-  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8)
+  if (x->using_dist_8x8 && plane == 0 && txb_width >= 8 && txb_height >= 8) {
+    const int src_stride = x->plane[plane].src.stride;
+    const int src_idx = (blk_row * src_stride + blk_col)
+                        << tx_size_wide_log2[0];
+    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
     return dist_8x8_diff(x, src, src_stride, diff, diff_stride, txb_width,
                          txb_height, visible_cols, visible_rows, x->qindex);
-  else
+  }
 #endif
-    return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols,
-                                  visible_rows);
+  diff += ((blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]);
+  return aom_sum_squares_2d_i16(diff, diff_stride, visible_cols, visible_rows);
 }
 
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
@@ -2004,129 +2002,100 @@
          tx_size;
 }
 
-static void dist_block(const AV1_COMP *cpi, MACROBLOCK *x, int plane,
-                       BLOCK_SIZE plane_bsize, int block, int blk_row,
-                       int blk_col, TX_SIZE tx_size, int64_t *out_dist,
-                       int64_t *out_sse, OUTPUT_STATUS output_status,
-                       int use_transform_domain_distortion) {
+static INLINE void dist_block_tx_domain(MACROBLOCK *x, int plane, int block,
+                                        TX_SIZE tx_size, int64_t *out_dist,
+                                        int64_t *out_sse) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  // Transform domain distortion computation is more efficient as it does
+  // not involve an inverse transform, but it is less accurate.
+  const int buffer_length = av1_get_max_eob(tx_size);
+  int64_t this_sse;
+  // TX-domain results need to shift down to Q2/D10 to match pixel
+  // domain distortion values which are in Q2^2
+  int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length, &this_sse,
+                                       xd->bd);
+  else
+    *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+
+  *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
+  *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+}
+
+static INLINE int64_t dist_block_px_domain(const AV1_COMP *cpi, MACROBLOCK *x,
+                                           int plane, BLOCK_SIZE plane_bsize,
+                                           int block, int blk_row, int blk_col,
+                                           TX_SIZE tx_size) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const uint16_t eob = p->eobs[block];
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  const int bsw = block_size_wide[tx_bsize];
+  const int bsh = block_size_high[tx_bsize];
+  const int src_stride = x->plane[plane].src.stride;
+  const int dst_stride = xd->plane[plane].dst.stride;
+  // Scale the transform block index to pixel unit.
+  const int src_idx = (blk_row * src_stride + blk_col) << tx_size_wide_log2[0];
+  const int dst_idx = (blk_row * dst_stride + blk_col) << tx_size_wide_log2[0];
+  const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+  const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+  const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 
-  // When eob is 0, pixel domain distortion is more efficient and accurate.
-  if (!eob) use_transform_domain_distortion = 0;
-  if (use_transform_domain_distortion) {
-    // Transform domain distortion computation is more efficient as it does
-    // not involve an inverse transform, but it is less accurate.
-    const int buffer_length = av1_get_max_eob(tx_size);
-    int64_t this_sse;
-    // TX-domain results need to shift down to Q2/D10 to match pixel
-    // domain distortion values which are in Q2^2
-    int shift = (MAX_TX_SCALE - av1_get_tx_scale(tx_size)) * 2;
-    tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  assert(cpi != NULL);
+  assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
 
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-      *out_dist = av1_highbd_block_error(coeff, dqcoeff, buffer_length,
-                                         &this_sse, xd->bd);
-    else
-      *out_dist = av1_block_error(coeff, dqcoeff, buffer_length, &this_sse);
+  uint8_t *recon;
+  DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
 
-    *out_dist = RIGHT_SIGNED_SHIFT(*out_dist, shift);
-    *out_sse = RIGHT_SIGNED_SHIFT(this_sse, shift);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    recon = CONVERT_TO_BYTEPTR(recon16);
+    av1_highbd_convolve_2d_copy_sr(CONVERT_TO_SHORTPTR(dst), dst_stride,
+                                   CONVERT_TO_SHORTPTR(recon), MAX_TX_SIZE, bsw,
+                                   bsh, NULL, NULL, 0, 0, NULL, xd->bd);
   } else {
-    const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
-    const int bsw = block_size_wide[tx_bsize];
-    const int bsh = block_size_high[tx_bsize];
-    const int src_stride = x->plane[plane].src.stride;
-    const int dst_stride = xd->plane[plane].dst.stride;
-    // Scale the transform block index to pixel unit.
-    const int src_idx = (blk_row * src_stride + blk_col)
-                        << tx_size_wide_log2[0];
-    const int dst_idx = (blk_row * dst_stride + blk_col)
-                        << tx_size_wide_log2[0];
-    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
-    const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
-    const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    recon = (uint8_t *)recon16;
+    av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh, NULL,
+                            NULL, 0, 0, NULL);
+  }
 
-    assert(cpi != NULL);
-    assert(tx_size_wide_log2[0] == tx_size_high_log2[0]);
-
-    {
-      const int diff_stride = block_size_wide[plane_bsize];
-      const int diff_idx = (blk_row * diff_stride + blk_col)
-                           << tx_size_wide_log2[0];
-      const int16_t *diff = &p->src_diff[diff_idx];
-      *out_sse = pixel_diff_dist(x, plane, diff, diff_stride, blk_row, blk_col,
-                                 plane_bsize, tx_bsize);
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        *out_sse = ROUND_POWER_OF_TWO(*out_sse, (xd->bd - 8) * 2);
-    }
-    *out_sse *= 16;
-
-    if (eob) {
-      if (output_status == OUTPUT_HAS_DECODED_PIXELS) {
-        *out_dist = pixel_dist(cpi, x, plane, src, src_stride, dst, dst_stride,
-                               blk_row, blk_col, plane_bsize, tx_bsize);
-      } else {
-        uint8_t *recon;
-        DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
-
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-          recon = CONVERT_TO_BYTEPTR(recon16);
-        else
-          recon = (uint8_t *)recon16;
-
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          av1_highbd_convolve_2d_copy_sr(
-              CONVERT_TO_SHORTPTR(dst), dst_stride, CONVERT_TO_SHORTPTR(recon),
-              MAX_TX_SIZE, bsw, bsh, NULL, NULL, 0, 0, NULL, xd->bd);
-        } else {
-          av1_convolve_2d_copy_sr(dst, dst_stride, recon, MAX_TX_SIZE, bsw, bsh,
-                                  NULL, NULL, 0, 0, NULL);
-        }
-
-        const PLANE_TYPE plane_type = get_plane_type(plane);
-        TX_TYPE tx_type =
-            av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
-                            cpi->common.reduced_tx_set_used);
-        av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
-                                    MAX_TX_SIZE, eob,
+  const PLANE_TYPE plane_type = get_plane_type(plane);
+  TX_TYPE tx_type = av1_get_tx_type(plane_type, xd, blk_row, blk_col, tx_size,
                                     cpi->common.reduced_tx_set_used);
-
+  av1_inverse_transform_block(xd, dqcoeff, plane, tx_type, tx_size, recon,
+                              MAX_TX_SIZE, eob,
+                              cpi->common.reduced_tx_set_used);
 #if CONFIG_DIST_8X8
-        if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
-          // Save decoded pixels for inter block in pd->pred to avoid
-          // block_8x8_rd_txfm_daala_dist() need to produce them
-          // by calling av1_inverse_transform_block() again.
-          const int pred_stride = block_size_wide[plane_bsize];
-          const int pred_idx = (blk_row * pred_stride + blk_col)
-                               << tx_size_wide_log2[0];
-          int16_t *pred = &x->pred_luma[pred_idx];
-          int i, j;
+  if (x->using_dist_8x8 && plane == 0 && (bsw < 8 || bsh < 8)) {
+    // Save decoded pixels for inter block in pd->pred to avoid
+    // block_8x8_rd_txfm_daala_dist() need to produce them
+    // by calling av1_inverse_transform_block() again.
+    const int pred_stride = block_size_wide[plane_bsize];
+    const int pred_idx = (blk_row * pred_stride + blk_col)
+                         << tx_size_wide_log2[0];
+    int16_t *pred = &x->pred_luma[pred_idx];
+    int i, j;
 
-          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-            for (j = 0; j < bsh; j++)
-              for (i = 0; i < bsw; i++)
-                pred[j * pred_stride + i] =
-                    CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
-          } else {
-            for (j = 0; j < bsh; j++)
-              for (i = 0; i < bsw; i++)
-                pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
-          }
-        }
-#endif  // CONFIG_DIST_8X8
-        *out_dist =
-            pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
-                       blk_row, blk_col, plane_bsize, tx_bsize);
-      }
-      *out_dist *= 16;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++)
+          pred[j * pred_stride + i] =
+              CONVERT_TO_SHORTPTR(recon)[j * MAX_TX_SIZE + i];
     } else {
-      *out_dist = *out_sse;
+      for (j = 0; j < bsh; j++)
+        for (i = 0; i < bsw; i++)
+          pred[j * pred_stride + i] = recon[j * MAX_TX_SIZE + i];
     }
   }
+#endif  // CONFIG_DIST_8X8
+  return 16 * pixel_dist(cpi, x, plane, src, src_stride, recon, MAX_TX_SIZE,
+                         blk_row, blk_col, plane_bsize, tx_bsize);
 }
 
   // NOTE: CONFIG_COLLECT_RD_STATS takes 3 possible values
@@ -2536,12 +2505,22 @@
 #if CONFIG_DIST_8X8
   if (x->using_dist_8x8) use_transform_domain_distortion = 0;
 #endif
+  const uint16_t *eobs_ptr = x->plane[plane].eobs;
+
+  const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+  int64_t block_sse =
+      pixel_diff_dist(x, plane, blk_row, blk_col, plane_bsize, tx_bsize);
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    block_sse = ROUND_POWER_OF_TWO(block_sse, (xd->bd - 8) * 2);
+  block_sse *= 16;
+
   for (TX_TYPE tx_type = txk_start; tx_type <= txk_end; ++tx_type) {
     if (!allowed_tx_mask[tx_type]) continue;
     if (plane == 0) mbmi->txk_type[txk_type_idx] = tx_type;
     last_tx_type = tx_type;
     RD_STATS this_rd_stats;
     av1_invalid_rd_stats(&this_rd_stats);
+
     if (!cpi->optimize_seg_arr[mbmi->segment_id]) {
       av1_xform_quant(
           cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
@@ -2552,11 +2531,10 @@
       av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize,
                       tx_size, AV1_XFORM_QUANT_FP);
       if (cpi->sf.optimize_b_precheck && best_rd < INT64_MAX &&
-          x->plane[plane].eobs[block] >= 4) {
+          eobs_ptr[block] >= 4) {
         // Calculate distortion quickly in transform domain.
-        dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
-                   &this_rd_stats.dist, &this_rd_stats.sse,
-                   OUTPUT_HAS_PREDICTED_PIXELS, 1);
+        dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                             &this_rd_stats.sse);
         rate_cost =
             av1_cost_coeffs(cm, x, plane_bsize, plane, blk_row, blk_col, block,
                             tx_size, a, l, use_fast_coef_costing);
@@ -2569,9 +2547,17 @@
       av1_optimize_b(cpi, x, plane, blk_row, blk_col, block, plane_bsize,
                      tx_size, a, l, 1, &rate_cost);
     }
-    dist_block(cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size,
-               &this_rd_stats.dist, &this_rd_stats.sse,
-               OUTPUT_HAS_PREDICTED_PIXELS, use_transform_domain_distortion);
+    if (eobs_ptr[block] == 0) {
+      // When eob is 0, pixel domain distortion is more efficient and accurate.
+      this_rd_stats.dist = this_rd_stats.sse = block_sse;
+    } else if (use_transform_domain_distortion) {
+      dist_block_tx_domain(x, plane, block, tx_size, &this_rd_stats.dist,
+                           &this_rd_stats.sse);
+    } else {
+      this_rd_stats.dist = dist_block_px_domain(
+          cpi, x, plane, plane_bsize, block, blk_row, blk_col, tx_size);
+      this_rd_stats.sse = block_sse;
+    }
 
     this_rd_stats.rate = rate_cost;
 
@@ -4991,7 +4977,7 @@
   const MACROBLOCKD *xd = &x->e_mbd;
   const int16_t dc_q = av1_dc_quant_QTX(x->qindex, 0, xd->bd);
 
-  *dist = pixel_diff_dist(x, 0, x->plane[0].src_diff, bw, 0, 0, bsize, bsize);
+  *dist = pixel_diff_dist(x, 0, 0, 0, bsize, bsize);
   const int64_t mse = *dist / bw / bh;
   // Normalized quantizer takes the transform upscaling factor (8 for tx size
   // smaller than 32) into account.
diff --git a/av1/encoder/rdopt.h b/av1/encoder/rdopt.h
index 49117c5..da8846c 100644
--- a/av1/encoder/rdopt.h
+++ b/av1/encoder/rdopt.h
@@ -54,11 +54,6 @@
 }
 #endif
 
-typedef enum OUTPUT_STATUS {
-  OUTPUT_HAS_PREDICTED_PIXELS,
-  OUTPUT_HAS_DECODED_PIXELS
-} OUTPUT_STATUS;
-
 // Returns the number of colors in 'src'.
 int av1_count_colors(const uint8_t *src, int stride, int rows, int cols,
                      int *val_count);