Handle the block offset using min transform block size

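Convert block offsets to pixel positions by shifting by the log2 of the
minimum transform block size (tx_size_wide_log2[0] and
tx_size_high_log2[0]) instead of multiplying by a hard-coded 4. This
change does not alter the coding statistics. The encoding speed is
slightly faster (<1%).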

Change-Id: I2641d2b1e317ef4cdf477718c446ef025b8eef41
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index 3c6f383..c213b77 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1581,8 +1581,8 @@
   const int txw = tx_size_wide_unit[tx_size];
   const int have_top = row_off || xd->up_available;
   const int have_left = col_off || xd->left_available;
-  const int x = col_off * 4;
-  const int y = row_off * 4;
+  const int x = col_off << tx_size_wide_log2[0];
+  const int y = row_off << tx_size_high_log2[0];
   const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
   const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   const int txwpx = tx_size_wide[tx_size];
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index 0e857d9..de2b477 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -514,7 +514,8 @@
   QUANT_PARAM qparam;
   const int16_t *src_diff;
 
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  src_diff =
+      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
   qparam.log_scale = get_tx_scale(tx_size);
 #if CONFIG_NEW_QUANT
   qparam.tx_size = tx_size;
@@ -543,10 +544,11 @@
     assert(block < MAX_PVQ_BLOCKS_IN_SB);
     pvq_info = &x->pvq[block][plane];
   }
-  dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
-  src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-  src_int16 = &p->src_int16[4 * (blk_row * diff_stride + blk_col)];
-  pred = &pd->pred[4 * (blk_row * diff_stride + blk_col)];
+  dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+  src_int16 =
+      &p->src_int16[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
+  pred = &pd->pred[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
 
   // transform block size in pixels
   tx_blk_size = tx_size_wide[tx_size];
@@ -641,7 +643,8 @@
   int i;
   const int bwl = b_width_log2_lookup[plane_bsize];
 #endif
-  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  dst = &pd->dst
+             .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
   a = &args->ta[blk_col];
   l = &args->tl[blk_row];
 #if CONFIG_VAR_TX
@@ -786,7 +789,8 @@
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint8_t *dst;
   int ctx = 0;
-  dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+  dst = &pd->dst
+             .buf[(blk_row * pd->dst.stride + blk_col) << tx_size_wide_log2[0]];
 
 #if CONFIG_NEW_QUANT
   av1_xform_quant(cm, x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
@@ -952,9 +956,10 @@
 
   assert(tx1d_width == tx1d_height);
 
-  dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
-  src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+  dst = &pd->dst.buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+  src = &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+  src_diff =
+      &p->src_diff[(blk_row * diff_stride + blk_col) << tx_size_wide_log2[0]];
   mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
   av1_predict_intra_block(xd, pd->width, pd->height, tx_size, mode, dst,
                           dst_stride, dst, dst_stride, blk_col, blk_row, plane);
diff --git a/av1/encoder/rdopt.c b/av1/encoder/rdopt.c
index 69dc233..941f465 100644
--- a/av1/encoder/rdopt.c
+++ b/av1/encoder/rdopt.c
@@ -1208,9 +1208,13 @@
       const int dst_stride = pd->dst.stride;
       const int diff_stride = block_size_wide[plane_bsize];
 
-      const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
-      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
-      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+      const uint8_t *src =
+          &p->src.buf[(blk_row * src_stride + blk_col) << tx_size_wide_log2[0]];
+      const uint8_t *dst =
+          &pd->dst
+               .buf[(blk_row * dst_stride + blk_col) << tx_size_wide_log2[0]];
+      const int16_t *diff = &p->src_diff[(blk_row * diff_stride + blk_col)
+                                         << tx_size_wide_log2[0]];
 
       unsigned int tmp;
       this_rd_stats.sse = sum_squares_2d(diff, diff_stride, tx_size);