Pad extra columns in txb levels and signs

This makes it possible to remove the horizontal availability check.

Change-Id: Ie3b97eea63b4cc79ec78df119c2730a4d7cf539d
diff --git a/av1/common/enums.h b/av1/common/enums.h
index b368701..fe4ce59 100644
--- a/av1/common/enums.h
+++ b/av1/common/enums.h
@@ -243,13 +243,17 @@
 #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2)
 #define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE)
 
-#define TX_PAD_HOR 0
+// Pad 4 extra columns to remove the horizontal availability check.
+#define TX_PAD_HOR 4
 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability
 // check.
 #define TX_PAD_TOP 2
 #define TX_PAD_BOTTOM 4
 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM)
-#define TX_PAD_2D ((MAX_TX_SIZE + TX_PAD_HOR) * (MAX_TX_SIZE + TX_PAD_VER))
+// Pad 16 extra bytes to avoid out-of-bounds reads in SIMD optimizations.
+#define TX_PAD_END 16
+#define TX_PAD_2D \
+  ((MAX_TX_SIZE + TX_PAD_HOR) * (MAX_TX_SIZE + TX_PAD_VER) + TX_PAD_END)
 
 // Number of maxium size transform blocks in the maximum size superblock
 #define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h
index 6d0d7bd..71471da 100644
--- a/av1/common/txb_common.h
+++ b/av1/common/txb_common.h
@@ -80,20 +80,22 @@
   return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR);
 }
 
+static INLINE int get_paded_idx(const int idx, const int bwl) {
+  return idx + TX_PAD_HOR * (idx >> bwl);
+}
+
 static INLINE int get_level_count_mag(int *const mag,
                                       const uint8_t *const levels,
-                                      const int bwl, const int row,
+                                      const int stride, const int row,
                                       const int col, const int level,
                                       const int (*nb_offset)[2],
                                       const int nb_num) {
-  const int stride = 1 << bwl;
   int count = 0;
 
   for (int idx = 0; idx < nb_num; ++idx) {
     const int ref_row = row + nb_offset[idx][0];
     const int ref_col = col + nb_offset[idx][1];
-    if (ref_col < 0 || ref_col >= stride) continue;
-    const int pos = (ref_row << bwl) + ref_col;
+    const int pos = ref_row * stride + ref_col;
     count += levels[pos] > level;
     if (nb_offset[idx][0] == 0 && nb_offset[idx][1] == 1) mag[0] = levels[pos];
     if (nb_offset[idx][0] == 1 && nb_offset[idx][1] == 0) mag[1] = levels[pos];
@@ -171,11 +173,12 @@
                                const int bwl, const int level) {
   const int row = c >> bwl;
   const int col = c - (row << bwl);
+  const int stride = (1 << bwl) + TX_PAD_HOR;
   const int level_minus_1 = level - 1;
   int mag_count = 0;
   int nb_mag[3] = { 0 };
   const int count =
-      get_level_count_mag(nb_mag, levels, bwl, row, col, level_minus_1,
+      get_level_count_mag(nb_mag, levels, stride, row, col, level_minus_1,
                           base_ref_offset, BASE_CONTEXT_POSITION_NUM);
 
   for (int idx = 0; idx < 3; ++idx) mag_count += nb_mag[idx] > level;
@@ -272,11 +275,12 @@
                              const int bwl) {
   const int row = c >> bwl;
   const int col = c - (row << bwl);
+  const int stride = (1 << bwl) + TX_PAD_HOR;
   const int level_minus_1 = NUM_BASE_LEVELS;
   int mag = 0;
   int nb_mag[3] = { 0 };
   const int count =
-      get_level_count_mag(nb_mag, levels, bwl, row, col, level_minus_1,
+      get_level_count_mag(nb_mag, levels, stride, row, col, level_minus_1,
                           br_ref_offset, BR_CONTEXT_POSITION_NUM);
   for (int idx = 0; idx < 3; ++idx) mag = AOMMAX(mag, nb_mag[idx]);
   const int ctx = get_br_ctx_from_count_mag(row, col, count, mag);
@@ -302,6 +306,7 @@
 static INLINE int get_nz_count_mag(const uint8_t *const levels, const int bwl,
                                    const int row, const int col,
                                    const TX_CLASS tx_class, int *const mag) {
+  const int stride = (1 << bwl) + TX_PAD_HOR;
   int count = 0;
   *mag = 0;
   for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) {
@@ -318,7 +323,7 @@
     const int ref_row = row + row_offset;
     const int ref_col = col + col_offset;
     if (ref_col >= (1 << bwl)) continue;
-    const int nb_pos = (ref_row << bwl) + ref_col;
+    const int nb_pos = ref_row * stride + ref_col;
     const int level = levels[nb_pos];
     count += (level != 0);
 #if 1
@@ -333,6 +338,7 @@
 static INLINE int get_nz_count(const uint8_t *const levels, const int bwl,
                                const int row, const int col,
                                const TX_CLASS tx_class) {
+  const int stride = (1 << bwl) + TX_PAD_HOR;
   int count = 0;
   for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) {
     const int ref_row = row + ((tx_class == TX_CLASS_2D)
@@ -345,8 +351,7 @@
                                    : ((tx_class == TX_CLASS_VERT)
                                           ? sig_ref_offset_vert[idx][1]
                                           : sig_ref_offset_horiz[idx][1]));
-    if (ref_col >= (1 << bwl)) continue;
-    const int nb_pos = (ref_row << bwl) + ref_col;
+    const int nb_pos = ref_row * stride + ref_col;
     count += (levels[nb_pos] != 0);
   }
   return count;
diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c
index f0bba6b..70006f7 100644
--- a/av1/decoder/decodetxb.c
+++ b/av1/decoder/decodetxb.c
@@ -94,7 +94,8 @@
   }
 
   memset(levels_buf, 0,
-         sizeof(*levels_buf) * (seg_eob + TX_PAD_VER * (width + TX_PAD_HOR)));
+         sizeof(*levels_buf) *
+             ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END));
   memset(signs, 0, sizeof(*signs) * seg_eob);
 
   (void)blk_row;
@@ -174,7 +175,7 @@
     }
 
     // set non-zero coefficient map.
-    levels[scan[c]] = is_nz;
+    levels[get_paded_idx(scan[c], bwl)] = is_nz;
 
     if (counts) ++(*nz_map_count)[coeff_ctx][is_nz];
 
@@ -194,7 +195,7 @@
           break;
         }
       }
-      levels[scan[c]] = k + 1;
+      levels[get_paded_idx(scan[c], bwl)] = k + 1;
     }
 #endif
   }
@@ -207,7 +208,7 @@
   int i;
   for (i = 0; i < NUM_BASE_LEVELS; ++i) {
     for (c = *eob - 1; c >= 0; --c) {
-      uint8_t *const level = &levels[scan[c]];
+      uint8_t *const level = &levels[get_paded_idx(scan[c], bwl)];
       int ctx;
 
       if (*level <= i) continue;
@@ -237,7 +238,7 @@
   // starting with the sign of the DC (if applicable)
   for (c = 0; c < *eob; ++c) {
     int8_t *const sign = &signs[scan[c]];
-    if (levels[scan[c]] == 0) continue;
+    if (levels[get_paded_idx(scan[c], bwl)] == 0) continue;
     if (c == 0) {
       int dc_sign_ctx = txb_ctx->dc_sign_ctx;
 #if LV_MAP_PROB
@@ -253,7 +254,7 @@
   }
 
   for (c = update_eob; c >= 0; --c) {
-    uint8_t *const level = &levels[scan[c]];
+    uint8_t *const level = &levels[get_paded_idx(scan[c], bwl)];
     int idx;
     int ctx;
 
@@ -302,7 +303,7 @@
 
   for (c = 0; c < *eob; ++c) {
     const int16_t dqv = (c == 0) ? dequant[0] : dequant[1];
-    const int level = levels[scan[c]];
+    const int level = levels[get_paded_idx(scan[c], bwl)];
     const tran_low_t t = ((level + tcoeffs[scan[c]]) * dqv) >> shift;
 #if CONFIG_SYMBOLRATE
     av1_record_coeff(counts, level);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c
index fa31e03..7549795 100644
--- a/av1/encoder/encodetxb.c
+++ b/av1/encoder/encodetxb.c
@@ -245,7 +245,8 @@
 static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc,
                                  const TxbInfo *const txb_info) {
   txb_info->qcoeff[coeff_idx] = qc;
-  txb_info->levels[coeff_idx] = (uint8_t)clamp(abs(qc), 0, UINT8_MAX);
+  txb_info->levels[get_paded_idx(coeff_idx, txb_info->bwl)] =
+      (uint8_t)clamp(abs(qc), 0, UINT8_MAX);
 }
 
 static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc,
@@ -256,16 +257,23 @@
 }
 
 static INLINE void av1_txb_init_levels(const tran_low_t *const coeff,
-                                       const int width, const int size,
+                                       const int width, const int height,
                                        uint8_t *const levels) {
   const int stride = width + TX_PAD_HOR;
+  uint8_t *ls = levels;
 
   memset(levels - TX_PAD_TOP * stride, 0,
          sizeof(*levels) * TX_PAD_TOP * stride);
-  memset(levels + size, 0, sizeof(*levels) * TX_PAD_BOTTOM * stride);
+  memset(levels + stride * height, 0,
+         sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END));
 
-  for (int i = 0; i < size; i++) {
-    levels[i] = (uint8_t)clamp(abs(coeff[i]), 0, UINT8_MAX);
+  for (int i = 0; i < height; i++) {
+    for (int j = 0; j < width; j++) {
+      *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, UINT8_MAX);
+    }
+    for (int j = 0; j < TX_PAD_HOR; j++) {
+      *ls++ = 0;
+    }
   }
 }
 
@@ -298,7 +306,7 @@
 
   if (eob == 0) return;
 
-  av1_txb_init_levels(tcoeff, width, seg_eob, levels);
+  av1_txb_init_levels(tcoeff, width, height, levels);
 
 #if CONFIG_TXK_SEL
   av1_write_tx_type(cm, xd, blk_row, blk_col, block, plane,
@@ -500,19 +508,17 @@
                                     int ctx_set[NUM_BASE_LEVELS]) {
   const int row = c >> bwl;
   const int col = c - (row << bwl);
-  const int stride = 1 << bwl;
+  const int width = 1 << bwl;
+  const int stride = width + TX_PAD_HOR;
   int mag_count[NUM_BASE_LEVELS] = { 0 };
   int nb_mag[NUM_BASE_LEVELS][3] = { { 0 } };
   int idx;
   int i;
 
   for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) {
-    int ref_row = row + base_ref_offset[idx][0];
-    int ref_col = col + base_ref_offset[idx][1];
-    int pos = (ref_row << bwl) + ref_col;
-
-    if (ref_col < 0 || ref_col >= stride) continue;
-
+    const int ref_row = row + base_ref_offset[idx][0];
+    const int ref_col = col + base_ref_offset[idx][1];
+    const int pos = ref_row * stride + ref_col;
     const uint8_t abs_coeff = levels[pos];
 
     for (i = 0; i < NUM_BASE_LEVELS; ++i) {
@@ -593,7 +599,7 @@
   }
   cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0];
 
-  av1_txb_init_levels(qcoeff, width, tx_size_2d[tx_size], levels);
+  av1_txb_init_levels(qcoeff, width, height, levels);
 
 #if CONFIG_TXK_SEL
   cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type);
@@ -1106,10 +1112,10 @@
     for (int i = 0; i < ref_num; ++i) {
       const int nb_row = row - ref_offset[i][0];
       const int nb_col = col - ref_offset[i][1];
-      const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+      const int nb_coeff_idx = nb_row * txb_info->width + nb_col;
 
       if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
-          nb_col >= txb_info->stride)
+          nb_col >= txb_info->width)
         continue;
 
       const int nb_scan_idx = iscan[nb_coeff_idx];
@@ -1142,10 +1148,10 @@
     for (int i = 0; i < ref_num; ++i) {
       const int nb_row = row - ref_offset[i][0];
       const int nb_col = col - ref_offset[i][1];
-      const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+      const int nb_coeff_idx = nb_row * txb_info->width + nb_col;
 
       if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
-          nb_col >= txb_info->stride)
+          nb_col >= txb_info->width)
         continue;
 
       const int nb_scan_idx = iscan[nb_coeff_idx];
@@ -1178,10 +1184,10 @@
     for (int i = 0; i < ref_num; ++i) {
       const int nb_row = row - ref_offset[i][0];
       const int nb_col = col - ref_offset[i][1];
-      const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+      const int nb_coeff_idx = nb_row * txb_info->width + nb_col;
 
       if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height ||
-          nb_col >= txb_info->stride)
+          nb_col >= txb_info->width)
         continue;
 
       const int nb_scan_idx = iscan[nb_coeff_idx];
@@ -1302,10 +1308,10 @@
     const int nb_col = col - sig_ref_offset[i][1];
 
     if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-          nb_col < txb_info->stride))
+          nb_col < txb_info->width))
       continue;
 
-    const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+    const int nb_coeff_idx = nb_row * txb_info->width + nb_col;
     const int nb_scan_idx = iscan[nb_coeff_idx];
     if (nb_scan_idx < eob) {
       const int scan_idx = iscan[coeff_idx];
@@ -1336,10 +1342,10 @@
   for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) {
     const int nb_row = row - base_ref_offset[i][0];
     const int nb_col = col - base_ref_offset[i][1];
-    const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+    const int nb_coeff_idx = nb_row * txb_info->width + nb_col;
 
     if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-          nb_col < txb_info->stride))
+          nb_col < txb_info->width))
       continue;
 
     const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
@@ -1373,10 +1379,10 @@
   for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) {
     const int nb_row = row - br_ref_offset[i][0];
     const int nb_col = col - br_ref_offset[i][1];
-    const int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+    const int nb_coeff_idx = nb_row * txb_info->width + nb_col;
 
     if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height &&
-          nb_col < txb_info->stride))
+          nb_col < txb_info->width))
       continue;
 
     const int nb_scan_idx = iscan[nb_coeff_idx];
@@ -1483,10 +1489,10 @@
   for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) {
     int nb_row = row - all_ref_offset[i][0];
     int nb_col = col - all_ref_offset[i][1];
-    int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+    int nb_coeff_idx = nb_row * txb_info->width + nb_col;
     int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx];
     if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 &&
-        nb_row < txb_info->height && nb_col < txb_info->stride) {
+        nb_row < txb_info->height && nb_col < txb_info->width) {
       tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
       int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs);
       if (cost_map)
@@ -1500,10 +1506,10 @@
   for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) {
     int nb_row = row - all_ref_offset[i][0];
     int nb_col = col - all_ref_offset[i][1];
-    int nb_coeff_idx = nb_row * txb_info->stride + nb_col;
+    int nb_coeff_idx = nb_row * txb_info->width + nb_col;
     int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx];
     if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 &&
-        nb_row < txb_info->height && nb_col < txb_info->stride) {
+        nb_row < txb_info->height && nb_col < txb_info->width) {
       tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx];
       int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs);
       if (cost_map)
@@ -1658,14 +1664,17 @@
   tran_low_t tmp_qcoeff[MAX_TX_SQUARE];
   tran_low_t tmp_dqcoeff[MAX_TX_SQUARE];
   uint8_t tmp_levels_buf[TX_PAD_2D];
-  uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->stride);
+  uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->width);
   const int org_eob = txb_info->eob;
   if (dry_run) {
-    const int stride = txb_info->stride + TX_PAD_HOR;
+    const int stride = txb_info->width + TX_PAD_HOR;
+    const int levels_size =
+
+        (stride * (txb_info->height + TX_PAD_VER) + TX_PAD_END);
     memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob);
     memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob);
     memcpy(tmp_levels, org_levels - TX_PAD_TOP * stride,
-           sizeof(org_levels[0]) * stride * (txb_info->height + TX_PAD_VER));
+           sizeof(org_levels[0]) * levels_size);
     txb_info->qcoeff = tmp_qcoeff;
     txb_info->dqcoeff = tmp_dqcoeff;
     txb_info->levels = tmp_levels;
@@ -1800,14 +1809,17 @@
   tran_low_t tmp_qcoeff[MAX_TX_SQUARE];
   tran_low_t tmp_dqcoeff[MAX_TX_SQUARE];
   uint8_t tmp_levels_buf[TX_PAD_2D];
-  uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->stride);
+  uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->width);
   const int org_eob = txb_info->eob;
   if (dry_run) {
-    const int stride = txb_info->stride + TX_PAD_HOR;
+    const int stride = txb_info->width + TX_PAD_HOR;
+    const int levels_size =
+
+        (stride * (txb_info->height + TX_PAD_VER) + TX_PAD_END);
     memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob);
     memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob);
     memcpy(tmp_levels, org_levels - TX_PAD_TOP * stride,
-           sizeof(org_levels[0]) * stride * (txb_info->height + TX_PAD_VER));
+           sizeof(org_levels[0]) * levels_size);
     txb_info->qcoeff = tmp_qcoeff;
     txb_info->dqcoeff = tmp_dqcoeff;
     txb_info->levels = tmp_levels;
@@ -1910,7 +1922,6 @@
   const int16_t *dequant = p->dequant_QTX;
   const int seg_eob = tx_size_2d[tx_size];
   const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2;
-  const int stride = 1 << bwl;
   const int width = tx_size_wide[tx_size];
   const int height = tx_size_high[tx_size];
   const int is_inter = is_inter_block(mbmi);
@@ -1923,13 +1934,14 @@
   uint8_t levels_buf[TX_PAD_2D];
   uint8_t *const levels = set_levels(levels_buf, width);
 
+  assert(width == (1 << bwl));
   TxbInfo txb_info = {
     qcoeff,  levels,  dqcoeff,    tcoeff,  dequant, shift,
-    tx_size, txs_ctx, tx_type,    bwl,     stride,  height,
+    tx_size, txs_ctx, tx_type,    bwl,     width,   height,
     eob,     seg_eob, scan_order, txb_ctx, rdmult,  &cm->coeff_ctx_table
   };
 
-  av1_txb_init_levels(qcoeff, width, tx_size_2d[tx_size], levels);
+  av1_txb_init_levels(qcoeff, width, height, levels);
 
   const int update = optimize_txb(&txb_info, &txb_costs, NULL, 0, fast_mode);
 
@@ -2028,7 +2040,7 @@
     return;
   }
 
-  av1_txb_init_levels(tcoeff, width, tx_size_2d[tx_size], levels);
+  av1_txb_init_levels(tcoeff, width, height, levels);
 
 #if CONFIG_TXK_SEL
   av1_update_tx_type_count(cm, xd, blk_row, blk_col, block, plane,
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h
index a08b567..f7edd35 100644
--- a/av1/encoder/encodetxb.h
+++ b/av1/encoder/encodetxb.h
@@ -34,7 +34,7 @@
   TX_SIZE txs_ctx;
   TX_TYPE tx_type;
   int bwl;
-  int stride;
+  int width;
   int height;
   int eob;
   int seg_eob;