Pad extra columns in txb levels and signs. Padding the columns allows the horizontal availability check to be removed. Change-Id: Ie3b97eea63b4cc79ec78df119c2730a4d7cf539d
diff --git a/av1/common/enums.h b/av1/common/enums.h index b368701..fe4ce59 100644 --- a/av1/common/enums.h +++ b/av1/common/enums.h
@@ -243,13 +243,17 @@ #define MIN_TX_SIZE (1 << MIN_TX_SIZE_LOG2) #define MAX_TX_SQUARE (MAX_TX_SIZE * MAX_TX_SIZE) -#define TX_PAD_HOR 0 +// Pad 4 extra columns to remove horizontal availability check. +#define TX_PAD_HOR 4 // Pad 6 extra rows (2 on top and 4 on bottom) to remove vertical availability // check. #define TX_PAD_TOP 2 #define TX_PAD_BOTTOM 4 #define TX_PAD_VER (TX_PAD_TOP + TX_PAD_BOTTOM) -#define TX_PAD_2D ((MAX_TX_SIZE + TX_PAD_HOR) * (MAX_TX_SIZE + TX_PAD_VER)) +// Pad 16 extra bytes to avoid reading overflow in SIMD optimization. +#define TX_PAD_END 16 +#define TX_PAD_2D \ + ((MAX_TX_SIZE + TX_PAD_HOR) * (MAX_TX_SIZE + TX_PAD_VER) + TX_PAD_END) // Number of maxium size transform blocks in the maximum size superblock #define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
diff --git a/av1/common/txb_common.h b/av1/common/txb_common.h index 6d0d7bd..71471da 100644 --- a/av1/common/txb_common.h +++ b/av1/common/txb_common.h
@@ -80,20 +80,22 @@ return levels_buf + TX_PAD_TOP * (width + TX_PAD_HOR); } +static INLINE int get_paded_idx(const int idx, const int bwl) { + return idx + TX_PAD_HOR * (idx >> bwl); +} + static INLINE int get_level_count_mag(int *const mag, const uint8_t *const levels, - const int bwl, const int row, + const int stride, const int row, const int col, const int level, const int (*nb_offset)[2], const int nb_num) { - const int stride = 1 << bwl; int count = 0; for (int idx = 0; idx < nb_num; ++idx) { const int ref_row = row + nb_offset[idx][0]; const int ref_col = col + nb_offset[idx][1]; - if (ref_col < 0 || ref_col >= stride) continue; - const int pos = (ref_row << bwl) + ref_col; + const int pos = ref_row * stride + ref_col; count += levels[pos] > level; if (nb_offset[idx][0] == 0 && nb_offset[idx][1] == 1) mag[0] = levels[pos]; if (nb_offset[idx][0] == 1 && nb_offset[idx][1] == 0) mag[1] = levels[pos]; @@ -171,11 +173,12 @@ const int bwl, const int level) { const int row = c >> bwl; const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; const int level_minus_1 = level - 1; int mag_count = 0; int nb_mag[3] = { 0 }; const int count = - get_level_count_mag(nb_mag, levels, bwl, row, col, level_minus_1, + get_level_count_mag(nb_mag, levels, stride, row, col, level_minus_1, base_ref_offset, BASE_CONTEXT_POSITION_NUM); for (int idx = 0; idx < 3; ++idx) mag_count += nb_mag[idx] > level; @@ -272,11 +275,12 @@ const int bwl) { const int row = c >> bwl; const int col = c - (row << bwl); + const int stride = (1 << bwl) + TX_PAD_HOR; const int level_minus_1 = NUM_BASE_LEVELS; int mag = 0; int nb_mag[3] = { 0 }; const int count = - get_level_count_mag(nb_mag, levels, bwl, row, col, level_minus_1, + get_level_count_mag(nb_mag, levels, stride, row, col, level_minus_1, br_ref_offset, BR_CONTEXT_POSITION_NUM); for (int idx = 0; idx < 3; ++idx) mag = AOMMAX(mag, nb_mag[idx]); const int ctx = get_br_ctx_from_count_mag(row, col, count, mag); @@ -302,6 +306,7 
@@ static INLINE int get_nz_count_mag(const uint8_t *const levels, const int bwl, const int row, const int col, const TX_CLASS tx_class, int *const mag) { + const int stride = (1 << bwl) + TX_PAD_HOR; int count = 0; *mag = 0; for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) { @@ -318,7 +323,7 @@ const int ref_row = row + row_offset; const int ref_col = col + col_offset; if (ref_col >= (1 << bwl)) continue; - const int nb_pos = (ref_row << bwl) + ref_col; + const int nb_pos = ref_row * stride + ref_col; const int level = levels[nb_pos]; count += (level != 0); #if 1 @@ -333,6 +338,7 @@ static INLINE int get_nz_count(const uint8_t *const levels, const int bwl, const int row, const int col, const TX_CLASS tx_class) { + const int stride = (1 << bwl) + TX_PAD_HOR; int count = 0; for (int idx = 0; idx < SIG_REF_OFFSET_NUM; ++idx) { const int ref_row = row + ((tx_class == TX_CLASS_2D) @@ -345,8 +351,7 @@ : ((tx_class == TX_CLASS_VERT) ? sig_ref_offset_vert[idx][1] : sig_ref_offset_horiz[idx][1])); - if (ref_col >= (1 << bwl)) continue; - const int nb_pos = (ref_row << bwl) + ref_col; + const int nb_pos = ref_row * stride + ref_col; count += (levels[nb_pos] != 0); } return count;
diff --git a/av1/decoder/decodetxb.c b/av1/decoder/decodetxb.c index f0bba6b..70006f7 100644 --- a/av1/decoder/decodetxb.c +++ b/av1/decoder/decodetxb.c
@@ -94,7 +94,8 @@ } memset(levels_buf, 0, - sizeof(*levels_buf) * (seg_eob + TX_PAD_VER * (width + TX_PAD_HOR))); + sizeof(*levels_buf) * + ((width + TX_PAD_HOR) * (height + TX_PAD_VER) + TX_PAD_END)); memset(signs, 0, sizeof(*signs) * seg_eob); (void)blk_row; @@ -174,7 +175,7 @@ } // set non-zero coefficient map. - levels[scan[c]] = is_nz; + levels[get_paded_idx(scan[c], bwl)] = is_nz; if (counts) ++(*nz_map_count)[coeff_ctx][is_nz]; @@ -194,7 +195,7 @@ break; } } - levels[scan[c]] = k + 1; + levels[get_paded_idx(scan[c], bwl)] = k + 1; } #endif } @@ -207,7 +208,7 @@ int i; for (i = 0; i < NUM_BASE_LEVELS; ++i) { for (c = *eob - 1; c >= 0; --c) { - uint8_t *const level = &levels[scan[c]]; + uint8_t *const level = &levels[get_paded_idx(scan[c], bwl)]; int ctx; if (*level <= i) continue; @@ -237,7 +238,7 @@ // starting with the sign of the DC (if applicable) for (c = 0; c < *eob; ++c) { int8_t *const sign = &signs[scan[c]]; - if (levels[scan[c]] == 0) continue; + if (levels[get_paded_idx(scan[c], bwl)] == 0) continue; if (c == 0) { int dc_sign_ctx = txb_ctx->dc_sign_ctx; #if LV_MAP_PROB @@ -253,7 +254,7 @@ } for (c = update_eob; c >= 0; --c) { - uint8_t *const level = &levels[scan[c]]; + uint8_t *const level = &levels[get_paded_idx(scan[c], bwl)]; int idx; int ctx; @@ -302,7 +303,7 @@ for (c = 0; c < *eob; ++c) { const int16_t dqv = (c == 0) ? dequant[0] : dequant[1]; - const int level = levels[scan[c]]; + const int level = levels[get_paded_idx(scan[c], bwl)]; const tran_low_t t = ((level + tcoeffs[scan[c]]) * dqv) >> shift; #if CONFIG_SYMBOLRATE av1_record_coeff(counts, level);
diff --git a/av1/encoder/encodetxb.c b/av1/encoder/encodetxb.c index fa31e03..7549795 100644 --- a/av1/encoder/encodetxb.c +++ b/av1/encoder/encodetxb.c
@@ -245,7 +245,8 @@ static INLINE void update_qcoeff(const int coeff_idx, const tran_low_t qc, const TxbInfo *const txb_info) { txb_info->qcoeff[coeff_idx] = qc; - txb_info->levels[coeff_idx] = (uint8_t)clamp(abs(qc), 0, UINT8_MAX); + txb_info->levels[get_paded_idx(coeff_idx, txb_info->bwl)] = + (uint8_t)clamp(abs(qc), 0, UINT8_MAX); } static INLINE void update_coeff(const int coeff_idx, const tran_low_t qc, @@ -256,16 +257,23 @@ } static INLINE void av1_txb_init_levels(const tran_low_t *const coeff, - const int width, const int size, + const int width, const int height, uint8_t *const levels) { const int stride = width + TX_PAD_HOR; + uint8_t *ls = levels; memset(levels - TX_PAD_TOP * stride, 0, sizeof(*levels) * TX_PAD_TOP * stride); - memset(levels + size, 0, sizeof(*levels) * TX_PAD_BOTTOM * stride); + memset(levels + stride * height, 0, + sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); - for (int i = 0; i < size; i++) { - levels[i] = (uint8_t)clamp(abs(coeff[i]), 0, UINT8_MAX); + for (int i = 0; i < height; i++) { + for (int j = 0; j < width; j++) { + *ls++ = (uint8_t)clamp(abs(coeff[i * width + j]), 0, UINT8_MAX); + } + for (int j = 0; j < TX_PAD_HOR; j++) { + *ls++ = 0; + } } } @@ -298,7 +306,7 @@ if (eob == 0) return; - av1_txb_init_levels(tcoeff, width, seg_eob, levels); + av1_txb_init_levels(tcoeff, width, height, levels); #if CONFIG_TXK_SEL av1_write_tx_type(cm, xd, blk_row, blk_col, block, plane, @@ -500,19 +508,17 @@ int ctx_set[NUM_BASE_LEVELS]) { const int row = c >> bwl; const int col = c - (row << bwl); - const int stride = 1 << bwl; + const int width = 1 << bwl; + const int stride = width + TX_PAD_HOR; int mag_count[NUM_BASE_LEVELS] = { 0 }; int nb_mag[NUM_BASE_LEVELS][3] = { { 0 } }; int idx; int i; for (idx = 0; idx < BASE_CONTEXT_POSITION_NUM; ++idx) { - int ref_row = row + base_ref_offset[idx][0]; - int ref_col = col + base_ref_offset[idx][1]; - int pos = (ref_row << bwl) + ref_col; - - if (ref_col < 0 || ref_col >= stride) continue; 
- + const int ref_row = row + base_ref_offset[idx][0]; + const int ref_col = col + base_ref_offset[idx][1]; + const int pos = ref_row * stride + ref_col; const uint8_t abs_coeff = levels[pos]; for (i = 0; i < NUM_BASE_LEVELS; ++i) { @@ -593,7 +599,7 @@ } cost = coeff_costs->txb_skip_cost[txb_skip_ctx][0]; - av1_txb_init_levels(qcoeff, width, tx_size_2d[tx_size], levels); + av1_txb_init_levels(qcoeff, width, height, levels); #if CONFIG_TXK_SEL cost += av1_tx_type_cost(cm, x, xd, mbmi->sb_type, plane, tx_size, tx_type); @@ -1106,10 +1112,10 @@ for (int i = 0; i < ref_num; ++i) { const int nb_row = row - ref_offset[i][0]; const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_coeff_idx = nb_row * txb_info->width + nb_col; if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) + nb_col >= txb_info->width) continue; const int nb_scan_idx = iscan[nb_coeff_idx]; @@ -1142,10 +1148,10 @@ for (int i = 0; i < ref_num; ++i) { const int nb_row = row - ref_offset[i][0]; const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_coeff_idx = nb_row * txb_info->width + nb_col; if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) + nb_col >= txb_info->width) continue; const int nb_scan_idx = iscan[nb_coeff_idx]; @@ -1178,10 +1184,10 @@ for (int i = 0; i < ref_num; ++i) { const int nb_row = row - ref_offset[i][0]; const int nb_col = col - ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_coeff_idx = nb_row * txb_info->width + nb_col; if (nb_row < 0 || nb_col < 0 || nb_row >= txb_info->height || - nb_col >= txb_info->stride) + nb_col >= txb_info->width) continue; const int nb_scan_idx = iscan[nb_coeff_idx]; @@ -1302,10 +1308,10 @@ const int nb_col = col - sig_ref_offset[i][1]; if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height 
&& - nb_col < txb_info->stride)) + nb_col < txb_info->width)) continue; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_coeff_idx = nb_row * txb_info->width + nb_col; const int nb_scan_idx = iscan[nb_coeff_idx]; if (nb_scan_idx < eob) { const int scan_idx = iscan[coeff_idx]; @@ -1336,10 +1342,10 @@ for (int i = 0; i < BASE_CONTEXT_POSITION_NUM; ++i) { const int nb_row = row - base_ref_offset[i][0]; const int nb_col = col - base_ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_coeff_idx = nb_row * txb_info->width + nb_col; if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) + nb_col < txb_info->width)) continue; const tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; @@ -1373,10 +1379,10 @@ for (int i = 0; i < BR_CONTEXT_POSITION_NUM; ++i) { const int nb_row = row - br_ref_offset[i][0]; const int nb_col = col - br_ref_offset[i][1]; - const int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + const int nb_coeff_idx = nb_row * txb_info->width + nb_col; if (!(nb_row >= 0 && nb_col >= 0 && nb_row < txb_info->height && - nb_col < txb_info->stride)) + nb_col < txb_info->width)) continue; const int nb_scan_idx = iscan[nb_coeff_idx]; @@ -1483,10 +1489,10 @@ for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { int nb_row = row - all_ref_offset[i][0]; int nb_col = col - all_ref_offset[i][1]; - int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + int nb_coeff_idx = nb_row * txb_info->width + nb_col; int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->height && nb_col < txb_info->stride) { + nb_row < txb_info->height && nb_col < txb_info->width) { tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs); if (cost_map) @@ -1500,10 +1506,10 @@ for (int i = 0; i < ALL_REF_OFFSET_NUM; ++i) { int 
nb_row = row - all_ref_offset[i][0]; int nb_col = col - all_ref_offset[i][1]; - int nb_coeff_idx = nb_row * txb_info->stride + nb_col; + int nb_coeff_idx = nb_row * txb_info->width + nb_col; int nb_scan_idx = txb_info->scan_order->iscan[nb_coeff_idx]; if (nb_scan_idx < txb_info->eob && nb_row >= 0 && nb_col >= 0 && - nb_row < txb_info->height && nb_col < txb_info->stride) { + nb_row < txb_info->height && nb_col < txb_info->width) { tran_low_t nb_coeff = txb_info->qcoeff[nb_coeff_idx]; int cost = get_coeff_cost(nb_coeff, nb_scan_idx, txb_info, txb_costs); if (cost_map) @@ -1658,14 +1664,17 @@ tran_low_t tmp_qcoeff[MAX_TX_SQUARE]; tran_low_t tmp_dqcoeff[MAX_TX_SQUARE]; uint8_t tmp_levels_buf[TX_PAD_2D]; - uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->stride); + uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->width); const int org_eob = txb_info->eob; if (dry_run) { - const int stride = txb_info->stride + TX_PAD_HOR; + const int stride = txb_info->width + TX_PAD_HOR; + const int levels_size = + + (stride * (txb_info->height + TX_PAD_VER) + TX_PAD_END); memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob); memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob); memcpy(tmp_levels, org_levels - TX_PAD_TOP * stride, - sizeof(org_levels[0]) * stride * (txb_info->height + TX_PAD_VER)); + sizeof(org_levels[0]) * levels_size); txb_info->qcoeff = tmp_qcoeff; txb_info->dqcoeff = tmp_dqcoeff; txb_info->levels = tmp_levels; @@ -1800,14 +1809,17 @@ tran_low_t tmp_qcoeff[MAX_TX_SQUARE]; tran_low_t tmp_dqcoeff[MAX_TX_SQUARE]; uint8_t tmp_levels_buf[TX_PAD_2D]; - uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->stride); + uint8_t *const tmp_levels = set_levels(tmp_levels_buf, txb_info->width); const int org_eob = txb_info->eob; if (dry_run) { - const int stride = txb_info->stride + TX_PAD_HOR; + const int stride = txb_info->width + TX_PAD_HOR; + const int levels_size = + + (stride * (txb_info->height + 
TX_PAD_VER) + TX_PAD_END); memcpy(tmp_qcoeff, org_qcoeff, sizeof(org_qcoeff[0]) * max_eob); memcpy(tmp_dqcoeff, org_dqcoeff, sizeof(org_dqcoeff[0]) * max_eob); memcpy(tmp_levels, org_levels - TX_PAD_TOP * stride, - sizeof(org_levels[0]) * stride * (txb_info->height + TX_PAD_VER)); + sizeof(org_levels[0]) * levels_size); txb_info->qcoeff = tmp_qcoeff; txb_info->dqcoeff = tmp_dqcoeff; txb_info->levels = tmp_levels; @@ -1910,7 +1922,6 @@ const int16_t *dequant = p->dequant_QTX; const int seg_eob = tx_size_2d[tx_size]; const int bwl = b_width_log2_lookup[txsize_to_bsize[tx_size]] + 2; - const int stride = 1 << bwl; const int width = tx_size_wide[tx_size]; const int height = tx_size_high[tx_size]; const int is_inter = is_inter_block(mbmi); @@ -1923,13 +1934,14 @@ uint8_t levels_buf[TX_PAD_2D]; uint8_t *const levels = set_levels(levels_buf, width); + assert(width == (1 << bwl)); TxbInfo txb_info = { qcoeff, levels, dqcoeff, tcoeff, dequant, shift, - tx_size, txs_ctx, tx_type, bwl, stride, height, + tx_size, txs_ctx, tx_type, bwl, width, height, eob, seg_eob, scan_order, txb_ctx, rdmult, &cm->coeff_ctx_table }; - av1_txb_init_levels(qcoeff, width, tx_size_2d[tx_size], levels); + av1_txb_init_levels(qcoeff, width, height, levels); const int update = optimize_txb(&txb_info, &txb_costs, NULL, 0, fast_mode); @@ -2028,7 +2040,7 @@ return; } - av1_txb_init_levels(tcoeff, width, tx_size_2d[tx_size], levels); + av1_txb_init_levels(tcoeff, width, height, levels); #if CONFIG_TXK_SEL av1_update_tx_type_count(cm, xd, blk_row, blk_col, block, plane,
diff --git a/av1/encoder/encodetxb.h b/av1/encoder/encodetxb.h index a08b567..f7edd35 100644 --- a/av1/encoder/encodetxb.h +++ b/av1/encoder/encodetxb.h
@@ -34,7 +34,7 @@ TX_SIZE txs_ctx; TX_TYPE tx_type; int bwl; - int stride; + int width; int height; int eob; int seg_eob;