Fix highbd inv_txfm's mismatch between C and SIMD
Fix av1_highbd_inv_txfm_add_4x16_sse4_1 by
swapping the order of the clamp and rounding operations.
Change-Id: I39030850e92664dac4c329a4d663746ae55070da
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index f4cdfa5..5625fa6 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -115,9 +115,7 @@
specialize qw/av1_inv_txfm_add ssse3 avx2/; # neon/;
add_proto qw/void av1_highbd_inv_txfm_add/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
-# TODO(http://crbug.com/aomedia/2350): avx2 is disabled due to test vector
-# mismatches.
-specialize qw/av1_highbd_inv_txfm_add sse4_1/;
+specialize qw/av1_highbd_inv_txfm_add sse4_1 avx2/;
add_proto qw/void av1_highbd_inv_txfm_add_4x4/, "const tran_low_t *dqcoeff, uint8_t *dst, int stride, const TxfmParam *txfm_param";
specialize qw/av1_highbd_inv_txfm_add_4x4 sse4_1/;
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index 8a8641d..82c7a64 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -5752,11 +5752,15 @@
load_buffer_32bit_input(input_row, input_stride, buf0_cur, txfm_size_row);
for (int i = 0; i < (txfm_size_row >> 2); i++) {
row_txfm(buf0 + (i << 2), buf0 + (i << 2),
- inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+ inv_cos_bit_row[txw_idx][txh_idx], 1, bd, -shift[0]);
}
av1_round_shift_array_32_sse4_1(buf0, buf0, txfm_size_row, -shift[0]);
+ const int log_range = AOMMAX(16, bd + 6);
+ const __m128i clamp_lo = _mm_set1_epi32(-(1 << (log_range - 1)));
+ const __m128i clamp_hi = _mm_set1_epi32((1 << (log_range - 1)) - 1);
+ highbd_clamp_epi32_sse4_1(buf0, buf0, &clamp_lo, &clamp_hi, txfm_size_row);
if (lr_flip) {
for (int j = 0; j < buf_size_h_div8; ++j) {
TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
@@ -5926,16 +5930,28 @@
assert(av1_ext_tx_used[txfm_param->tx_set_type][txfm_param->tx_type]);
const TX_SIZE tx_size = txfm_param->tx_size;
switch (tx_size) {
+ case TX_8X8:
+ av1_highbd_inv_txfm_add_8x8_sse4_1(input, dest, stride, txfm_param);
+ break;
case TX_4X8:
av1_highbd_inv_txfm_add_4x8_sse4_1(input, dest, stride, txfm_param);
break;
case TX_8X4:
av1_highbd_inv_txfm_add_8x4_sse4_1(input, dest, stride, txfm_param);
break;
+ case TX_4X4:
+ av1_highbd_inv_txfm_add_4x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_16X4:
+ av1_highbd_inv_txfm_add_16x4_sse4_1(input, dest, stride, txfm_param);
+ break;
+ case TX_4X16:
+ av1_highbd_inv_txfm_add_4x16_sse4_1(input, dest, stride, txfm_param);
+ break;
default:
- // TODO(http://crbug.com/aomedia/2350): the remaining sse4_1 versions
- // cause test vector mismatches.
- av1_highbd_inv_txfm_add_c(input, dest, stride, txfm_param);
+ av1_highbd_inv_txfm2d_add_universe_sse4_1(
+ input, dest, stride, txfm_param->tx_type, tx_size, txfm_param->eob,
+ txfm_param->bd);
break;
}
}
diff --git a/test/av1_highbd_iht_test.cc b/test/av1_highbd_iht_test.cc
index 6d77cbf..7f077b6 100644
--- a/test/av1_highbd_iht_test.cc
+++ b/test/av1_highbd_iht_test.cc
@@ -308,8 +308,7 @@
::testing::Values(av1_highbd_inv_txfm_add_sse4_1));
#endif
-// TODO(http://crbug.com/aomedia/2350): these cause test vector mismatches.
-#if 0 // HAVE_AVX2
+#if HAVE_AVX2
INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdInvTxfm2d,
::testing::Values(av1_highbd_inv_txfm_add_avx2));
#endif