Fix a bug in av1_quantize_lp AVX2 optimization In av1_quantize_lp AVX2 optimization, eob is scanned from coeff results. To get the correct eob, iscan should be used instead of scan. This CL fixed the bug, and resolved AVX2 annd c mismatch. The unit test for this function was added. BUG=aomedia:3156 Change-Id: Id173dc292ca5951ef1f1a75f37fa31f4615e2da0 (cherry picked from commit 984b81aac328a7eb2d5ea93787be295c4e14b66c)
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 63e5758..231908e 100644 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -330,9 +330,9 @@ add_proto qw/void av1_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp sse2 avx2 neon/; - add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan"; - specialize qw/av1_quantize_lp avx2 neon/; - + # TODO(any): need to fix the bug in neon optimization and re-enable it. + add_proto qw/void av1_quantize_lp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; + specialize qw/av1_quantize_lp avx2/; add_proto qw/void av1_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/av1_quantize_fp_32x32 neon avx2/;
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c index de1e1b3..66be3b6 100644 --- a/av1/encoder/av1_quantize.c +++ b/av1/encoder/av1_quantize.c
@@ -213,7 +213,8 @@ const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan) { + const int16_t *scan, const int16_t *iscan) { + (void)iscan; int eob = -1; memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
diff --git a/av1/encoder/nonrd_pickmode.c b/av1/encoder/nonrd_pickmode.c index a3f1216..3d710dd 100644 --- a/av1/encoder/nonrd_pickmode.c +++ b/av1/encoder/nonrd_pickmode.c
@@ -833,20 +833,21 @@ aom_hadamard_lp_16x16(src_diff, diff_stride, low_coeff); av1_quantize_lp(low_coeff, 16 * 16, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, - p->dequant_QTX, eob, scan_order->scan); + p->dequant_QTX, eob, scan_order->scan, + scan_order->iscan); break; case TX_8X8: aom_hadamard_lp_8x8(src_diff, diff_stride, low_coeff); av1_quantize_lp(low_coeff, 8 * 8, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, - scan_order->scan); + scan_order->scan, scan_order->iscan); break; default: assert(tx_size == TX_4X4); aom_fdct4x4_lp(src_diff, low_coeff, diff_stride); av1_quantize_lp(low_coeff, 4 * 4, p->round_fp_QTX, p->quant_fp_QTX, low_qcoeff, low_dqcoeff, p->dequant_QTX, eob, - scan_order->scan); + scan_order->scan, scan_order->iscan); break; #endif }
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c index f5f7ee1..591edd7 100644 --- a/av1/encoder/x86/av1_quantize_avx2.c +++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -154,22 +154,18 @@ return _mm_extract_epi16(eob, 1); } -static INLINE void store_zero_tran_low(int16_t *a) { - const __m256i zero = _mm256_setzero_si256(); - _mm256_storeu_si256((__m256i *)(a), zero); -} - void av1_quantize_lp_avx2(const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan) { + const int16_t *scan, const int16_t *iscan) { + (void)scan; __m128i eob; __m256i round256, quant256, dequant256; - __m256i eob256, thr256; + __m256i eob256; coeff_ptr += n_coeffs; - scan += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -205,7 +201,7 @@ _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256); } - eob256 = scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256); + eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); n_coeffs += 8 * 2; } @@ -214,30 +210,22 @@ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31); round256 = _mm256_permute2x128_si256(round256, round256, 0x31); - thr256 = _mm256_srai_epi16(dequant256, 1); - // AC only loop while (n_coeffs < 0) { __m256i coeff256 = _mm256_loadu_si256((const __m256i *)(coeff_ptr + n_coeffs)); __m256i qcoeff256 = _mm256_abs_epi16(coeff256); - int32_t nzflag = - _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256)); - if (nzflag) { - __m256i qtmp256; - qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); - qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); - qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); - _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256); - coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); - _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256); - eob256 = _mm256_max_epi16( - eob256, scan_eob_256((const __m256i *)(scan + n_coeffs), &coeff256)); - } else { - store_zero_tran_low(qcoeff_ptr + n_coeffs); - store_zero_tran_low(dqcoeff_ptr + n_coeffs); - } + __m256i qtmp256; + qcoeff256 = _mm256_adds_epi16(qcoeff256, round256); + qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256); + qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256); + _mm256_storeu_si256((__m256i *)(qcoeff_ptr + n_coeffs), qcoeff256); + coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); + _mm256_storeu_si256((__m256i *)(dqcoeff_ptr + n_coeffs), coeff256); + eob256 = _mm256_max_epi16( + eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); + n_coeffs += 8 * 2; }
diff --git a/test/quantize_lp_func_test.cc b/test/quantize_lp_func_test.cc new file mode 100644 index 0000000..b02222d --- /dev/null +++ b/test/quantize_lp_func_test.cc
@@ -0,0 +1,336 @@ +/* + * Copyright (c) 2021, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <tuple> + +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +#include "config/aom_config.h" +#include "config/aom_dsp_rtcd.h" +#include "config/av1_rtcd.h" + +#include "aom/aom_codec.h" +#include "aom_ports/aom_timer.h" +#include "av1/encoder/encoder.h" +#include "av1/common/scan.h" +#include "test/acm_random.h" +#include "test/register_state_check.h" +#include "test/util.h" + +namespace { +using libaom_test::ACMRandom; + +#define QUAN_LP_PARAM_LIST \ + const int16_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, \ + const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, \ + const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, \ + const int16_t *iscan + +typedef void (*QuantizeFunc)(QUAN_LP_PARAM_LIST); + +using std::tuple; +typedef tuple<QuantizeFunc, QuantizeFunc, TX_SIZE, aom_bit_depth_t> + QuantizeParam; + +typedef struct { + QUANTS quant; + Dequants dequant; +} QuanTable; + +const int kTestNum = 1000; + +template <typename CoeffType> +class QuantizeTestBase : public ::testing::TestWithParam<QuantizeParam> { + protected: + QuantizeTestBase() + : quant_ref_(GET_PARAM(0)), quant_(GET_PARAM(1)), tx_size_(GET_PARAM(2)), + bd_(GET_PARAM(3)) {} + + virtual ~QuantizeTestBase() {} + + virtual void SetUp() { + qtab_ = reinterpret_cast<QuanTable *>(aom_memalign(32, sizeof(*qtab_))); + const int n_coeffs = coeff_num(); + coeff_ = reinterpret_cast<CoeffType *>( + aom_memalign(32, 6 * n_coeffs * sizeof(CoeffType))); + InitQuantizer(); + } + + virtual void TearDown() { + aom_free(qtab_); + qtab_ = NULL; + aom_free(coeff_); + coeff_ = NULL; + } + + void InitQuantizer() { + av1_build_quantizer(bd_, 0, 0, 0, 0, 0, &qtab_->quant, &qtab_->dequant); + } + + virtual void RunQuantizeFunc(const CoeffType *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, + const int16_t *quant_ptr, CoeffType *qcoeff_ptr, + CoeffType *qcoeff_ref_ptr, + CoeffType *dqcoeff_ptr, + CoeffType *dqcoeff_ref_ptr, + const int16_t *dequant_ptr, + uint16_t *eob_ref_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) = 0; + + void QuantizeRun(bool is_loop, int q = 0, int test_num = 1) { + CoeffType *coeff_ptr = coeff_; + const intptr_t n_coeffs = coeff_num(); + + CoeffType *qcoeff_ref = coeff_ptr + n_coeffs; + CoeffType *dqcoeff_ref = qcoeff_ref + n_coeffs; + + CoeffType *qcoeff = dqcoeff_ref + n_coeffs; + CoeffType *dqcoeff = qcoeff + n_coeffs; + uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs); + + // Testing uses 2-D DCT scan order table + const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT); + + // Testing uses luminance quantization table + const int16_t *round = 0; + const int16_t *quant = 0; + round = qtab_->quant.y_round_fp[q]; + quant = qtab_->quant.y_quant_fp[q]; + + const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; + + for (int i = 0; i < test_num; ++i) { + if (is_loop) FillCoeffRandom(); + + memset(qcoeff_ref, 0, 5 * n_coeffs * sizeof(*qcoeff_ref)); + + RunQuantizeFunc(coeff_ptr, n_coeffs, round, quant, qcoeff, qcoeff_ref, + dqcoeff, dqcoeff_ref, dequant, &eob[0], &eob[1], sc->scan, + sc->iscan); + + quant_ref_(coeff_ptr, n_coeffs, round, quant, qcoeff_ref, dqcoeff_ref, + dequant, &eob[0], sc->scan, sc->iscan); + + API_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, round, quant, qcoeff, + dqcoeff, dequant, &eob[1], sc->scan, + sc->iscan)); + + for (int j = 0; j < n_coeffs; ++j) { + ASSERT_EQ(qcoeff_ref[j], qcoeff[j]) + << "Q mismatch on test: " << i << " at position: " << j + << " Q: " << q << " coeff: " << coeff_ptr[j]; + } + + for (int j = 0; j < n_coeffs; ++j) { + ASSERT_EQ(dqcoeff_ref[j], dqcoeff[j]) + << "Dq mismatch on test: " << i << " at position: " << j + << " Q: " << q << " coeff: " << coeff_ptr[j]; + } + + ASSERT_EQ(eob[0], eob[1]) + << "eobs mismatch on test: " << i << " Q: " << q; + } + } + + void CompareResults(const CoeffType *buf_ref, const CoeffType *buf, int size, + const char *text, int q, int number) { + int i; + for (i = 0; i < size; ++i) { + ASSERT_EQ(buf_ref[i], buf[i]) << text << " mismatch on test: " << number + << " at position: " << i << " Q: " << q; + } + } + + int coeff_num() const { return av1_get_max_eob(tx_size_); } + + void FillCoeff(CoeffType c) { + const int n_coeffs = coeff_num(); + for (int i = 0; i < n_coeffs; ++i) { + coeff_[i] = c; + } + } + + void FillCoeffRandom() { + const int n_coeffs = coeff_num(); + FillCoeffZero(); + int num = rnd_.Rand16() % n_coeffs; + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + + void FillCoeffRandomRows(int num) { + FillCoeffZero(); + for (int i = 0; i < num; ++i) { + coeff_[i] = GetRandomCoeff(); + } + } + + void FillCoeffZero() { FillCoeff(0); } + + void FillCoeffConstant() { + CoeffType c = GetRandomCoeff(); + FillCoeff(c); + } + + void FillDcOnly() { + FillCoeffZero(); + coeff_[0] = GetRandomCoeff(); + } + + void FillDcLargeNegative() { + FillCoeffZero(); + // Generate a qcoeff which contains 512/-512 (0x0100/0xFE00) to catch issues + // like BUG=883 where the constant being compared was incorrectly + // initialized. + coeff_[0] = -8191; + } + + CoeffType GetRandomCoeff() { + CoeffType coeff; + if (bd_ == AOM_BITS_8) { + coeff = + clamp(static_cast<int16_t>(rnd_.Rand16()), INT16_MIN + 1, INT16_MAX); + } else { + CoeffType min = -(1 << (7 + bd_)); + CoeffType max = -min - 1; + coeff = clamp(static_cast<CoeffType>(rnd_.Rand31()), min, max); + } + return coeff; + } + + ACMRandom rnd_; + QuanTable *qtab_; + CoeffType *coeff_; + QuantizeFunc quant_ref_; + QuantizeFunc quant_; + TX_SIZE tx_size_; + aom_bit_depth_t bd_; +}; + +class FullPrecisionQuantizeLpTest : public QuantizeTestBase<int16_t> { + void RunQuantizeFunc(const int16_t *coeff_ptr, intptr_t n_coeffs, + const int16_t *round_ptr, const int16_t *quant_ptr, + int16_t *qcoeff_ptr, int16_t *qcoeff_ref_ptr, + int16_t *dqcoeff_ptr, int16_t *dqcoeff_ref_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ref_ptr, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) override { + quant_ref_(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ref_ptr, + dqcoeff_ref_ptr, dequant_ptr, eob_ref_ptr, scan, iscan); + + API_REGISTER_STATE_CHECK(quant_(coeff_ptr, n_coeffs, round_ptr, quant_ptr, + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, + eob_ptr, scan, iscan)); + } +}; + +TEST_P(FullPrecisionQuantizeLpTest, ZeroInput) { + FillCoeffZero(); + QuantizeRun(false); +} + +TEST_P(FullPrecisionQuantizeLpTest, LargeNegativeInput) { + FillDcLargeNegative(); + QuantizeRun(false, 0, 1); +} + +TEST_P(FullPrecisionQuantizeLpTest, DcOnlyInput) { + FillDcOnly(); + QuantizeRun(false, 0, 1); +} + +TEST_P(FullPrecisionQuantizeLpTest, RandomInput) { + QuantizeRun(true, 0, kTestNum); +} + +TEST_P(FullPrecisionQuantizeLpTest, MultipleQ) { + for (int q = 0; q < QINDEX_RANGE; ++q) { + QuantizeRun(true, q, kTestNum); + } +} + +// Force the coeff to be half the value of the dequant. This exposes a +// mismatch found in av1_quantize_fp_sse2(). +TEST_P(FullPrecisionQuantizeLpTest, CoeffHalfDequant) { + FillCoeff(16); + QuantizeRun(false, 25, 1); +} + +TEST_P(FullPrecisionQuantizeLpTest, DISABLED_Speed) { + int16_t *coeff_ptr = coeff_; + const intptr_t n_coeffs = coeff_num(); + + int16_t *qcoeff_ref = coeff_ptr + n_coeffs; + int16_t *dqcoeff_ref = qcoeff_ref + n_coeffs; + + int16_t *qcoeff = dqcoeff_ref + n_coeffs; + int16_t *dqcoeff = qcoeff + n_coeffs; + uint16_t *eob = (uint16_t *)(dqcoeff + n_coeffs); + + // Testing uses 2-D DCT scan order table + const SCAN_ORDER *const sc = get_default_scan(tx_size_, DCT_DCT); + + // Testing uses luminance quantization table + const int q = 22; + const int16_t *round_fp = qtab_->quant.y_round_fp[q]; + const int16_t *quant_fp = qtab_->quant.y_quant_fp[q]; + const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q]; + const int kNumTests = 5000000; + aom_usec_timer timer, simd_timer; + int rows = tx_size_high[tx_size_]; + int cols = tx_size_wide[tx_size_]; + rows = AOMMIN(32, rows); + cols = AOMMIN(32, cols); + for (int cnt = 0; cnt <= rows; cnt++) { + FillCoeffRandomRows(cnt * cols); + + aom_usec_timer_start(&timer); + for (int n = 0; n < kNumTests; ++n) { + quant_ref_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff, + dequant, eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&timer); + + aom_usec_timer_start(&simd_timer); + for (int n = 0; n < kNumTests; ++n) { + quant_(coeff_ptr, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff, dequant, + eob, sc->scan, sc->iscan); + } + aom_usec_timer_mark(&simd_timer); + + const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer)); + const int simd_elapsed_time = + static_cast<int>(aom_usec_timer_elapsed(&simd_timer)); + printf("c_time = %d \t simd_time = %d \t Gain = %f \n", elapsed_time, + simd_elapsed_time, ((float)elapsed_time / simd_elapsed_time)); + } +} + +using std::make_tuple; + +#if HAVE_AVX2 +const QuantizeParam kQParamArrayAVX2[] = { + // av1_quantize_lp is only called in nonrd_pickmode.c, and is used for 16X16, + // 8X8, and 4X4. + make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, + static_cast<TX_SIZE>(TX_16X16), AOM_BITS_8), + make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, + static_cast<TX_SIZE>(TX_8X8), AOM_BITS_8), + make_tuple(&av1_quantize_lp_c, &av1_quantize_lp_avx2, + static_cast<TX_SIZE>(TX_4X4), AOM_BITS_8) +}; + +INSTANTIATE_TEST_SUITE_P(AVX2, FullPrecisionQuantizeLpTest, + ::testing::ValuesIn(kQParamArrayAVX2)); +#endif + +} // namespace
diff --git a/test/test.cmake b/test/test.cmake index 9f58511..3efcfac 100644 --- a/test/test.cmake +++ b/test/test.cmake
@@ -277,6 +277,7 @@ "${AOM_ROOT}/test/obmc_variance_test.cc" "${AOM_ROOT}/test/pickrst_test.cc" "${AOM_ROOT}/test/quantize_func_test.cc" + "${AOM_ROOT}/test/quantize_lp_func_test.cc" "${AOM_ROOT}/test/sad_test.cc" "${AOM_ROOT}/test/subtract_test.cc" "${AOM_ROOT}/test/reconinter_test.cc" @@ -323,7 +324,8 @@ if(NOT (HAVE_SSE2 OR HAVE_NEON)) list(REMOVE_ITEM AOM_UNIT_TEST_ENCODER_SOURCES - "${AOM_ROOT}/test/quantize_func_test.cc") + "${AOM_ROOT}/test/quantize_func_test.cc" + "${AOM_ROOT}/test/quantize_lp_func_test.cc") endif() if(HAVE_SSE4_1)