Optimize aom_quantize_b_adaptive module
Add an SSE2 variant of aom_quantize_b_adaptive_c.
For the speed 1, 2, 3 and 4 presets, encode time is reduced by
1.2%, 1.84%, 5.4% and 2.38% respectively (averaged across multiple test cases).
The module itself is ~4.3x faster than the C code.
Change-Id: Iee2d40653d2263805c6505e9eeb8659d2d5499c9
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 10528d5..abf6a60 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -194,6 +194,7 @@
"${AOM_ROOT}/aom_dsp/x86/highbd_subtract_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/highbd_variance_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/quantize_sse2.c"
+ "${AOM_ROOT}/aom_dsp/x86/adaptive_quantize_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/quantize_x86.h"
"${AOM_ROOT}/aom_dsp/x86/sum_squares_sse2.c"
"${AOM_ROOT}/aom_dsp/x86/variance_sse2.c")
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 18a1852..0137eaf 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -518,6 +518,9 @@
add_proto qw/void aom_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_quantize_b sse2/, "$ssse3_x86_64", "$avx_x86_64";
+ add_proto qw/void aom_quantize_b_adaptive/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+ specialize qw/aom_quantize_b_adaptive sse2/;
+
add_proto qw/void aom_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/aom_quantize_b_32x32/, "$ssse3_x86_64", "$avx_x86_64";
diff --git a/aom_dsp/quantize.c b/aom_dsp/quantize.c
index 1c29de8..ced34b4 100644
--- a/aom_dsp/quantize.c
+++ b/aom_dsp/quantize.c
@@ -11,9 +11,7 @@
#include "aom_dsp/quantize.h"
#include "aom_mem/aom_mem.h"
-
-#define EOB_FACTOR 325
-#define SKIP_EOB_FACTOR_ADJUST 200
+#include "av1/encoder/av1_quantize.h"
void quantize_b_adaptive_helper_c(
const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
diff --git a/aom_dsp/x86/adaptive_quantize_sse2.c b/aom_dsp/x86/adaptive_quantize_sse2.c
new file mode 100644
index 0000000..21db867
--- /dev/null
+++ b/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -0,0 +1,214 @@
+/*
+ * Copyright (c) 2019, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include "config/aom_dsp_rtcd.h"
+#include "aom/aom_integer.h"
+#include "av1/encoder/av1_quantize.h"
+#include "aom_dsp/x86/quantize_x86.h"
+// SSE2 version of aom_quantize_b_adaptive: fills qcoeff_ptr/dqcoeff_ptr for n_coeffs positions and sets *eob_ptr.
+void aom_quantize_b_adaptive_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ const __m128i zero = _mm_setzero_si128();
+ int index = 16;
+ int non_zero_count = (int)n_coeffs;
+ __m128i zbin, round, quant, dequant, shift;
+ __m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
+ __m128i qcoeff0, qcoeff1;
+ __m128i cmp_mask0, cmp_mask1;
+ __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], 0) };
+ // Extra prescan margin per position class: [0] is used when rc == 0, [1] otherwise.
+ int prescan_add[2];
+ for (int i = 0; i < 2; ++i)
+ prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
+ // prescan[] is a per-position 16-bit mask that is ANDed with the zbin compare results below.
+ // The buffer has 256 entries because this function is expected to be called
+ // with n_coeffs of at most 256 (the bound is not checked here).
+ int16_t prescan[256];
+ memset(prescan, -1, n_coeffs * sizeof(int16_t));
+ // All bits start set (keep). Walking the scan backwards, clear each trailing entry whose |coeff| is below zbin + margin.
+ // TODO(Aniket): Experiment with an intrinsics version of the following loop
+ for (int i = (int)n_coeffs - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ const qm_val_t wt = 1 << AOM_QM_BITS;
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int prescan_add_val = prescan_add[rc != 0];
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ prescan[rc] = 0;
+ non_zero_count--;
+ } else {
+ break;  // First coeff (from the end) that reaches the threshold ends the prescan.
+ }
+ }
+#if SKIP_EOB_FACTOR_ADJUST
+ int first = -1;  // Scan position of the first nonzero qcoeff; stays -1 if none is found.
+#endif
+ // Setup global values.
+ load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
+ dequant_ptr, &dequant, quant_shift_ptr, &shift);
+
+ // Do DC and first 15 AC.
+ coeff0 = load_coefficients(coeff_ptr);
+ coeff1 = load_coefficients(coeff_ptr + 8);
+
+ // Poor man's abs().
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ prescan0 = _mm_loadu_si128((const __m128i *)prescan);
+ prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+ // A lane survives only if |coeff| > zbin and its prescan entry is still set.
+ cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
+ cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+ // If no lane survives, zero the outputs with four 128-bit stores each and skip the quantize math.
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + 12), zero);
+ round = _mm_unpackhi_epi64(round, round);  // Same high-half switch as zbin, needed by the loop below.
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ } else {
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+
+ round = _mm_unpackhi_epi64(round, round);
+ quant = _mm_unpackhi_epi64(quant, quant);
+ shift = _mm_unpackhi_epi64(shift, shift);
+
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ // Reinsert signs
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ // Mask out zbin threshold coeffs
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr);
+ store_coefficients(qcoeff1, qcoeff_ptr + 8);
+ // Dequantize; dequant is switched to its high half between the two vectors.
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ dequant = _mm_unpackhi_epi64(dequant, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr);
+ store_coefficients(coeff1, dqcoeff_ptr + 8);
+ // Seed the running eob vector from this first group of 16.
+ eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+ }
+
+ // AC only loop.
+ // TODO(Aniket): Reduce the processing of coeff quantization
+ // based on eob logic
+ while (index < n_coeffs) {
+ coeff0 = load_coefficients(coeff_ptr + index);
+ coeff1 = load_coefficients(coeff_ptr + index + 8);
+
+ coeff0_sign = _mm_srai_epi16(coeff0, 15);
+ coeff1_sign = _mm_srai_epi16(coeff1, 15);
+ qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+ prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
+ prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+
+ cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+ // Groups of 16 with no surviving lane are zeroed and skipped.
+ all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
+ if (_mm_movemask_epi8(all_zero) == 0) {
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(qcoeff_ptr + index + 12), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 4), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 8), zero);
+ _mm_store_si128((__m128i *)(dqcoeff_ptr + index + 12), zero);
+ index += 16;
+ continue;
+ }
+ calculate_qcoeff(&qcoeff0, round, quant, shift);
+ calculate_qcoeff(&qcoeff1, round, quant, shift);
+
+ qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+ qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+ qcoeff0 = _mm_and_si128(qcoeff0, cmp_mask0);
+ qcoeff1 = _mm_and_si128(qcoeff1, cmp_mask1);
+
+ store_coefficients(qcoeff0, qcoeff_ptr + index);
+ store_coefficients(qcoeff1, qcoeff_ptr + index + 8);
+
+ coeff0 = calculate_dqcoeff(qcoeff0, dequant);
+ coeff1 = calculate_dqcoeff(qcoeff1, dequant);
+
+ store_coefficients(coeff0, dqcoeff_ptr + index);
+ store_coefficients(coeff1, dqcoeff_ptr + index + 8);
+
+ eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
+ zero);
+ eob = _mm_max_epi16(eob, eob0);  // Per-lane running maximum.
+ index += 16;
+ }
+
+ *eob_ptr = accumulate_eob(eob);
+ // Post-pass compiled in only when SKIP_EOB_FACTOR_ADJUST is nonzero.
+#if SKIP_EOB_FACTOR_ADJUST
+ // TODO(Aniket): Experiment with an intrinsics version of the following loop,
+ // combined with the quantization loop above
+ for (int i = 0; i < non_zero_count; i++) {
+ const int rc = scan[i];
+ const int qcoeff = qcoeff_ptr[rc];
+ if (qcoeff) {
+ first = i;
+ break;
+ }
+ }
+ if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {  // First nonzero qcoeff sits at scan position eob - 1.
+ const int rc = scan[(*eob_ptr - 1)];
+ if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {  // Magnitude is exactly 1.
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ const int coeff = coeff_ptr[rc] * wt;
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;  // Larger margin than the prescan's EOB_FACTOR alone.
+ const int prescan_add_val =
+ ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
+ if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ *eob_ptr = 0;
+ }
+ }
+ }
+#endif
+}
diff --git a/av1/encoder/av1_quantize.c b/av1/encoder/av1_quantize.c
index 162724b..7466a0b 100644
--- a/av1/encoder/av1_quantize.c
+++ b/av1/encoder/av1_quantize.c
@@ -276,10 +276,10 @@
} else {
switch (qparam->log_scale) {
case 0:
- aom_quantize_b_adaptive_c(
- coeff_ptr, n_coeffs, p->zbin_QTX, p->round_QTX, p->quant_QTX,
- p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr, p->dequant_QTX,
- eob_ptr, sc->scan, sc->iscan);
+ aom_quantize_b_adaptive(coeff_ptr, n_coeffs, p->zbin_QTX,
+ p->round_QTX, p->quant_QTX,
+ p->quant_shift_QTX, qcoeff_ptr, dqcoeff_ptr,
+ p->dequant_QTX, eob_ptr, sc->scan, sc->iscan);
break;
case 1:
aom_quantize_b_32x32_adaptive_c(
diff --git a/av1/encoder/av1_quantize.h b/av1/encoder/av1_quantize.h
index c709234..6419265 100644
--- a/av1/encoder/av1_quantize.h
+++ b/av1/encoder/av1_quantize.h
@@ -22,6 +22,9 @@
extern "C" {
#endif
+#define EOB_FACTOR 325
+#define SKIP_EOB_FACTOR_ADJUST 200
+
typedef struct QUANT_PARAM {
int log_scale;
TX_SIZE tx_size;
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index e7afed9..907181d 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -191,6 +191,13 @@
}
}
+ void FillCoeffRandomRows(int num) {  // Randomizes only the first |num| entries of coeff_.
+ FillCoeffZero();  // Reset first; FillCoeffZero() is FillCoeff(0).
+ for (int i = 0; i < num; ++i) {
+ coeff_[i] = GetRandomCoeff();  // Indices [0, num) receive values from GetRandomCoeff().
+ }
+ }
+
void FillCoeffZero() { FillCoeff(0); }
void FillCoeffConstant() {
@@ -287,28 +294,31 @@
const int16_t *dequant = qtab_->dequant.y_dequant_QTX[q];
const int kNumTests = 5000000;
aom_usec_timer timer, simd_timer;
+ int rows = tx_size_high[tx_size_];
+ int cols = tx_size_wide[tx_size_];
+ for (int cnt = 0; cnt <= rows; cnt++) {
+ FillCoeffRandomRows(cnt * cols);
- FillCoeffRandom();
+ aom_usec_timer_start(&timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
+ qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ }
+ aom_usec_timer_mark(&timer);
- aom_usec_timer_start(&timer);
- for (int n = 0; n < kNumTests; ++n) {
- quant_ref_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift,
- qcoeff, dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ aom_usec_timer_start(&simd_timer);
+ for (int n = 0; n < kNumTests; ++n) {
+ quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
+ dqcoeff, dequant, eob, sc->scan, sc->iscan);
+ }
+ aom_usec_timer_mark(&simd_timer);
+
+ const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+ const int simd_elapsed_time =
+ static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
+ printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
+ simd_elapsed_time, (elapsed_time / simd_elapsed_time));
}
- aom_usec_timer_mark(&timer);
-
- aom_usec_timer_start(&simd_timer);
- for (int n = 0; n < kNumTests; ++n) {
- quant_(coeff_ptr, n_coeffs, zbin, round_fp, quant_fp, quant_shift, qcoeff,
- dqcoeff, dequant, eob, sc->scan, sc->iscan);
- }
- aom_usec_timer_mark(&simd_timer);
-
- const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
- const int simd_elapsed_time =
- static_cast<int>(aom_usec_timer_elapsed(&simd_timer));
- printf("c_time = %d \t simd_time = %d \t Gain = %d \n", elapsed_time,
- simd_elapsed_time, (elapsed_time / simd_elapsed_time));
}
using ::testing::make_tuple;
@@ -403,7 +413,13 @@
make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
TX_64X64, TYPE_B, AOM_BITS_10),
make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
- TX_64X64, TYPE_B, AOM_BITS_12)
+ TX_64X64, TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2,
+ TX_16X16, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_8X8,
+ TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_quantize_b_adaptive_c, &aom_quantize_b_adaptive_sse2, TX_4X4,
+ TYPE_B, AOM_BITS_8)
};
INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,