Optimize aom_highbd_quantize_b_64x64 module
Added SSE2 variant for aom_highbd_quantize_b_64x64_c
When tested for multiple test cases observed 0.3%
average reduction in encoder time for speed = 4 preset.
Module gains improved by factor of ~1.5x w.r.t C code.
Change-Id: If17d4cd853f4db4d3caa2d1977939d441961558a
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ff0f653..927e352 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -533,7 +533,7 @@
specialize qw/aom_highbd_quantize_b_32x32 sse2/;
add_proto qw/void aom_highbd_quantize_b_64x64/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-
+ specialize qw/aom_highbd_quantize_b_64x64 sse2/;
} # CONFIG_AV1_ENCODER
#
diff --git a/aom_dsp/x86/highbd_quantize_intrin_sse2.c b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
index 58e5f98..2f4ffd3 100644
--- a/aom_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/aom_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -146,3 +146,61 @@
}
*eob_ptr = eob + 1;
}
+
+void aom_highbd_quantize_b_64x64_sse2(
+ const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ __m128i zbins[2];
+ __m128i nzbins[2];
+ int idx = 0;
+ int idx_arr[1024];
+ int i, eob = -1;
+ const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 2);
+ const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 2);
+ (void)scan;
+ zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
+ zbins[1] = _mm_set1_epi32(zbin1_tmp);
+
+ nzbins[0] = _mm_setzero_si128();
+ nzbins[1] = _mm_setzero_si128();
+ nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
+ nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
+
+ memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+ memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs / 4; i++) {
+ __m128i coeffs, cmp1, cmp2;
+ int test;
+ coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
+ cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
+ cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
+ cmp1 = _mm_and_si128(cmp1, cmp2);
+ test = _mm_movemask_epi8(cmp1);
+ if (!(test & 0xf)) idx_arr[idx++] = i * 4;
+ if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
+ if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
+ if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ const int rc = idx_arr[i];
+ const int coeff = coeff_ptr[rc];
+ const int coeff_sign = (coeff >> 31);
+ const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+ const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 2);
+ const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+ const uint32_t abs_qcoeff =
+ (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 14);
+ qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
+ dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 4;
+ if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
+ }
+ *eob_ptr = eob + 1;
+}
diff --git a/test/quantize_func_test.cc b/test/quantize_func_test.cc
index 2975c1d..e7afed9 100644
--- a/test/quantize_func_test.cc
+++ b/test/quantize_func_test.cc
@@ -398,6 +398,12 @@
TX_32X32, TYPE_B, AOM_BITS_10),
make_tuple(&aom_highbd_quantize_b_32x32_c, &aom_highbd_quantize_b_32x32_sse2,
TX_32X32, TYPE_B, AOM_BITS_12),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ TX_64X64, TYPE_B, AOM_BITS_8),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ TX_64X64, TYPE_B, AOM_BITS_10),
+ make_tuple(&aom_highbd_quantize_b_64x64_c, &aom_highbd_quantize_b_64x64_sse2,
+ TX_64X64, TYPE_B, AOM_BITS_12)
};
INSTANTIATE_TEST_CASE_P(SSE2, QuantizeTest,