Optimize adaptive quantization modules
SSE2 variants of aom_quantize_b_adaptive and aom_quantize_b_32x32_adaptive
are optimized for improved performance.
For the speed 3 and 4 presets, encode time is reduced by
1.07% and 0.67% respectively (averaged across multiple test cases).
The performance of these functions improved by a factor of ~1.7x.
Change-Id: Icb0769fc44b3a75b18b530c3d0ab172d8a99a0a0
diff --git a/aom_dsp/x86/adaptive_quantize_sse2.c b/aom_dsp/x86/adaptive_quantize_sse2.c
index 3822c27..fc9de59 100644
--- a/aom_dsp/x86/adaptive_quantize_sse2.c
+++ b/aom_dsp/x86/adaptive_quantize_sse2.c
@@ -22,41 +22,31 @@
const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- const __m128i zero = _mm_setzero_si128();
int index = 16;
- int non_zero_count = (int)n_coeffs;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
+ const __m128i zero = _mm_setzero_si128();
__m128i zbin, round, quant, dequant, shift;
__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
__m128i qcoeff0, qcoeff1;
__m128i cmp_mask0, cmp_mask1;
- __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
- const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], 0),
- ROUND_POWER_OF_TWO(zbin_ptr[1], 0) };
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
int prescan_add[2];
- for (int i = 0; i < 2; ++i)
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
-
- // max buffer is of size 256 as this functions calls with
- // maximum n_coeffs as 256
- int16_t prescan[256];
- memset(prescan, -1, n_coeffs * sizeof(int16_t));
-
- // TODO(Aniket): Experiment the following loop with intrinsic
- for (int i = (int)n_coeffs - 1; i >= 0; i--) {
- const int rc = scan[i];
- const qm_val_t wt = 1 << AOM_QM_BITS;
- const int coeff = coeff_ptr[rc] * wt;
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int prescan_add_val = prescan_add[rc != 0];
- if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
- prescan[rc] = 0;
- non_zero_count--;
- } else {
- break;
- }
+ thresh[i] = (zbin_ptr[i] * wt + prescan_add[i]) - 1;
}
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
#if SKIP_EOB_FACTOR_ADJUST
int first = -1;
#endif
@@ -74,13 +64,15 @@
qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
- prescan0 = _mm_loadu_si128((const __m128i *)prescan);
- prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
- cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
if (_mm_movemask_epi8(all_zero) == 0) {
_mm_store_si128((__m128i *)(qcoeff_ptr), zero);
@@ -121,13 +113,9 @@
store_coefficients(coeff0, dqcoeff_ptr);
store_coefficients(coeff1, dqcoeff_ptr + 8);
-
- eob = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
}
// AC only loop.
- // TODO(Aniket): Reduce the processing of coeff quatization
- // based on eob logic
while (index < n_coeffs) {
coeff0 = load_coefficients(coeff_ptr + index);
coeff1 = load_coefficients(coeff_ptr + index + 8);
@@ -137,11 +125,13 @@
qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
- prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
- prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
- cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
- cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
if (_mm_movemask_epi8(all_zero) == 0) {
@@ -174,14 +164,27 @@
store_coefficients(coeff0, dqcoeff_ptr + index);
store_coefficients(coeff1, dqcoeff_ptr + index + 8);
- eob0 = scan_for_eob(&coeff0, &coeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
- eob = _mm_max_epi16(eob, eob0);
index += 16;
}
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
- *eob_ptr = accumulate_eob(eob);
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
// TODO(Aniket): Experiment the following loop with intrinsic by combining
// with the quantization loop above
@@ -196,14 +199,14 @@
if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
const int rc = scan[(*eob_ptr - 1)];
if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
- const qm_val_t wt = (1 << AOM_QM_BITS);
const int coeff = coeff_ptr[rc] * wt;
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
const int factor = EOB_FACTOR + SKIP_EOB_FACTOR_ADJUST;
const int prescan_add_val =
ROUND_POWER_OF_TWO(dequant_ptr[rc != 0] * factor, 7);
- if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
+ if (abs_coeff <
+ (zbin_ptr[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
qcoeff_ptr[rc] = 0;
dqcoeff_ptr[rc] = 0;
*eob_ptr = 0;
@@ -220,8 +223,11 @@
tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
int index = 16;
- int non_zero_count = (int)n_coeffs;
const int log_scale = 1;
+ int non_zero_count = 0;
+ int non_zero_count_prescan_add_zero = 0;
+ int is_found0 = 0, is_found1 = 0;
+ int eob = -1;
const __m128i zero = _mm_setzero_si128();
const __m128i one = _mm_set1_epi16(1);
const __m128i log_scale_vec = _mm_set1_epi16(log_scale);
@@ -229,34 +235,23 @@
__m128i coeff0, coeff1, coeff0_sign, coeff1_sign;
__m128i qcoeff0, qcoeff1;
__m128i cmp_mask0, cmp_mask1;
- __m128i eob = zero, eob0, prescan0, prescan1, all_zero;
+ __m128i all_zero;
+ __m128i mask0 = zero, mask1 = zero;
+
const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
-
int prescan_add[2];
- for (int i = 0; i < 2; ++i)
+ int thresh[4];
+ const qm_val_t wt = (1 << AOM_QM_BITS);
+ for (int i = 0; i < 2; ++i) {
prescan_add[i] = ROUND_POWER_OF_TWO(dequant_ptr[i] * EOB_FACTOR, 7);
-
- // max buffer is of size 1024 as this functions calls with
- // maximum n_coeffs as 1024
- int16_t prescan[1024];
- memset(prescan, -1, n_coeffs * sizeof(int16_t));
-
- // TODO(Aniket): Experiment the following loop with intrinsic
- for (int i = (int)n_coeffs - 1; i >= 0; i--) {
- const int rc = scan[i];
- const qm_val_t wt = 1 << AOM_QM_BITS;
- const int coeff = coeff_ptr[rc] * wt;
- const int coeff_sign = (coeff >> 31);
- const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
- const int prescan_add_val = prescan_add[rc != 0];
- if (abs_coeff < (zbins[rc != 0] * (1 << AOM_QM_BITS) + prescan_add_val)) {
- prescan[rc] = 0;
- non_zero_count--;
- } else {
- break;
- }
+ thresh[i] = (zbins[i] * wt + prescan_add[i]) - 1;
}
+ thresh[2] = thresh[3] = thresh[1];
+ __m128i threshold[2];
+ threshold[0] = _mm_loadu_si128((__m128i *)&thresh[0]);
+ threshold[1] = _mm_unpackhi_epi64(threshold[0], threshold[0]);
+
#if SKIP_EOB_FACTOR_ADJUST
int first = -1;
#endif
@@ -273,6 +268,7 @@
zbin = _mm_srli_epi16(zbin, log_scale);
round = _mm_srli_epi16(round, log_scale);
zbin = _mm_sub_epi16(zbin, one);
+
// Do DC and first 15 AC.
coeff0 = load_coefficients(coeff_ptr);
coeff1 = load_coefficients(coeff_ptr + 8);
@@ -282,13 +278,15 @@
qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
- prescan0 = _mm_loadu_si128((const __m128i *)prescan);
- prescan1 = _mm_loadu_si128((const __m128i *)(prescan + 8));
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan, &is_found0, &mask0);
- cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
zbin = _mm_unpackhi_epi64(zbin, zbin); // Switch DC to AC
- cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan, &is_found1, &mask1);
+
+ threshold[0] = threshold[1];
all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
if (_mm_movemask_epi8(all_zero) == 0) {
_mm_store_si128((__m128i *)(qcoeff_ptr), zero);
@@ -305,11 +303,9 @@
dequant = _mm_unpackhi_epi64(dequant, dequant);
} else {
calculate_qcoeff_log_scale(&qcoeff0, round, quant, &shift, &log_scale);
-
round = _mm_unpackhi_epi64(round, round);
quant = _mm_unpackhi_epi64(quant, quant);
shift = _mm_unpackhi_epi64(shift, shift);
-
calculate_qcoeff_log_scale(&qcoeff1, round, quant, &shift, &log_scale);
// Reinsert signs
@@ -328,14 +324,9 @@
dequant = _mm_unpackhi_epi64(dequant, dequant);
calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
dqcoeff_ptr + 8, &log_scale);
-
- eob =
- scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
}
// AC only loop.
- // TODO(Aniket): Reduce the processing of coeff quatization
- // based on eob logic
while (index < n_coeffs) {
coeff0 = load_coefficients(coeff_ptr + index);
coeff1 = load_coefficients(coeff_ptr + index + 8);
@@ -345,11 +336,13 @@
qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
- prescan0 = _mm_loadu_si128((const __m128i *)(prescan + index));
- prescan1 = _mm_loadu_si128((const __m128i *)(prescan + index + 8));
+ update_mask0(&qcoeff0, &qcoeff1, threshold, iscan + index, &is_found0,
+ &mask0);
- cmp_mask0 = _mm_and_si128(prescan0, _mm_cmpgt_epi16(qcoeff0, zbin));
- cmp_mask1 = _mm_and_si128(prescan1, _mm_cmpgt_epi16(qcoeff1, zbin));
+ cmp_mask0 = _mm_cmpgt_epi16(qcoeff0, zbin);
+ cmp_mask1 = _mm_cmpgt_epi16(qcoeff1, zbin);
+
+ update_mask1(&cmp_mask0, &cmp_mask1, iscan + index, &is_found1, &mask1);
all_zero = _mm_or_si128(cmp_mask0, cmp_mask1);
if (_mm_movemask_epi8(all_zero) == 0) {
@@ -380,15 +373,27 @@
dqcoeff_ptr + index, &log_scale);
calculate_dqcoeff_and_store_log_scale(qcoeff1, dequant, zero,
dqcoeff_ptr + index + 8, &log_scale);
-
- eob0 = scan_for_eob(&qcoeff0, &qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
- zero);
- eob = _mm_max_epi16(eob, eob0);
index += 16;
}
+ if (is_found0) non_zero_count = calculate_non_zero_count(mask0);
+ if (is_found1)
+ non_zero_count_prescan_add_zero = calculate_non_zero_count(mask1);
- *eob_ptr = accumulate_eob(eob);
+ for (int i = non_zero_count_prescan_add_zero - 1; i >= non_zero_count; i--) {
+ const int rc = scan[i];
+ qcoeff_ptr[rc] = 0;
+ dqcoeff_ptr[rc] = 0;
+ }
+ for (int i = non_zero_count - 1; i >= 0; i--) {
+ const int rc = scan[i];
+ if (qcoeff_ptr[rc]) {
+ eob = i;
+ break;
+ }
+ }
+
+ *eob_ptr = eob + 1;
#if SKIP_EOB_FACTOR_ADJUST
// TODO(Aniket): Experiment the following loop with intrinsic by combining
// with the quantization loop above
@@ -403,7 +408,6 @@
if ((*eob_ptr - 1) >= 0 && first == (*eob_ptr - 1)) {
const int rc = scan[(*eob_ptr - 1)];
if (qcoeff_ptr[rc] == 1 || qcoeff_ptr[rc] == -1) {
- const qm_val_t wt = (1 << AOM_QM_BITS);
const int coeff = coeff_ptr[rc] * wt;
const int coeff_sign = (coeff >> 31);
const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
diff --git a/aom_dsp/x86/quantize_x86.h b/aom_dsp/x86/quantize_x86.h
index b2de01b..5b040a2 100644
--- a/aom_dsp/x86/quantize_x86.h
+++ b/aom_dsp/x86/quantize_x86.h
@@ -143,3 +143,60 @@
_mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
_mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
}
+
+// Tracks the highest iscan position, per 16-bit lane, whose coefficient
+// passed the zbin compare. cmp_mask0/cmp_mask1 hold per-lane 16-bit compare
+// results (0x0000 or 0xFFFF) for one group of 16 coefficients; passing lanes
+// select their iscan value and the running per-lane max is accumulated into
+// *mask. *is_found is set once any lane in any call has passed.
+static INLINE void update_mask1(__m128i *cmp_mask0, __m128i *cmp_mask1,
+                                const int16_t *iscan_ptr, int *is_found,
+                                __m128i *mask) {
+  __m128i all_zero;
+  __m128i temp_mask = _mm_setzero_si128();
+  all_zero = _mm_or_si128(*cmp_mask0, *cmp_mask1);
+  if (_mm_movemask_epi8(all_zero)) {
+    __m128i iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr));
+    // Compare masks are all-ones in passing lanes, so the AND keeps the iscan
+    // value for passing lanes and zeroes the rest.
+    __m128i mask0 = _mm_and_si128(*cmp_mask0, iscan0);
+    __m128i iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + 8));
+    __m128i mask1 = _mm_and_si128(*cmp_mask1, iscan1);
+    temp_mask = _mm_max_epi16(mask0, mask1);
+    *is_found = 1;
+  }
+  *mask = _mm_max_epi16(temp_mask, *mask);
+}
+
+// Vectorized replacement for the scalar prescan: widens the 16-bit absolute
+// coefficient values to 32 bits, scales them by (1 << AOM_QM_BITS), and
+// compares against the precomputed 32-bit prescan thresholds, then records
+// the highest passing iscan position via update_mask1(). threshold[0] holds
+// the DC threshold in lane 0 (AC in the remaining lanes); threshold[1] is the
+// AC threshold broadcast — callers switch threshold[0] to threshold[1] after
+// the first group of 16.
+static INLINE void update_mask0(__m128i *qcoeff0, __m128i *qcoeff1,
+                                __m128i *threshold, const int16_t *iscan_ptr,
+                                int *is_found, __m128i *mask) {
+  __m128i zero = _mm_setzero_si128();
+  __m128i coeff[4], cmp_mask0, cmp_mask1, cmp_mask2, cmp_mask3;
+
+  // qcoeff0/qcoeff1 are absolute values, so zero-extension is correct and the
+  // signed 32-bit compare below is safe.
+  coeff[0] = _mm_unpacklo_epi16(*qcoeff0, zero);
+  coeff[1] = _mm_unpackhi_epi16(*qcoeff0, zero);
+  coeff[2] = _mm_unpacklo_epi16(*qcoeff1, zero);
+  coeff[3] = _mm_unpackhi_epi16(*qcoeff1, zero);
+
+  coeff[0] = _mm_slli_epi32(coeff[0], AOM_QM_BITS);
+  cmp_mask0 = _mm_cmpgt_epi32(coeff[0], threshold[0]);
+  coeff[1] = _mm_slli_epi32(coeff[1], AOM_QM_BITS);
+  cmp_mask1 = _mm_cmpgt_epi32(coeff[1], threshold[1]);
+  coeff[2] = _mm_slli_epi32(coeff[2], AOM_QM_BITS);
+  cmp_mask2 = _mm_cmpgt_epi32(coeff[2], threshold[1]);
+  coeff[3] = _mm_slli_epi32(coeff[3], AOM_QM_BITS);
+  cmp_mask3 = _mm_cmpgt_epi32(coeff[3], threshold[1]);
+
+  // Narrow the four 32-bit masks back to two 16-bit masks (all-ones packs to
+  // all-ones) so update_mask1() can combine them with the 16-bit iscan.
+  cmp_mask0 = _mm_packs_epi32(cmp_mask0, cmp_mask1);
+  cmp_mask1 = _mm_packs_epi32(cmp_mask2, cmp_mask3);
+
+  update_mask1(&cmp_mask0, &cmp_mask1, iscan_ptr, is_found, mask);
+}
+
+// Horizontal max over the eight 16-bit lanes of |mask| (each lane holds the
+// largest iscan position seen for that lane), plus one: the number of
+// coefficients up to and including the last one that passed the threshold.
+static INLINE int calculate_non_zero_count(__m128i mask) {
+  __m128i mask0, mask1;
+  int non_zero_count = 0;
+  mask0 = _mm_unpackhi_epi64(mask, mask);  // fold high 64 bits onto low
+  mask1 = _mm_max_epi16(mask0, mask);
+  mask0 = _mm_shuffle_epi32(mask1, 1);     // fold dword 1 onto dword 0
+  mask0 = _mm_max_epi16(mask0, mask1);
+  mask1 = _mm_srli_epi32(mask0, 16);       // fold upper 16-bit lane down
+  mask0 = _mm_max_epi16(mask0, mask1);
+  non_zero_count = _mm_extract_epi16(mask0, 0) + 1;
+
+  return non_zero_count;
+}