Optimize aom_quantize_b functions
Optimize aom_quantize_b_neon, aom_quantize_b_32x32_neon and
aom_quantize_b_64x64_neon by removing the initial memset() calls that
zero qcoeff_ptr and dqcoeff_ptr. Instead, write the zero entries inside
the processing loops, so each coefficient block is stored exactly once.
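
The restructuring amounts to the following (scalar pseudo-C sketch;
block_exceeds_zbin(), quantize_block() and store_zero_block() are
placeholder names for illustration, not functions in the tree):

  Before:
    memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
    memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
    for (i = 0; i < n_coeffs; i += 8)
      if (block_exceeds_zbin(i)) quantize_block(i);

  After:
    for (i = 0; i < n_coeffs; i += 8) {
      if (block_exceeds_zbin(i)) {
        quantize_block(i);  // also zeroes lanes below the zbin threshold
      } else {
        store_zero_block(qcoeff_ptr + i);   // write zeros only where the
        store_zero_block(dqcoeff_ptr + i);  // whole block is skipped
      }
    }

(The first block is peeled out of the loop so that lane 0 can use the
DC quantizer values.)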
In av1_quantize.c the aom_quantize_b functions are only ever called
with NULL values for qm_ptr and iqm_ptr, and the unit tests exercise
the same behaviour. When these pointers are not NULL,
aom_quantize_b_helper_c is called instead. Remove the NULL checks for
these pointers inside the quantize_b functions.
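
For reference, the callers dispatch roughly as follows (paraphrased
sketch of the av1_quantize.c logic, not the exact source), so the
aom_quantize_b* kernels never see a non-NULL quant matrix:

  if (qm_ptr != NULL && iqm_ptr != NULL) {
    aom_quantize_b_helper_c(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                            quant_ptr, quant_shift_ptr, qcoeff_ptr,
                            dqcoeff_ptr, dequant_ptr, eob_ptr, scan,
                            iscan, qm_ptr, iqm_ptr, log_scale);
  } else {
    switch (log_scale) {
      case 0:
        aom_quantize_b(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
                       quant_ptr, quant_shift_ptr, qcoeff_ptr,
                       dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan);
        break;
      case 1: /* aom_quantize_b_32x32(), same argument list */ break;
      case 2: /* aom_quantize_b_64x64(), same argument list */ break;
    }
  }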
Change-Id: I0032ac5346992fcb949ecaddf8214e3d0b62efd8
diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c
index 8d13e57..e9c0ca7 100644
--- a/av1/encoder/arm/quantize_neon.c
+++ b/av1/encoder/arm/quantize_neon.c
@@ -351,6 +351,33 @@
iscan, 2);
}
+static inline uint16x8_t quantize_b_logscale0_8(
+ int16x8_t coeff, int16x8_t abs, uint16x8_t cond, int16x8_t round,
+ int16x8_t dequant, int16x8_t quant, int16x8_t quant_shift,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ int16x8_t coeff_sign = vreinterpretq_s16_u16(vcltq_s16(coeff, zero));
+
+ int16x8_t tmp = vqaddq_s16(abs, round);
+ tmp = vsraq_n_s16(tmp, vqdmulhq_s16(tmp, quant), 1);
+ tmp = vqdmulhq_s16(tmp, quant_shift);
+
+ int16x8_t qcoeff = vsubq_s16(veorq_s16(tmp, coeff_sign), coeff_sign);
+ qcoeff = vbslq_s16(cond, qcoeff, zero);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ int16x8_t dqcoeff = vmulq_s16(tmp, dequant);
+ dqcoeff = vsubq_s16(veorq_s16(dqcoeff, coeff_sign), coeff_sign);
+ dqcoeff = vbslq_s16(cond, dqcoeff, zero);
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+
+ uint16x8_t tmp_mask = vcgtq_s16(tmp, zero);
+ uint16x8_t nz_mask = vandq_u16(tmp_mask, cond);
+
+ return nz_mask;
+}
+
void aom_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr, const int16_t *round_ptr,
const int16_t *quant_ptr,
@@ -361,88 +388,58 @@
(void)quant_shift_ptr;
(void)scan;
- const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
- memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
- memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+ int16x8_t v_zbins = vdupq_n_s16(zbin_ptr[1]);
+ int16x8_t v_round = vdupq_n_s16(round_ptr[1]);
+ int16x8_t v_dequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t v_quant = vdupq_n_s16(quant_ptr[1]);
+ // Shift by 1 in order to save one shift in the kernel function.
+ int16x8_t v_quant_shift = vdupq_n_s16(quant_shift_ptr[1] >> 1);
- const int16x8_t zero = vdupq_n_s16(0);
- int16x8_t v_eobmax_76543210 = vreinterpretq_s16_u16(vceqq_s16(zero, zero));
-
- int16x8_t vzbins = vdupq_n_s16(zbins[1]), vround = vdupq_n_s16(round_ptr[1]);
- int16x8_t vdequant = vdupq_n_s16(dequant_ptr[1]);
- int16x8_t vquant = vdupq_n_s16(quant_ptr[1]);
- int16x8_t vquant_shift = vdupq_n_s16(quant_shift_ptr[1]);
-
- int16x8_t v_coeff = load_tran_low_to_s16q(&coeff_ptr[0]);
- int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ int16x8_t v_zbins0 = vsetq_lane_s16(zbin_ptr[0], v_zbins, 0);
+ int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
int16x8_t v_abs = vabsq_s16(v_coeff);
+ uint16x8_t v_cond = vcgeq_s16(v_abs, v_zbins0);
- vzbins = vsetq_lane_s16(zbins[0], vzbins, 0);
-
- uint16x8_t vcond = vcgeq_s16(v_abs, vzbins);
- uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(v_cond)), 0);
if (nz_check) {
- vround = vsetq_lane_s16(round_ptr[0], vround, 0);
- vquant = vsetq_lane_s16(quant_ptr[0], vquant, 0);
- vdequant = vsetq_lane_s16(dequant_ptr[0], vdequant, 0);
- vquant_shift = vsetq_lane_s16(quant_shift_ptr[0], vquant_shift, 0);
+ int16x8_t v_round0 = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ int16x8_t v_quant0 = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ int16x8_t v_dequant0 = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ // Shift by 1 in order to save one shift in the kernel function.
+ int16x8_t v_quant_shift0 =
+ vsetq_lane_s16(quant_shift_ptr[0] >> 1, v_quant_shift, 0);
- int16x8_t vtmp = vqaddq_s16(v_abs, vround);
- int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
- vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
+ const uint16x8_t v_nz_mask = quantize_b_logscale0_8(
+ v_coeff, v_abs, v_cond, v_round0, v_dequant0, v_quant0, v_quant_shift0,
+ qcoeff_ptr, dqcoeff_ptr);
- int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
- int16x8_t coeff_nz_mask =
- vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[0]));
- store_s16q_to_tran_low(&qcoeff_ptr[0], coeff_nz_mask);
- int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
-
- vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
- coeff_nz_mask =
- vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[0]));
- store_s16q_to_tran_low(&dqcoeff_ptr[0], coeff_nz_mask);
-
- vround = vsetq_lane_s16(round_ptr[1], vround, 0);
- vquant = vsetq_lane_s16(quant_ptr[1], vquant, 0);
- vdequant = vsetq_lane_s16(dequant_ptr[1], vdequant, 0);
- vquant_shift = vsetq_lane_s16(quant_shift_ptr[1], vquant_shift, 0);
-
- uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
- const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
- int16x8_t v_iscan = vld1q_s16(&iscan[0]);
- vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
- v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ int16x8_t v_iscan = vld1q_s16(iscan);
+ int16x8_t v_eobmax = vmaxq_s16(v_iscan, v_eobmax_76543210);
+ v_eobmax_76543210 = vbslq_s16(v_nz_mask, v_eobmax, v_eobmax_76543210);
+ } else {
+ store_s16q_to_tran_low(qcoeff_ptr, vdupq_n_s16(0));
+ store_s16q_to_tran_low(dqcoeff_ptr, vdupq_n_s16(0));
}
- vzbins = vsetq_lane_s16(zbins[1], vzbins, 0);
for (int i = 8; i < n_coeffs; i += 8) {
- v_coeff = load_tran_low_to_s16q(&coeff_ptr[i]);
- v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
v_abs = vabsq_s16(v_coeff);
- vcond = vcgeq_s16(v_abs, vzbins);
+ v_cond = vcgeq_s16(v_abs, v_zbins);
- nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(vcond)), 0);
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(v_cond)), 0);
if (nz_check) {
- int16x8_t vtmp = vqaddq_s16(v_abs, vround);
- int16x8_t vtmp2 = vsraq_n_s16(vtmp, vqdmulhq_s16(vtmp, vquant), 1);
+ const uint16x8_t v_nz_mask = quantize_b_logscale0_8(
+ v_coeff, v_abs, v_cond, v_round, v_dequant, v_quant, v_quant_shift,
+ qcoeff_ptr + i, dqcoeff_ptr + i);
- vtmp2 = vshrq_n_s16(vqdmulhq_s16(vtmp2, vquant_shift), 1);
- int16x8_t vdest = vsubq_s16(veorq_s16(vtmp2, v_coeff_sign), v_coeff_sign);
- int16x8_t coeff_nz_mask =
- vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&qcoeff_ptr[i]));
- store_s16q_to_tran_low(&qcoeff_ptr[i], coeff_nz_mask);
- int16x8_t v_deq_abs = vmulq_s16(vtmp2, vdequant);
- vdest = vsubq_s16(veorq_s16(v_deq_abs, v_coeff_sign), v_coeff_sign);
- coeff_nz_mask =
- vbslq_s16(vcond, vdest, load_tran_low_to_s16q(&dqcoeff_ptr[i]));
- store_s16q_to_tran_low(&dqcoeff_ptr[i], coeff_nz_mask);
-
- uint16x8_t vtmp_mask = vcgtq_s16(vtmp2, zero);
- const uint16x8_t v_nz_mask = vandq_u16(vtmp_mask, vcond);
- int16x8_t v_iscan = vld1q_s16(&iscan[i]);
- vcond = vandq_u16(v_nz_mask, vcgtq_s16(v_iscan, v_eobmax_76543210));
- v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
+ int16x8_t v_iscan = vld1q_s16(iscan + i);
+ int16x8_t v_eobmax = vmaxq_s16(v_iscan, v_eobmax_76543210);
+ v_eobmax_76543210 = vbslq_s16(v_nz_mask, v_eobmax, v_eobmax_76543210);
+ } else {
+ store_s16q_to_tran_low(qcoeff_ptr + i, vdupq_n_s16(0));
+ store_s16q_to_tran_low(dqcoeff_ptr + i, vdupq_n_s16(0));
}
}
*eob_ptr = get_max_eob(v_eobmax_76543210);
@@ -899,6 +896,35 @@
}
}
+static inline uint16x8_t quantize_b_logscale1_8(
+ int16x8_t coeff, int16x8_t abs, uint16x8_t cond, int16x8_t round,
+ int16x8_t dequant, int16x8_t quant, int16x8_t quant_shift,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int16x8_t zero = vdupq_n_s16(0);
+
+ int16x8_t coeff_sign = vreinterpretq_s16_u16(vcltq_s16(coeff, zero));
+
+ int16x8_t tmp = vqaddq_s16(abs, round);
+ tmp = vsraq_n_s16(tmp, vqdmulhq_s16(tmp, quant), 1);
+ tmp = vqdmulhq_s16(tmp, quant_shift);
+
+ int16x8_t qcoeff = vsubq_s16(veorq_s16(tmp, coeff_sign), coeff_sign);
+ qcoeff = vbslq_s16(cond, qcoeff, zero);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ // Shift by log_scale = 1.
+ int16x8_t dqcoeff = vreinterpretq_s16_u16(vhaddq_u16(
+ vreinterpretq_u16_s16(vmulq_s16(tmp, dequant)), vdupq_n_u16(0)));
+ dqcoeff = vsubq_s16(veorq_s16(dqcoeff, coeff_sign), coeff_sign);
+ dqcoeff = vbslq_s16(cond, dqcoeff, zero);
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+
+ uint16x8_t tmp_mask = vcgtq_s16(tmp, zero);
+ const uint16x8_t nz_mask = vandq_u16(tmp_mask, cond);
+
+ return nz_mask;
+}
+
void aom_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
const int16_t *zbin_ptr,
const int16_t *round_ptr,
@@ -907,10 +933,100 @@
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr,
- dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
- NULL, NULL, 1);
+ (void)scan;
+
+ const int log_scale = 1;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int rounds[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+
+ int16x8_t v_zbins = vdupq_n_s16(zbins[1]);
+ int16x8_t v_round = vdupq_n_s16(rounds[1]);
+ int16x8_t v_dequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t v_quant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t v_quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_zbins0 = vsetq_lane_s16(zbins[0], v_zbins, 0);
+ int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ uint16x8_t v_cond = vcgeq_s16(v_abs, v_zbins0);
+
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(v_cond)), 0);
+ if (nz_check) {
+ int16x8_t v_round0 = vsetq_lane_s16(rounds[0], v_round, 0);
+ int16x8_t v_quant0 = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ int16x8_t v_dequant0 = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ int16x8_t v_quant_shift0 =
+ vsetq_lane_s16(quant_shift_ptr[0], v_quant_shift, 0);
+
+ const uint16x8_t v_nz_mask = quantize_b_logscale1_8(
+ v_coeff, v_abs, v_cond, v_round0, v_dequant0, v_quant0, v_quant_shift0,
+ qcoeff_ptr, dqcoeff_ptr);
+
+ int16x8_t v_iscan = vld1q_s16(iscan);
+ int16x8_t v_eobmax = vmaxq_s16(v_iscan, v_eobmax_76543210);
+ v_eobmax_76543210 = vbslq_s16(v_nz_mask, v_eobmax, v_eobmax_76543210);
+ } else {
+ store_s16q_to_tran_low(qcoeff_ptr, vdupq_n_s16(0));
+ store_s16q_to_tran_low(dqcoeff_ptr, vdupq_n_s16(0));
+ }
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
+ v_abs = vabsq_s16(v_coeff);
+ v_cond = vcgeq_s16(v_abs, v_zbins);
+
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(v_cond)), 0);
+ if (nz_check) {
+ const uint16x8_t v_nz_mask = quantize_b_logscale1_8(
+ v_coeff, v_abs, v_cond, v_round, v_dequant, v_quant, v_quant_shift,
+ qcoeff_ptr + i, dqcoeff_ptr + i);
+
+ int16x8_t v_iscan = vld1q_s16(iscan + i);
+ int16x8_t v_eobmax = vmaxq_s16(v_iscan, v_eobmax_76543210);
+ v_eobmax_76543210 = vbslq_s16(v_nz_mask, v_eobmax, v_eobmax_76543210);
+ } else {
+ store_s16q_to_tran_low(qcoeff_ptr + i, vdupq_n_s16(0));
+ store_s16q_to_tran_low(dqcoeff_ptr + i, vdupq_n_s16(0));
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
+}
+
+static inline uint16x8_t quantize_b_logscale2_8(
+ int16x8_t coeff, int16x8_t abs, uint16x8_t cond, int16x8_t round,
+ int16x8_t dequant, int16x8_t quant, int16x8_t quant_shift,
+ tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr) {
+ const int16x8_t zero = vdupq_n_s16(0);
+ const int16x8_t one = vdupq_n_s16(1);
+
+ int16x8_t coeff_sign = vreinterpretq_s16_u16(vcltq_s16(coeff, zero));
+
+ int16x8_t tmp = vqaddq_s16(abs, round);
+ tmp = vsraq_n_s16(tmp, vqdmulhq_s16(tmp, quant), 1);
+ int16x8_t ones = vandq_s16(vshrq_n_s16(vmulq_s16(tmp, quant_shift), 14), one);
+ tmp = vqdmulhq_s16(tmp, quant_shift);
+ tmp = vaddq_s16(vshlq_s16(tmp, one), ones);
+
+ int16x8_t qcoeff = vsubq_s16(veorq_s16(tmp, coeff_sign), coeff_sign);
+ qcoeff = vbslq_s16(cond, qcoeff, zero);
+ store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+ // Shift right by log_scale = 2.
+ int16x8_t dqcoeff = vreinterpretq_s16_u16(
+ vshrq_n_u16(vreinterpretq_u16_s16(vmulq_s16(tmp, dequant)), 2));
+ dqcoeff = vorrq_s16(vshlq_n_s16(vqdmulhq_s16(tmp, dequant), 13), dqcoeff);
+ dqcoeff = vsubq_s16(veorq_s16(dqcoeff, coeff_sign), coeff_sign);
+ dqcoeff = vbslq_s16(cond, dqcoeff, zero);
+ store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff);
+
+ uint16x8_t tmp_mask = vcgtq_s16(tmp, zero);
+ const uint16x8_t nz_mask = vandq_u16(tmp_mask, cond);
+
+ return nz_mask;
}
void aom_quantize_b_64x64_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -921,8 +1037,65 @@
tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
const int16_t *dequant_ptr, uint16_t *eob_ptr,
const int16_t *scan, const int16_t *iscan) {
- aom_quantize_b_helper_neon(coeff_ptr, n_coeffs, zbin_ptr, round_ptr,
- quant_ptr, quant_shift_ptr, qcoeff_ptr,
- dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan,
- NULL, NULL, 2);
+ (void)scan;
+
+ const int log_scale = 2;
+ const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale) };
+ const int rounds[2] = { ROUND_POWER_OF_TWO(round_ptr[0], log_scale),
+ ROUND_POWER_OF_TWO(round_ptr[1], log_scale) };
+
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+
+ int16x8_t v_zbins = vdupq_n_s16(zbins[1]);
+ int16x8_t v_round = vdupq_n_s16(rounds[1]);
+ int16x8_t v_dequant = vdupq_n_s16(dequant_ptr[1]);
+ int16x8_t v_quant = vdupq_n_s16(quant_ptr[1]);
+ int16x8_t v_quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
+
+ int16x8_t v_zbins0 = vsetq_lane_s16(zbins[0], v_zbins, 0);
+ int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr);
+ int16x8_t v_abs = vabsq_s16(v_coeff);
+ uint16x8_t v_cond = vcgeq_s16(v_abs, v_zbins0);
+
+ uint64_t nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(v_cond)), 0);
+ if (nz_check) {
+ int16x8_t v_round0 = vsetq_lane_s16(rounds[0], v_round, 0);
+ int16x8_t v_quant0 = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ int16x8_t v_dequant0 = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+ int16x8_t v_quant_shift0 =
+ vsetq_lane_s16(quant_shift_ptr[0], v_quant_shift, 0);
+
+ const uint16x8_t v_nz_mask = quantize_b_logscale2_8(
+ v_coeff, v_abs, v_cond, v_round0, v_dequant0, v_quant0, v_quant_shift0,
+ qcoeff_ptr, dqcoeff_ptr);
+
+ int16x8_t v_iscan = vld1q_s16(iscan);
+ int16x8_t v_eobmax = vmaxq_s16(v_iscan, v_eobmax_76543210);
+ v_eobmax_76543210 = vbslq_s16(v_nz_mask, v_eobmax, v_eobmax_76543210);
+ } else {
+ store_s16q_to_tran_low(qcoeff_ptr, vdupq_n_s16(0));
+ store_s16q_to_tran_low(dqcoeff_ptr, vdupq_n_s16(0));
+ }
+
+ for (int i = 8; i < n_coeffs; i += 8) {
+ v_coeff = load_tran_low_to_s16q(coeff_ptr + i);
+ v_abs = vabsq_s16(v_coeff);
+ v_cond = vcgeq_s16(v_abs, v_zbins);
+
+ nz_check = vget_lane_u64(vreinterpret_u64_u8(vmovn_u16(v_cond)), 0);
+ if (nz_check) {
+ const uint16x8_t v_nz_mask = quantize_b_logscale2_8(
+ v_coeff, v_abs, v_cond, v_round, v_dequant, v_quant, v_quant_shift,
+ qcoeff_ptr + i, dqcoeff_ptr + i);
+
+ int16x8_t v_iscan = vld1q_s16(iscan + i);
+ int16x8_t v_eobmax = vmaxq_s16(v_iscan, v_eobmax_76543210);
+ v_eobmax_76543210 = vbslq_s16(v_nz_mask, v_eobmax, v_eobmax_76543210);
+ } else {
+ store_s16q_to_tran_low(qcoeff_ptr + i, vdupq_n_s16(0));
+ store_s16q_to_tran_low(dqcoeff_ptr + i, vdupq_n_s16(0));
+ }
+ }
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
}