Optimize quantize_fp_neon function for LBD and HBD
Optimize the EOB calculation by removing the iscan adjustment from
get_max_lane_eob(). The adjustment moves to get_max_eob(), which now
adds 1 to the maximum value.
Use -1 instead of 0 when comparing against iscan values, to
distinguish the case where a non-zero element is at position 0 from
the case where no non-zero element exists.
This is a port from SVT-AV1:
gitlab.com/AOMediaCodec/SVT-AV1/-/commit/f86adc26b60e5796f560ac1971962267b3a63ed4
Change-Id: I500eb61d508284b439f1efceac42595bff5fb520
diff --git a/av1/encoder/arm/av1_highbd_quantize_neon.c b/av1/encoder/arm/av1_highbd_quantize_neon.c
index c1016db..fd4749d 100644
--- a/av1/encoder/arm/av1_highbd_quantize_neon.c
+++ b/av1/encoder/arm/av1_highbd_quantize_neon.c
@@ -60,15 +60,15 @@
static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
int16x8_t v_eobmax,
uint16x8_t v_mask) {
- const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
- const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ const int16x8_t v_iscan = vld1q_s16(iscan);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
return vmaxq_s16(v_eobmax, v_nz_iscan);
}
static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
- return (uint16_t)vmaxvq_s16(v_eobmax);
+ int16_t max_val = vmaxvq_s16(v_eobmax);
+ return (uint16_t)max_val + 1;
#else
const int16x4_t v_eobmax_3210 =
vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
@@ -80,7 +80,7 @@
vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
const int16x4_t v_eobmax_final =
vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
- return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
#endif
}
diff --git a/av1/encoder/arm/quantize_neon.c b/av1/encoder/arm/quantize_neon.c
index cbeafc7..8d13e57 100644
--- a/av1/encoder/arm/quantize_neon.c
+++ b/av1/encoder/arm/quantize_neon.c
@@ -29,7 +29,8 @@
static inline uint16_t get_max_eob(int16x8_t v_eobmax) {
#if AOM_ARCH_AARCH64
- return (uint16_t)vmaxvq_s16(v_eobmax);
+ int16_t max_val = vmaxvq_s16(v_eobmax);
+ return (uint16_t)max_val + 1;
#else
const int16x4_t v_eobmax_3210 =
vmax_s16(vget_low_s16(v_eobmax), vget_high_s16(v_eobmax));
@@ -41,16 +42,15 @@
vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
const int16x4_t v_eobmax_final =
vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
- return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ return (uint16_t)vget_lane_s16(v_eobmax_final, 0) + 1;
#endif
}
static inline int16x8_t get_max_lane_eob(const int16_t *iscan,
int16x8_t v_eobmax,
uint16x8_t v_mask) {
- const int16x8_t v_iscan = vld1q_s16(&iscan[0]);
- const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, vdupq_n_s16(1));
- const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan_plus1, vdupq_n_s16(0));
+ const int16x8_t v_iscan = vld1q_s16(iscan);
+ const int16x8_t v_nz_iscan = vbslq_s16(v_mask, v_iscan, vdupq_n_s16(-1));
return vmaxq_s16(v_eobmax, v_nz_iscan);
}
@@ -445,7 +445,7 @@
v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
}
}
- *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
}
#define QM_MULL_SHIFT(x0, x1) \
@@ -580,7 +580,7 @@
v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
}
}
- *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
}
static void aom_quantize_b_helper_32x32_neon(
@@ -718,7 +718,7 @@
v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
}
}
- *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
}
static void aom_quantize_b_helper_64x64_neon(
@@ -867,7 +867,7 @@
v_eobmax_76543210 = vbslq_s16(vcond, v_iscan, v_eobmax_76543210);
}
}
- *eob_ptr = get_max_eob(v_eobmax_76543210) + 1;
+ *eob_ptr = get_max_eob(v_eobmax_76543210);
}
void aom_quantize_b_helper_neon(