Optimize Neon implementation of aom_int_pro_row
Unroll to operate on 4 rows per iteration. Also increase the number
of accumulators from 2 to 4, removing a bottleneck.
Change-Id: I4149f7f026ac15d5eee676b7e1b65ea873716d76
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index dadf373..2da64b4 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -93,29 +93,58 @@
void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
const int ref_stride, const int width,
const int height, int norm_factor) {
- const uint8_t *idx = ref;
- const uint16x8_t zero = vdupq_n_u16(0);
- const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+ assert(width % 16 == 0);
+ assert(height % 4 == 0);
- for (int wd = 0; wd < width; wd += 16) {
- uint16x8_t vec0 = zero;
- uint16x8_t vec1 = zero;
- idx = ref + wd;
- for (int ht = 0; ht < height; ++ht) {
- const uint8x16_t tmp = vld1q_u8(idx);
- idx += ref_stride;
- vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
- vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+ const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+ uint16x8_t sum_lo[2], sum_hi[2];
+
+ int w = 0;
+ do {
+ const uint8_t *r = ref + w;
+ uint8x16_t r0 = vld1q_u8(r + 0 * ref_stride);
+ uint8x16_t r1 = vld1q_u8(r + 1 * ref_stride);
+ uint8x16_t r2 = vld1q_u8(r + 2 * ref_stride);
+ uint8x16_t r3 = vld1q_u8(r + 3 * ref_stride);
+
+ sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ r += 4 * ref_stride;
+
+ for (int h = height - 4; h != 0; h -= 4) {
+ r0 = vld1q_u8(r + 0 * ref_stride);
+ r1 = vld1q_u8(r + 1 * ref_stride);
+ r2 = vld1q_u8(r + 2 * ref_stride);
+ r3 = vld1q_u8(r + 3 * ref_stride);
+
+ uint16x8_t tmp0_lo = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ uint16x8_t tmp0_hi = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ uint16x8_t tmp1_lo = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ uint16x8_t tmp1_hi = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], tmp0_lo);
+ sum_hi[0] = vaddq_u16(sum_hi[0], tmp0_hi);
+ sum_lo[1] = vaddq_u16(sum_lo[1], tmp1_lo);
+ sum_hi[1] = vaddq_u16(sum_hi[1], tmp1_hi);
+
+ r += 4 * ref_stride;
}
- const int16x8_t result0 =
- vshlq_s16(vreinterpretq_s16_u16(vec0), neg_norm_factor);
- const int16x8_t result1 =
- vshlq_s16(vreinterpretq_s16_u16(vec1), neg_norm_factor);
+ sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+ sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
- vst1q_s16(hbuf + wd, result0);
- vst1q_s16(hbuf + wd + 8, result1);
- }
+ const int16x8_t avg0 =
+ vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+ const int16x8_t avg1 =
+ vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+ vst1q_s16(hbuf + w, avg0);
+ vst1q_s16(hbuf + w + 8, avg1);
+ w += 16;
+ } while (w < width);
}
void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,