Optimize Neon implementation of aom_int_pro_row

Unroll to operate on 4 rows per iteration. Also increase the number
of accumulators from 2 to 4, breaking the loop-carried dependency
chain that previously bottlenecked the accumulation (see the sketch
below).
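
For illustration only (not part of the patch): a minimal sketch of the
accumulator-splitting idea. The function name, the two-way split and the
even-height assumption are hypothetical; the patch itself uses four
accumulators over 16-byte rows.

    #include <arm_neon.h>

    // With a single accumulator every add waits on the previous one.
    // Two independent accumulators let the adds for even and odd rows
    // issue in parallel; the partial sums are merged once at the end.
    // Assumes height is even and sums the first 8 columns only.
    static uint16x8_t sum_rows_sketch(const uint8_t *ref, int ref_stride,
                                      int height) {
      uint16x8_t acc0 = vdupq_n_u16(0);
      uint16x8_t acc1 = vdupq_n_u16(0);
      for (int h = 0; h < height; h += 2) {
        acc0 = vaddw_u8(acc0, vld1_u8(ref + (h + 0) * ref_stride));
        acc1 = vaddw_u8(acc1, vld1_u8(ref + (h + 1) * ref_stride));
      }
      return vaddq_u16(acc0, acc1);
    }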

Change-Id: I4149f7f026ac15d5eee676b7e1b65ea873716d76
diff --git a/aom_dsp/arm/avg_neon.c b/aom_dsp/arm/avg_neon.c
index dadf373..2da64b4 100644
--- a/aom_dsp/arm/avg_neon.c
+++ b/aom_dsp/arm/avg_neon.c
@@ -93,29 +93,58 @@
 void aom_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref,
                           const int ref_stride, const int width,
                           const int height, int norm_factor) {
-  const uint8_t *idx = ref;
-  const uint16x8_t zero = vdupq_n_u16(0);
-  const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+  assert(width % 16 == 0);
+  assert(height % 4 == 0);
 
-  for (int wd = 0; wd < width; wd += 16) {
-    uint16x8_t vec0 = zero;
-    uint16x8_t vec1 = zero;
-    idx = ref + wd;
-    for (int ht = 0; ht < height; ++ht) {
-      const uint8x16_t tmp = vld1q_u8(idx);
-      idx += ref_stride;
-      vec0 = vaddw_u8(vec0, vget_low_u8(tmp));
-      vec1 = vaddw_u8(vec1, vget_high_u8(tmp));
+  const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+  uint16x8_t sum_lo[2], sum_hi[2];
+
+  int w = 0;
+  do {
+    const uint8_t *r = ref + w;
+    uint8x16_t r0 = vld1q_u8(r + 0 * ref_stride);
+    uint8x16_t r1 = vld1q_u8(r + 1 * ref_stride);
+    uint8x16_t r2 = vld1q_u8(r + 2 * ref_stride);
+    uint8x16_t r3 = vld1q_u8(r + 3 * ref_stride);
+
+    sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+    sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+    sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+    sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+    r += 4 * ref_stride;
+
+    for (int h = height - 4; h != 0; h -= 4) {
+      r0 = vld1q_u8(r + 0 * ref_stride);
+      r1 = vld1q_u8(r + 1 * ref_stride);
+      r2 = vld1q_u8(r + 2 * ref_stride);
+      r3 = vld1q_u8(r + 3 * ref_stride);
+
+      uint16x8_t tmp0_lo = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+      uint16x8_t tmp0_hi = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+      uint16x8_t tmp1_lo = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+      uint16x8_t tmp1_hi = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+      sum_lo[0] = vaddq_u16(sum_lo[0], tmp0_lo);
+      sum_hi[0] = vaddq_u16(sum_hi[0], tmp0_hi);
+      sum_lo[1] = vaddq_u16(sum_lo[1], tmp1_lo);
+      sum_hi[1] = vaddq_u16(sum_hi[1], tmp1_hi);
+
+      r += 4 * ref_stride;
     }
 
-    const int16x8_t result0 =
-        vshlq_s16(vreinterpretq_s16_u16(vec0), neg_norm_factor);
-    const int16x8_t result1 =
-        vshlq_s16(vreinterpretq_s16_u16(vec1), neg_norm_factor);
+    sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+    sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
 
-    vst1q_s16(hbuf + wd, result0);
-    vst1q_s16(hbuf + wd + 8, result1);
-  }
+    const int16x8_t avg0 =
+        vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+    const int16x8_t avg1 =
+        vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+    vst1q_s16(hbuf + w, avg0);
+    vst1q_s16(hbuf + w + 8, avg1);
+    w += 16;
+  } while (w < width);
 }
 
 void aom_int_pro_col_neon(int16_t *vbuf, const uint8_t *ref,