Specialize Neon subpel variance by filter value for large blocks

The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or to averaging two neighbouring source values - both of which can be computed more cheaply than a bilinear interpolation with arbitrary filter values.

This patch introduces checks on the filter offsets to select the optimal bilinear interpolation implementation for the filter values being used. The new specialization is only used for larger block sizes (>= 16x16), as enough work must be done per block to make the cost of selecting the optimal implementation worthwhile.

Change-Id: Ie3dd99093fde19ee9abf3233e1c828c97510b464

commit: 2ce72a4331b205018974e6b6d8747a9a87374266 [log] [tgz]
author: Jonathan Wright <jonathan.wright@arm.com> Tue Jul 19 18:35:38 2022 +0100
committer: James Zern <jzern@google.com> Thu Sep 08 02:34:02 2022 +0000
tree: d1f5d79aa5ee66ff82e81ea00f7895b81f7c19fe
parent: ce3b001c886dc788bad36cb95ddcef4556bb62c5 [diff]
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 6f66be1..4615038 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c

@@ -121,6 +121,30 @@
                                dst_height, filter_offset);
 }
 
+// Rounding average of each pixel with the one pixel_step bytes after it.
+static void var_filter_block2d_avg(const uint8_t *src_ptr, uint8_t *dst_ptr,
+                                   int src_stride, int pixel_step,
+                                   int dst_width, int dst_height) {
+  // Rows are processed as whole 16-byte vectors.
+  assert(dst_width >= 16 && dst_width % 16 == 0);
+
+  const uint8_t *src_row = src_ptr;
+  uint8_t *dst_row = dst_ptr;
+  int row = 0;
+  do {
+    int col = 0;
+    do {
+      const uint8x16_t a = vld1q_u8(src_row + col);
+      const uint8x16_t b = vld1q_u8(src_row + col + pixel_step);
+      vst1q_u8(dst_row + col, vrhaddq_u8(a, b));
+      col += 16;
+    } while (col < dst_width);
+    // The output is packed: its row stride equals dst_width.
+    src_row += src_stride;
+    dst_row += dst_width;
+  } while (++row < dst_height);
+}
+
 #define SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                          \
   unsigned int aom_sub_pixel_variance##w##x##h##_neon(                   \
       const uint8_t *src, int src_stride, int xoffset, int yoffset,      \
@@ -133,6 +157,56 @@
     return aom_variance##w##x##h(tmp1, w, ref, ref_stride, sse);         \
   }
 
+// Sub-pixel variance specialized on the filter offsets: 0 skips filtering, 4
+// uses a rounding average and other values use the bilinear filter. The extra
+// `padding` rows are only filtered when a vertical pass will consume them.
+#define SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
+  unsigned int aom_sub_pixel_variance##w##x##h##_neon(                        \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,           \
+      const uint8_t *ref, int ref_stride, unsigned int *sse) {                \
+    const int rows = (yoffset == 0) ? h : (h + padding);                      \
+    if (xoffset == 0) {                                                       \
+      uint8_t tmp[w * h];                                                     \
+      if (yoffset == 0) {                                                     \
+        return aom_variance##w##x##h##_neon(src, src_stride, ref, ref_stride, \
+                                            sse);                             \
+      } else if (yoffset == 4) {                                              \
+        var_filter_block2d_avg(src, tmp, src_stride, src_stride, w, h);       \
+        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);    \
+      } else {                                                                \
+        var_filter_block2d_bil_w##w(src, tmp, src_stride, src_stride, h,      \
+                                    yoffset);                                 \
+        return aom_variance##w##x##h##_neon(tmp, w, ref, ref_stride, sse);    \
+      }                                                                       \
+    } else if (xoffset == 4) {                                                \
+      uint8_t tmp0[w * (h + padding)];                                        \
+      uint8_t tmp1[w * h];                                                    \
+      var_filter_block2d_avg(src, tmp0, src_stride, 1, w, rows);              \
+      if (yoffset == 0) {                                                     \
+        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);   \
+      } else if (yoffset == 4) {                                              \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      } else {                                                                \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      }                                                                       \
+    } else {                                                                  \
+      uint8_t tmp0[w * (h + padding)];                                        \
+      uint8_t tmp1[w * h];                                                    \
+      var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, rows, xoffset);   \
+      if (yoffset == 0) {                                                     \
+        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);   \
+      } else if (yoffset == 4) {                                              \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                       \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      } else {                                                                \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);            \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);   \
+      }                                                                       \
+    }                                                                         \
+  }
+
 SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
 SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
 
@@ -141,19 +215,19 @@
 SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
 
 SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
-SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
-SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
-SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
-SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
-SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
-SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
-SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
 
 // Realtime mode doesn't use 4x rectangular blocks.
 #if !CONFIG_REALTIME_ONLY
@@ -163,12 +237,13 @@
 SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
 
 SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
-SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
 
-SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
 
 #endif  // !CONFIG_REALTIME_ONLY
 
 #undef SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_SUBPEL_VARIANCE_WXH_NEON
commit	2ce72a4331b205018974e6b6d8747a9a87374266	[log] [tgz]
author	Jonathan Wright <jonathan.wright@arm.com>	Tue Jul 19 18:35:38 2022 +0100
committer	James Zern <jzern@google.com>	Thu Sep 08 02:34:02 2022 +0000
tree	d1f5d79aa5ee66ff82e81ea00f7895b81f7c19fe
parent	ce3b001c886dc788bad36cb95ddcef4556bb62c5 [diff]