Specialize Neon masked sub-pixel variance by filter value Use the same specialization as for the other sub-pixel variance functions. The rationale for the specialization is as follows: The optimal implementation of the bilinear interpolation depends on the filter values being used. For both horizontal and vertical interpolation this can simplify to just taking the source values, or averaging the source and reference values - which can be computed more easily than a bilinear interpolation with arbitrary filter values. This patch introduces tests to find the most optimal bilinear interpolation implementation based on the filter values being used. This new specialization is only used for larger block sizes. Change-Id: I1c59e31bee9a14059baa9938cc0dc9bc91e9bef4

commit: 92b5cd32ac27098b2d51c4ac506026d1cabbeeef [log] [tgz]
author: Salome Thirot <salome.thirot@arm.com> Fri Apr 28 18:02:45 2023 +0100
committer: James Zern <jzern@google.com> Thu May 04 02:24:23 2023 +0000
tree: ab224708bee244775e81d64ba8a93ad70c8d748e
parent: 6d461a343105ac04fce4f25fb5285a1013b29359 [diff]
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 94f755a..9599ae0 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c

@@ -668,6 +668,89 @@
     return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);        \
   }
 
+#define SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)             \
+  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
+      const uint8_t *msk, int msk_stride, int invert_mask,                     \
+      unsigned int *sse) {                                                     \
+    if (xoffset == 0) {                                                        \
+      uint8_t tmp0[w * h];                                                     \
+      if (yoffset == 0) {                                                      \
+        aom_comp_mask_pred_neon(tmp0, second_pred, w, h, src, src_stride, msk, \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp0, w, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_avg(src, tmp0, src_stride, src_stride, w, h);       \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+      } else {                                                                 \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, src_stride, h,      \
+                                    yoffset);                                  \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+      }                                                                        \
+    } else if (xoffset == 4) {                                                 \
+      uint8_t tmp0[w * (h + padding)];                                         \
+      if (yoffset == 0) {                                                      \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, h);                \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp1[w * h];                                                   \
+        uint8_t tmp2[w * h];                                                   \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));    \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+      } else {                                                                 \
+        uint8_t tmp1[w * h];                                                   \
+        uint8_t tmp2[w * h];                                                   \
+        var_filter_block2d_avg(src, tmp0, src_stride, 1, w, (h + padding));    \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+      }                                                                        \
+    } else {                                                                   \
+      if (yoffset == 0) {                                                      \
+        uint8_t tmp0[w * h];                                                   \
+        uint8_t tmp1[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, h, xoffset);     \
+        aom_comp_mask_pred_neon(tmp1, second_pred, w, h, tmp0, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp1, w, ref, ref_stride, sse);    \
+      } else if (yoffset == 4) {                                               \
+        uint8_t tmp0[w * (h + padding)];                                       \
+        uint8_t tmp1[w * h];                                                   \
+        uint8_t tmp2[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),   \
+                                    xoffset);                                  \
+        var_filter_block2d_avg(tmp0, tmp1, w, w, w, h);                        \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+      } else {                                                                 \
+        uint8_t tmp0[w * (h + padding)];                                       \
+        uint8_t tmp1[w * (h + padding)];                                       \
+        uint8_t tmp2[w * h];                                                   \
+        var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),   \
+                                    xoffset);                                  \
+        var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);             \
+        aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk,         \
+                                msk_stride, invert_mask);                      \
+        return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);    \
+      }                                                                        \
+    }                                                                          \
+  }
+
 MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
 MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
 
@@ -676,28 +759,29 @@
 MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
 
 MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
 
-MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
 
-MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
 
-MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
 
 // Realtime mode doesn't use 4x rectangular blocks.
 #if !CONFIG_REALTIME_ONLY
 MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
 MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
 MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
-MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
 #endif  // !CONFIG_REALTIME_ONLY
 
 #undef MASKED_SUBPEL_VARIANCE_WXH_NEON
+#undef SPECIALIZED_MASKED_SUBPEL_VARIANCE_WXH_NEON
commit	92b5cd32ac27098b2d51c4ac506026d1cabbeeef	[log] [tgz]
author	Salome Thirot <salome.thirot@arm.com>	Fri Apr 28 18:02:45 2023 +0100
committer	James Zern <jzern@google.com>	Thu May 04 02:24:23 2023 +0000
tree	ab224708bee244775e81d64ba8a93ad70c8d748e
parent	6d461a343105ac04fce4f25fb5285a1013b29359 [diff]