Add Neon implementations of OBMC sub pixel variance functions

Add a Neon implementation of aom_obmc_sub_pixel_variance<w>x<h> for
all block sizes - where previously only scalar C implementations
existed.

Also add test coverage for the new Neon functions.

Change-Id: Iba428b1edadcb0ac3b4b2549267c4b47259c2158
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index dae52e6..e555e1e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1542,7 +1542,7 @@
       add_proto qw/unsigned int/, "aom_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
       add_proto qw/unsigned int/, "aom_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
       specialize "aom_obmc_variance${w}x${h}", qw/sse4_1 avx2 neon/;
-      specialize "aom_obmc_sub_pixel_variance${w}x${h}", q/sse4_1/;
+      specialize "aom_obmc_sub_pixel_variance${w}x${h}", qw/sse4_1 neon/;
     }
 
     if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index a058860..7946439 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -549,3 +549,48 @@
 
 #undef SUBPEL_AVG_VARIANCE_WXH_NEON
 #undef SPECIALIZED_SUBPEL_AVG_VARIANCE_WXH_NEON
+
+#if !CONFIG_REALTIME_ONLY
+
+#define OBMC_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                   \
+  unsigned int aom_obmc_sub_pixel_variance##w##x##h##_neon(            \
+      const uint8_t *pre, int pre_stride, int xoffset, int yoffset,    \
+      const int32_t *wsrc, const int32_t *mask, unsigned int *sse) {   \
+    uint8_t tmp0[w * (h + padding)];                                   \
+    uint8_t tmp1[w * h];                                               \
+    var_filter_block2d_bil_w##w(pre, tmp0, pre_stride, 1, h + padding, \
+                                xoffset);                              \
+    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);         \
+    return aom_obmc_variance##w##x##h(tmp1, w, wsrc, mask, sse);       \
+  }
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+OBMC_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+#undef OBMC_SUBPEL_VARIANCE_WXH_NEON
+#endif  // !CONFIG_REALTIME_ONLY
diff --git a/test/variance_test.cc b/test/variance_test.cc
index ce5afec..2c272f0 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -3310,6 +3310,35 @@
 INSTANTIATE_TEST_SUITE_P(NEON, AvxSubpelAvgVarianceTest,
                          ::testing::ValuesIn(kArraySubpelAvgVariance_neon));
 
+#if !CONFIG_REALTIME_ONLY
+const ObmcSubpelVarianceParams kArrayObmcSubpelVariance_neon[] = {
+  ObmcSubpelVarianceParams(7, 7, &aom_obmc_sub_pixel_variance128x128_neon, 0),
+  ObmcSubpelVarianceParams(7, 6, &aom_obmc_sub_pixel_variance128x64_neon, 0),
+  ObmcSubpelVarianceParams(6, 7, &aom_obmc_sub_pixel_variance64x128_neon, 0),
+  ObmcSubpelVarianceParams(6, 6, &aom_obmc_sub_pixel_variance64x64_neon, 0),
+  ObmcSubpelVarianceParams(6, 5, &aom_obmc_sub_pixel_variance64x32_neon, 0),
+  ObmcSubpelVarianceParams(5, 6, &aom_obmc_sub_pixel_variance32x64_neon, 0),
+  ObmcSubpelVarianceParams(5, 5, &aom_obmc_sub_pixel_variance32x32_neon, 0),
+  ObmcSubpelVarianceParams(5, 4, &aom_obmc_sub_pixel_variance32x16_neon, 0),
+  ObmcSubpelVarianceParams(4, 5, &aom_obmc_sub_pixel_variance16x32_neon, 0),
+  ObmcSubpelVarianceParams(4, 4, &aom_obmc_sub_pixel_variance16x16_neon, 0),
+  ObmcSubpelVarianceParams(4, 3, &aom_obmc_sub_pixel_variance16x8_neon, 0),
+  ObmcSubpelVarianceParams(3, 4, &aom_obmc_sub_pixel_variance8x16_neon, 0),
+  ObmcSubpelVarianceParams(3, 3, &aom_obmc_sub_pixel_variance8x8_neon, 0),
+  ObmcSubpelVarianceParams(3, 2, &aom_obmc_sub_pixel_variance8x4_neon, 0),
+  ObmcSubpelVarianceParams(2, 3, &aom_obmc_sub_pixel_variance4x8_neon, 0),
+  ObmcSubpelVarianceParams(2, 2, &aom_obmc_sub_pixel_variance4x4_neon, 0),
+  ObmcSubpelVarianceParams(6, 4, &aom_obmc_sub_pixel_variance64x16_neon, 0),
+  ObmcSubpelVarianceParams(4, 6, &aom_obmc_sub_pixel_variance16x64_neon, 0),
+  ObmcSubpelVarianceParams(5, 3, &aom_obmc_sub_pixel_variance32x8_neon, 0),
+  ObmcSubpelVarianceParams(3, 5, &aom_obmc_sub_pixel_variance8x32_neon, 0),
+  ObmcSubpelVarianceParams(4, 2, &aom_obmc_sub_pixel_variance16x4_neon, 0),
+  ObmcSubpelVarianceParams(2, 4, &aom_obmc_sub_pixel_variance4x16_neon, 0),
+};
+INSTANTIATE_TEST_SUITE_P(NEON, AvxObmcSubpelVarianceTest,
+                         ::testing::ValuesIn(kArrayObmcSubpelVariance_neon));
+#endif
+
 const GetSseSumParams kArrayGetSseSum8x8Quad_neon[] = {
   GetSseSumParams(7, 7, &aom_get_var_sse_sum_8x8_quad_neon, 0),
   GetSseSumParams(6, 6, &aom_get_var_sse_sum_8x8_quad_neon, 0),