Add Neon implementation of masked sub-pixel variance

Add Neon implementation of masked sub-pixel variance functions, as well
as the corresponding tests.

Change-Id: I91504129c7f2526ea056605eab0fe0dffecb925b
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b3597a6..76827cd 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1467,7 +1467,7 @@
   foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "aom_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src, int src_stride, int xoffset, int yoffset, const uint8_t *ref, int ref_stride, const uint8_t *second_pred, const uint8_t *msk, int msk_stride, int invert_mask, unsigned int *sse";
-    specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+    specialize "aom_masked_sub_pixel_variance${w}x${h}", qw/ssse3 neon/;
   }
 
   if (aom_config("CONFIG_AV1_HIGHBITDEPTH") eq "yes") {
diff --git a/aom_dsp/arm/subpel_variance_neon.c b/aom_dsp/arm/subpel_variance_neon.c
index 07f5801..94f755a 100644
--- a/aom_dsp/arm/subpel_variance_neon.c
+++ b/aom_dsp/arm/subpel_variance_neon.c
@@ -650,3 +650,54 @@
 #undef OBMC_SUBPEL_VARIANCE_WXH_NEON
 #undef SPECIALIZED_OBMC_SUBPEL_VARIANCE_WXH_NEON
 #endif  // !CONFIG_REALTIME_ONLY
+
+#define MASKED_SUBPEL_VARIANCE_WXH_NEON(w, h, padding)                         \
+  unsigned int aom_masked_sub_pixel_variance##w##x##h##_neon(                  \
+      const uint8_t *src, int src_stride, int xoffset, int yoffset,            \
+      const uint8_t *ref, int ref_stride, const uint8_t *second_pred,          \
+      const uint8_t *msk, int msk_stride, int invert_mask,                     \
+      unsigned int *sse) {                                                     \
+    uint8_t tmp0[w * (h + padding)];                                           \
+    uint8_t tmp1[w * h];                                                       \
+    uint8_t tmp2[w * h];                                                       \
+    var_filter_block2d_bil_w##w(src, tmp0, src_stride, 1, (h + padding),       \
+                                xoffset);                                      \
+    var_filter_block2d_bil_w##w(tmp0, tmp1, w, w, h, yoffset);                 \
+    aom_comp_mask_pred_neon(tmp2, second_pred, w, h, tmp1, w, msk, msk_stride, \
+                            invert_mask);                                      \
+    return aom_variance##w##x##h##_neon(tmp2, w, ref, ref_stride, sse);        \
+  }
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 4, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 8, 2)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 4, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 8, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 16, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 8, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 16, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 32, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 16, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 32, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 64, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 32, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 64, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 128, 1)
+
+MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 64, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(128, 128, 1)
+
+// Realtime mode doesn't use 4x rectangular blocks.
+#if !CONFIG_REALTIME_ONLY
+MASKED_SUBPEL_VARIANCE_WXH_NEON(4, 16, 2)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(8, 32, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 4, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(16, 64, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(32, 8, 1)
+MASKED_SUBPEL_VARIANCE_WXH_NEON(64, 16, 1)
+#endif  // !CONFIG_REALTIME_ONLY
+
+#undef MASKED_SUBPEL_VARIANCE_WXH_NEON
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
index 4a4cb1a..e76403e 100644
--- a/test/masked_variance_test.cc
+++ b/test/masked_variance_test.cc
@@ -514,4 +514,59 @@
                          ::testing::ValuesIn(hbd_sub_pel_var_test));
 #endif  // CONFIG_AV1_HIGHBITDEPTH
 #endif  // HAVE_SSSE3
+
+#if HAVE_NEON
+
+const MaskedSubPixelVarianceParam sub_pel_var_test[] = {
+  make_tuple(&aom_masked_sub_pixel_variance128x128_neon,
+             &aom_masked_sub_pixel_variance128x128_c),
+  make_tuple(&aom_masked_sub_pixel_variance128x64_neon,
+             &aom_masked_sub_pixel_variance128x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance64x128_neon,
+             &aom_masked_sub_pixel_variance64x128_c),
+  make_tuple(&aom_masked_sub_pixel_variance64x64_neon,
+             &aom_masked_sub_pixel_variance64x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance64x32_neon,
+             &aom_masked_sub_pixel_variance64x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x64_neon,
+             &aom_masked_sub_pixel_variance32x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x32_neon,
+             &aom_masked_sub_pixel_variance32x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x16_neon,
+             &aom_masked_sub_pixel_variance32x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x32_neon,
+             &aom_masked_sub_pixel_variance16x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x16_neon,
+             &aom_masked_sub_pixel_variance16x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x8_neon,
+             &aom_masked_sub_pixel_variance16x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x16_neon,
+             &aom_masked_sub_pixel_variance8x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x8_neon,
+             &aom_masked_sub_pixel_variance8x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x4_neon,
+             &aom_masked_sub_pixel_variance8x4_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x8_neon,
+             &aom_masked_sub_pixel_variance4x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x4_neon,
+             &aom_masked_sub_pixel_variance4x4_c),
+#if !CONFIG_REALTIME_ONLY
+  make_tuple(&aom_masked_sub_pixel_variance64x16_neon,
+             &aom_masked_sub_pixel_variance64x16_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x64_neon,
+             &aom_masked_sub_pixel_variance16x64_c),
+  make_tuple(&aom_masked_sub_pixel_variance32x8_neon,
+             &aom_masked_sub_pixel_variance32x8_c),
+  make_tuple(&aom_masked_sub_pixel_variance8x32_neon,
+             &aom_masked_sub_pixel_variance8x32_c),
+  make_tuple(&aom_masked_sub_pixel_variance16x4_neon,
+             &aom_masked_sub_pixel_variance16x4_c),
+  make_tuple(&aom_masked_sub_pixel_variance4x16_neon,
+             &aom_masked_sub_pixel_variance4x16_c),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON_C_COMPARE, MaskedSubPixelVarianceTest,
+                         ::testing::ValuesIn(sub_pel_var_test));
+#endif  // HAVE_NEON
 }  // namespace