Implement Neon variance functions for all block sizes

Add a Neon implementation of aom_variance<w>x<h> for the following
block sizes - that previously only used a scalar C implementation on
Arm:
 * 64x16
 * 32x8
 * 16x4, 16x64
 * 8x32
 * 4x16

Also add test coverage for new Neon functions.

Change-Id: I7f3684cde55f5a03f50852915dd43848b1dff3bb
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 5e081a7..f5c9b03 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1378,12 +1378,12 @@
   specialize qw/aom_sub_pixel_avg_variance4x4          msa sse2 ssse3/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    specialize qw/aom_variance4x16 sse2/;
-    specialize qw/aom_variance16x4 sse2 avx2/;
-    specialize qw/aom_variance8x32 sse2/;
-    specialize qw/aom_variance32x8 sse2 avx2/;
-    specialize qw/aom_variance16x64 sse2 avx2/;
-    specialize qw/aom_variance64x16 sse2 avx2/;
+    specialize qw/aom_variance4x16  neon sse2/;
+    specialize qw/aom_variance16x4  neon sse2 avx2/;
+    specialize qw/aom_variance8x32  neon sse2/;
+    specialize qw/aom_variance32x8  neon sse2 avx2/;
+    specialize qw/aom_variance16x64 neon sse2 avx2/;
+    specialize qw/aom_variance64x16 neon sse2 avx2/;
 
     specialize qw/aom_sub_pixel_variance4x16 neon sse2 ssse3/;
     specialize qw/aom_sub_pixel_variance16x4 neon avx2 sse2 ssse3/;
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index 1b6c43a..8505d1b 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -205,19 +205,25 @@
 
 VARIANCE_WXH_NEON(4, 4, 4)
 VARIANCE_WXH_NEON(4, 8, 5)
+VARIANCE_WXH_NEON(4, 16, 6)
 
 VARIANCE_WXH_NEON(8, 4, 5)
 VARIANCE_WXH_NEON(8, 8, 6)
 VARIANCE_WXH_NEON(8, 16, 7)
+VARIANCE_WXH_NEON(8, 32, 8)
 
+VARIANCE_WXH_NEON(16, 4, 6)
 VARIANCE_WXH_NEON(16, 8, 7)
 VARIANCE_WXH_NEON(16, 16, 8)
 VARIANCE_WXH_NEON(16, 32, 9)
+VARIANCE_WXH_NEON(16, 64, 10)
 
+VARIANCE_WXH_NEON(32, 8, 8)
 VARIANCE_WXH_NEON(32, 16, 9)
 VARIANCE_WXH_NEON(32, 32, 10)
 VARIANCE_WXH_NEON(32, 64, 11)
 
+VARIANCE_WXH_NEON(64, 16, 10)
 VARIANCE_WXH_NEON(64, 32, 11)
 VARIANCE_WXH_NEON(64, 64, 12)
 VARIANCE_WXH_NEON(64, 128, 13)
diff --git a/test/variance_test.cc b/test/variance_test.cc
index e96f933..d41b4be 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -2822,25 +2822,36 @@
                          ::testing::Values(MseParams(4, 4,
                                                      &aom_mse16x16_neon)));
 
-INSTANTIATE_TEST_SUITE_P(
-    NEON, AvxVarianceTest,
-    ::testing::Values(VarianceParams(7, 7, &aom_variance128x128_neon),
-                      VarianceParams(6, 6, &aom_variance64x64_neon),
-                      VarianceParams(7, 6, &aom_variance128x64_neon),
-                      VarianceParams(6, 7, &aom_variance64x128_neon),
-                      VarianceParams(6, 6, &aom_variance64x64_neon),
-                      VarianceParams(6, 5, &aom_variance64x32_neon),
-                      VarianceParams(5, 6, &aom_variance32x64_neon),
-                      VarianceParams(5, 5, &aom_variance32x32_neon),
-                      VarianceParams(5, 4, &aom_variance32x16_neon),
-                      VarianceParams(4, 5, &aom_variance16x32_neon),
-                      VarianceParams(4, 4, &aom_variance16x16_neon),
-                      VarianceParams(4, 3, &aom_variance16x8_neon),
-                      VarianceParams(3, 4, &aom_variance8x16_neon),
-                      VarianceParams(3, 3, &aom_variance8x8_neon),
-                      VarianceParams(3, 2, &aom_variance8x4_neon),
-                      VarianceParams(2, 3, &aom_variance4x8_neon),
-                      VarianceParams(2, 2, &aom_variance4x4_neon)));
+const VarianceParams kArrayVariance_neon[] = {
+  VarianceParams(7, 7, &aom_variance128x128_neon),
+  VarianceParams(6, 6, &aom_variance64x64_neon),
+  VarianceParams(7, 6, &aom_variance128x64_neon),
+  VarianceParams(6, 7, &aom_variance64x128_neon),
+  VarianceParams(6, 6, &aom_variance64x64_neon),
+  VarianceParams(6, 5, &aom_variance64x32_neon),
+  VarianceParams(5, 6, &aom_variance32x64_neon),
+  VarianceParams(5, 5, &aom_variance32x32_neon),
+  VarianceParams(5, 4, &aom_variance32x16_neon),
+  VarianceParams(4, 5, &aom_variance16x32_neon),
+  VarianceParams(4, 4, &aom_variance16x16_neon),
+  VarianceParams(4, 3, &aom_variance16x8_neon),
+  VarianceParams(3, 4, &aom_variance8x16_neon),
+  VarianceParams(3, 3, &aom_variance8x8_neon),
+  VarianceParams(3, 2, &aom_variance8x4_neon),
+  VarianceParams(2, 3, &aom_variance4x8_neon),
+  VarianceParams(2, 2, &aom_variance4x4_neon),
+#if !CONFIG_REALTIME_ONLY
+  VarianceParams(2, 4, &aom_variance4x16_neon),
+  VarianceParams(4, 2, &aom_variance16x4_neon),
+  VarianceParams(3, 5, &aom_variance8x32_neon),
+  VarianceParams(5, 3, &aom_variance32x8_neon),
+  VarianceParams(4, 6, &aom_variance16x64_neon),
+  VarianceParams(6, 4, &aom_variance64x16_neon),
+#endif
+};
+
+INSTANTIATE_TEST_SUITE_P(NEON, AvxVarianceTest,
+                         ::testing::ValuesIn(kArrayVariance_neon));
 
 const SubpelVarianceParams kArraySubpelVariance_neon[] = {
   SubpelVarianceParams(7, 7, &aom_sub_pixel_variance128x128_neon, 0),