Add Neon sad_skip[_4d] functions for 4x4, 8x4 and 16x4 blocks

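Add Neon implementations of sad_skip and sad_skip_x4d functions for
4x4, 8x4 and 16x4 block sizes. Also add the corresponding tests.

To support the smaller heights, the row loop in
aom_dsp/arm/sad4d_neon.c is changed from a do-while starting at
(h - 1) / 2 to a while loop starting at h / 2. The iteration count is
unchanged for even h >= 4, but the loop body is now skipped entirely
when h == 2 instead of being entered with a zero counter.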

Change-Id: I2e194eb2a2e7e56e95c440f0cedd431e5228a17e
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index b32e334..2feed0b 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -806,9 +806,12 @@
   specialize qw/aom_sad_skip_16x8                     sse2  neon/;
   specialize qw/aom_sad_skip_8x16                     sse2  neon/;
   specialize qw/aom_sad_skip_8x8                      sse2  neon/;
+  specialize qw/aom_sad_skip_8x4                            neon/;
   specialize qw/aom_sad_skip_4x8                      sse2  neon/;
+  specialize qw/aom_sad_skip_4x4                            neon/;
 
   specialize qw/aom_sad_skip_4x16                     sse2  neon/;
+  specialize qw/aom_sad_skip_16x4                           neon/;
   specialize qw/aom_sad_skip_8x32                     sse2  neon/;
   specialize qw/aom_sad_skip_32x8                     sse2  neon/;
   specialize qw/aom_sad_skip_16x64                    sse2  neon/;
@@ -1034,11 +1037,14 @@
   specialize qw/aom_sad_skip_16x32x4d   avx2 sse2 neon/;
   specialize qw/aom_sad_skip_16x16x4d   avx2 sse2 neon/;
   specialize qw/aom_sad_skip_16x8x4d    avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_16x4x4d              neon/;
   specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
+  specialize qw/aom_sad_skip_8x4x4d               neon/;
   specialize qw/aom_sad_skip_4x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_4x8x4d          sse2 neon/;
+  specialize qw/aom_sad_skip_4x4x4d               neon/;
 
   specialize qw/aom_sad128x128x3d avx2/;
   specialize qw/aom_sad128x64x3d  avx2/;
diff --git a/aom_dsp/arm/sad4d_neon.c b/aom_dsp/arm/sad4d_neon.c
index 7e491bb..d949e4a 100644
--- a/aom_dsp/arm/sad4d_neon.c
+++ b/aom_dsp/arm/sad4d_neon.c
@@ -287,8 +287,8 @@
 
   src += 2 * src_stride;
   int ref_offset = 2 * ref_stride;
-  int i = (h - 1) / 2;
-  do {
+  int i = h / 2;
+  while (--i != 0) {
     s = load_unaligned_u8(src, src_stride);
     r0 = load_unaligned_u8(ref[0] + ref_offset, ref_stride);
     r1 = load_unaligned_u8(ref[1] + ref_offset, ref_stride);
@@ -302,7 +302,7 @@
 
     src += 2 * src_stride;
     ref_offset += 2 * ref_stride;
-  } while (--i != 0);
+  }
 
   vst1q_u32(res, horizontal_add_4d_u16x8(sum));
 }
@@ -356,13 +356,16 @@
     res[3] <<= 1;                                                           \
   }
 
+SAD_SKIP_WXH_4D_NEON(4, 4)
 SAD_SKIP_WXH_4D_NEON(4, 8)
 SAD_SKIP_WXH_4D_NEON(4, 16)
 
+SAD_SKIP_WXH_4D_NEON(8, 4)
 SAD_SKIP_WXH_4D_NEON(8, 8)
 SAD_SKIP_WXH_4D_NEON(8, 16)
 SAD_SKIP_WXH_4D_NEON(8, 32)
 
+SAD_SKIP_WXH_4D_NEON(16, 4)
 SAD_SKIP_WXH_4D_NEON(16, 8)
 SAD_SKIP_WXH_4D_NEON(16, 16)
 SAD_SKIP_WXH_4D_NEON(16, 32)
diff --git a/aom_dsp/arm/sad_neon.c b/aom_dsp/arm/sad_neon.c
index 6a22289..841908f 100644
--- a/aom_dsp/arm/sad_neon.c
+++ b/aom_dsp/arm/sad_neon.c
@@ -348,13 +348,16 @@
            sad##w##xh_neon(src, 2 * src_stride, ref, 2 * ref_stride, (h) / 2); \
   }
 
+SAD_SKIP_WXH_NEON(4, 4)
 SAD_SKIP_WXH_NEON(4, 8)
 SAD_SKIP_WXH_NEON(4, 16)
 
+SAD_SKIP_WXH_NEON(8, 4)
 SAD_SKIP_WXH_NEON(8, 8)
 SAD_SKIP_WXH_NEON(8, 16)
 SAD_SKIP_WXH_NEON(8, 32)
 
+SAD_SKIP_WXH_NEON(16, 4)
 SAD_SKIP_WXH_NEON(16, 8)
 SAD_SKIP_WXH_NEON(16, 16)
 SAD_SKIP_WXH_NEON(16, 32)
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 707a3ec..c10e929 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -1886,11 +1886,14 @@
   make_tuple(16, 8, &aom_sad_skip_16x8_neon, -1),
   make_tuple(8, 16, &aom_sad_skip_8x16_neon, -1),
   make_tuple(8, 8, &aom_sad_skip_8x8_neon, -1),
+  make_tuple(8, 4, &aom_sad_skip_8x4_neon, -1),
   make_tuple(4, 8, &aom_sad_skip_4x8_neon, -1),
+  make_tuple(4, 4, &aom_sad_skip_4x4_neon, -1),
 #if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad_skip_64x16_neon, -1),
   make_tuple(32, 8, &aom_sad_skip_32x8_neon, -1),
   make_tuple(16, 64, &aom_sad_skip_16x64_neon, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4_neon, -1),
   make_tuple(8, 32, &aom_sad_skip_8x32_neon, -1),
   make_tuple(4, 16, &aom_sad_skip_4x16_neon, -1),
 #endif
@@ -1910,13 +1913,16 @@
   make_tuple(16, 32, &aom_sad_skip_16x32x4d_neon, -1),
   make_tuple(16, 16, &aom_sad_skip_16x16x4d_neon, -1),
   make_tuple(16, 8, &aom_sad_skip_16x8x4d_neon, -1),
-  make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
   make_tuple(8, 16, &aom_sad_skip_8x16x4d_neon, -1),
+  make_tuple(8, 8, &aom_sad_skip_8x8x4d_neon, -1),
+  make_tuple(8, 4, &aom_sad_skip_8x4x4d_neon, -1),
   make_tuple(4, 8, &aom_sad_skip_4x8x4d_neon, -1),
+  make_tuple(4, 4, &aom_sad_skip_4x4x4d_neon, -1),
 #if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad_skip_64x16x4d_neon, -1),
   make_tuple(32, 8, &aom_sad_skip_32x8x4d_neon, -1),
   make_tuple(16, 64, &aom_sad_skip_16x64x4d_neon, -1),
+  make_tuple(16, 4, &aom_sad_skip_16x4x4d_neon, -1),
   make_tuple(8, 32, &aom_sad_skip_8x32x4d_neon, -1),
   make_tuple(4, 16, &aom_sad_skip_4x16x4d_neon, -1),
 #endif