[CFL] NEON Version of luma_subsampling_420_lbd

Includes unit tests for conformance and speed.

NEON/CFLSubsampleTest (Odroid C2, Cortex-A53)
4x4: C time = 1003 us, SIMD time = 811 us (~1.2x)
8x8: C time = 3042 us, SIMD time = 1174 us (~2.6x)
16x16: C time = 11792 us, SIMD time = 2389 us (~4.9x)
32x32: C time = 45370 us, SIMD time = 9224 us (~4.9x)

Change-Id: I717714e942680ca51a0544f32e2548386290ce32
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index cf97e8d..beaaa13 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -22,6 +22,49 @@
   return vaddq_u16(vld1q_u16(buf), vld1q_u16(buf + offset));
 }
 
+// Load half of a vector and duplicated in other half
+static INLINE uint8x8_t vldh_dup_u8(const uint8_t *ptr) {
+  return vreinterpret_u8_u32(vld1_dup_u32((const uint32_t *)ptr));
+}
+
+// Store half of a vector.
+static INLINE void vsth_s16(int16_t *ptr, int16x4_t val) {
+  *((uint32_t *)ptr) = vreinterpret_u32_s16(val)[0];
+}
+
+static void cfl_luma_subsampling_420_lbd_neon(const uint8_t *input,
+                                              int input_stride,
+                                              int16_t *pred_buf_q3, int width,
+                                              int height) {
+  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const int luma_stride = input_stride << 1;
+  do {
+    if (width == 4) {
+      const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+      const uint16x4_t sum = vpadal_u8(top, vldh_dup_u8(input + input_stride));
+      vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
+    } else if (width == 8) {
+      const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+      const uint16x4_t sum = vpadal_u8(top, vld1_u8(input + input_stride));
+      vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(sum), 1));
+    } else {
+      const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+      const uint16x8_t sum = vpadalq_u8(top, vld1q_u8(input + input_stride));
+      vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(sum), 1));
+      if (width == 32) {
+        const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
+        const uint16x8_t next_sum =
+            vpadalq_u8(next_top, vld1q_u8(input + 16 + input_stride));
+        vst1q_s16(pred_buf_q3 + 8,
+                  vshlq_n_s16(vreinterpretq_s16_u16(next_sum), 1));
+      }
+    }
+    input += luma_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
+CFL_GET_SUBSAMPLE_FUNCTION(neon)
+
 static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
                                          int height, int round_offset,
                                          const int num_pel_log2) {
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 8017b40..834a1c2 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -564,7 +564,7 @@
 specialize qw/get_subtract_average_fn sse2 avx2 neon/;
 
 add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2/;
+specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
 
 add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
 specialize qw/get_predict_lbd_fn ssse3 avx2/;
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 98f4ec2..658d50a 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -480,8 +480,14 @@
 const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
     get_subtract_average_fn_neon) };
 
+const subsample_param subsample_sizes_neon[] = { ALL_CFL_TX_SIZES(
+    cfl_get_luma_subsampling_420_lbd_neon) };
+
 INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest,
                         ::testing::ValuesIn(sub_avg_sizes_neon));
 
+INSTANTIATE_TEST_CASE_P(NEON, CFLSubsampleTest,
+                        ::testing::ValuesIn(subsample_sizes_neon));
+
 #endif
 }  // namespace