[CFL] Neon Version of 4:4:4 HBD Subsampling

Includes unit tests for conformance and speed.

NEON/CFLSubsampleHBD444SpeedTest
4x4: C time = 1515 us, SIMD time = 513 us (~3x)
8x8: C time = 5569 us, SIMD time = 1302 us (~4.3x)
16x16: C time = 22337 us, SIMD time = 4203 us (~5.3x)
32x32: C time = 87936 us, SIMD time = 17046 us (~5.2x)

Change-Id: I708147da051ebcd28c51cceaba1017a658911c88
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index a759b9d..e381448 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -230,6 +230,37 @@
   } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
 }
 
+static void cfl_luma_subsampling_444_hbd_neon(const uint16_t *input,
+                                              int input_stride,
+                                              int16_t *pred_buf_q3, int width,
+                                              int height) {
+  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+  do {
+    if (width == 4) {
+      const int16x4_t top = vreinterpret_s16_u16(vld1_u16(input));
+      vst1_s16(pred_buf_q3, vshl_n_s16(top, 3));
+    } else if (width == 8) {
+      const int16x8_t top = vreinterpretq_s16_u16(vld1q_u16(input));
+      vst1q_s16(pred_buf_q3, vshlq_n_s16(top, 3));
+    } else if (width == 16) {
+      const uint16x8x2_t top = vld2q_u16(input);
+      int16x8x2_t results;
+      results.val[0] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[0]), 3);
+      results.val[1] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[1]), 3);
+      vst2q_s16(pred_buf_q3, results);
+    } else {
+      const uint16x8x4_t top = vld4q_u16(input);
+      int16x8x4_t results;
+      results.val[0] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[0]), 3);
+      results.val[1] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[1]), 3);
+      results.val[2] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[2]), 3);
+      results.val[3] = vshlq_n_s16(vreinterpretq_s16_u16(top.val[3]), 3);
+      vst4q_s16(pred_buf_q3, results);
+    }
+    input += input_stride;
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
 CFL_GET_SUBSAMPLE_FUNCTION(neon)
 
 static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 6a7440d..e9c605b 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -362,7 +362,7 @@
 specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
 
 add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
-specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2/;
+specialize qw/cfl_get_luma_subsampling_444_hbd ssse3 avx2 neon/;
 
 add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
 specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index a47be66..3b15e86 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -335,10 +335,6 @@
   }
 }
 
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when HBD 444 SIMD
-// will be implemented
-CFL_SUBSAMPLE_FUNCTIONS(c, 444, hbd)
-
 CFL_GET_SUBSAMPLE_FUNCTION(c)
 
 static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
@@ -349,8 +345,7 @@
     }
     return cfl_get_luma_subsampling_422_hbd(tx_size);
   }
-  // TODO(ltrudeau) Remove _c when HBD 444 SIMD is added
-  return cfl_get_luma_subsampling_444_hbd_c(tx_size);
+  return cfl_get_luma_subsampling_444_hbd(tx_size);
 }
 
 static INLINE cfl_subsample_lbd_fn cfl_subsampling_lbd(TX_SIZE tx_size,
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 766515b..969cd31 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -156,7 +156,8 @@
   CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 444, lbd) \
   CFL_SUBSAMPLE_FUNCTIONS(arch, 420, hbd) \
-  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd)
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 422, hbd) \
+  CFL_SUBSAMPLE_FUNCTIONS(arch, 444, hbd)
 
 // Null function used for invalid tx_sizes
 static INLINE void cfl_subtract_average_null(int16_t *pred_buf_q3) {
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index aa68f78..1e869cf 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -297,7 +297,6 @@
   } while (pred_buf_q3 < end);
 }
 
-CFL_SUBSAMPLE_FUNCTIONS(ssse3, 444, hbd)
 CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
 
 static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 91af71f..020c124 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -529,11 +529,9 @@
 };
 
 const subsample_hbd_param subsample_hbd_sizes_neon[] = {
-  ALL_CFL_TX_SIZES_SUBSAMPLE(
-      cfl_get_luma_subsampling_420_hbd_neon,
-      cfl_get_luma_subsampling_422_hbd_neon,
-      cfl_get_luma_subsampling_444_hbd_c)  // TODO(ltrudeau) replace with
-                                           // 444 when SIMD is available
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_neon,
+                             cfl_get_luma_subsampling_422_hbd_neon,
+                             cfl_get_luma_subsampling_444_hbd_neon)
 };
 
 const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(