[CFL] SSSE3 Version of 4:4:4 HBD Subsampling

Includes unit tests for conformance and speed.

SSSE3/CFLSubsampleHBD444SpeedTest
4x4: C time = 155 us, SIMD time = 49 us (~3.2x)
8x8: C time = 522 us, SIMD time = 80 us (~6.5x)
16x16: C time = 2067 us, SIMD time = 286 us (~7.2x)
32x32: C time = 7045 us, SIMD time = 1044 us (~6.7x)

Change-Id: I0979ae2284765954b45fe9bb16ee618db1c4b36e
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 554598d..60d8ead 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -361,6 +361,9 @@
 add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd/, "TX_SIZE tx_size";
 specialize qw/cfl_get_luma_subsampling_422_hbd ssse3 avx2 neon/;
 
+add_proto qw/cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_444_hbd ssse3/;
+
 add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
 specialize qw/get_predict_lbd_fn ssse3 avx2 neon/;
 
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index 250375f..0fd15d1 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -266,6 +266,48 @@
   } while (pred_buf_m128i < end);
 }
 
+static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
+                                                      int input_stride,
+                                                      int16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;  // loop bound
+  // Per row: copy `width` 16-bit samples to pred_buf_q3, each shifted left 3.
+  __m128i row, row_1, row_2, row_3;
+  do {  // body runs at least once, even if height is 0
+    if (width == 4) {
+      row = _mm_loadl_epi64((__m128i *)input);  // 4 samples = 64 bits
+    } else {  // assumes width is 8, 16 or 32 -- TODO confirm callers
+      row = _mm_loadu_si128((__m128i *)input);  // samples 0..7
+      if (width >= 16) {
+        row_1 = _mm_loadu_si128((__m128i *)(input + 8));  // samples 8..15
+        row_1 = _mm_slli_epi16(row_1, 3);
+      }
+      if (width == 32) {
+        row_2 = _mm_loadu_si128((__m128i *)(input + 16));  // 16..23
+        row_2 = _mm_slli_epi16(row_2, 3);
+        row_3 = _mm_loadu_si128((__m128i *)(input + 24));  // 24..31
+        row_3 = _mm_slli_epi16(row_3, 3);
+      }
+    }
+    row = _mm_slli_epi16(row, 3);  // row_1..row_3 were shifted at load
+    // Stores mirror the loads; row_1..row_3 are only set for wider blocks.
+    if (width == 4) {
+      _mm_storel_epi64((__m128i *)pred_buf_q3, row);
+    } else {
+      _mm_storeu_si128((__m128i *)pred_buf_q3, row);
+      if (width >= 16) {
+        _mm_storeu_si128((__m128i *)(pred_buf_q3 + 8), row_1);
+      }
+      if (width == 32) {
+        _mm_storeu_si128((__m128i *)(pred_buf_q3 + 16), row_2);
+        _mm_storeu_si128((__m128i *)(pred_buf_q3 + 24), row_3);
+      }
+    }
+    input += input_stride;  // stride counted in uint16_t elements
+  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);  // next output row
+}
+
+CFL_SUBSAMPLE_FUNCTIONS(ssse3, 444, hbd)
 CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
 
 static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 33bced7..d00ae76 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -315,8 +315,7 @@
     CFLSubsampleTest::SetUp();
     fun_420_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size);
     fun_422_ref = cfl_get_luma_subsampling_422_hbd_c(tx_size);
-    // TODO(ltrudeau) Replace with 444 when SIMD is available
-    fun_444_ref = cfl_get_luma_subsampling_420_hbd_c(tx_size);
+    fun_444_ref = cfl_get_luma_subsampling_444_hbd_c(tx_size);
   }
 };
 
@@ -337,6 +336,14 @@
   subsampleSpeedTest(fun_422, fun_422_ref, &ACMRandom::Rand12);
 }
 
+TEST_P(CFLSubsampleHBDTest, SubsampleHBD444Test) {  // conformance
+  subsampleTest(fun_444, fun_444_ref, width, height, &ACMRandom::Rand12);
+}
+
+TEST_P(CFLSubsampleHBDTest, DISABLED_SubsampleHBD444SpeedTest) {  // opt-in
+  subsampleSpeedTest(fun_444, fun_444_ref, &ACMRandom::Rand12);
+}
+
 typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
 typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param;
 class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
@@ -448,11 +455,9 @@
 };
 
 const subsample_hbd_param subsample_hbd_sizes_ssse3[] = {
-  ALL_CFL_TX_SIZES_SUBSAMPLE(
-      cfl_get_luma_subsampling_420_hbd_ssse3,
-      cfl_get_luma_subsampling_422_hbd_ssse3,
-      cfl_get_luma_subsampling_420_hbd_ssse3)  // TODO(ltrudeau) replace with
-                                               // 444 when SIMD is available
+  ALL_CFL_TX_SIZES_SUBSAMPLE(cfl_get_luma_subsampling_420_hbd_ssse3,
+                             cfl_get_luma_subsampling_422_hbd_ssse3,
+                             cfl_get_luma_subsampling_444_hbd_ssse3)
 };
 
 const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
@@ -488,8 +493,8 @@
   ALL_CFL_TX_SIZES_SUBSAMPLE(
       cfl_get_luma_subsampling_420_hbd_avx2,
       cfl_get_luma_subsampling_422_hbd_avx2,
-      cfl_get_luma_subsampling_420_hbd_avx2)  // TODO(ltrudeau) replace with
-                                              // 444 when SIMD is available
+      cfl_get_luma_subsampling_444_hbd_c)  // TODO(ltrudeau) replace with
+                                           // AVX2 444 once it is available
 };
 
 const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
@@ -529,8 +534,8 @@
   ALL_CFL_TX_SIZES_SUBSAMPLE(
       cfl_get_luma_subsampling_420_hbd_neon,
       cfl_get_luma_subsampling_422_hbd_neon,
-      cfl_get_luma_subsampling_420_hbd_neon)  // TODO(ltrudeau) replace with
-                                              // 444 when SIMD is available
+      cfl_get_luma_subsampling_444_hbd_c)  // TODO(ltrudeau) replace with
+                                           // NEON 444 once it is available
 };
 
 const predict_param predict_sizes_neon[] = { ALL_CFL_TX_SIZES(