[CFL] SSSE3/AVX2 versions of cfl_build_prediction_hbd

Includes unit tests for conformance and speed.

SSSE3/CFLPredictHBDTest:
4x4: C time = 1436 us, SIMD time = 358 us (~4x)
8x8: C time = 4821 us, SIMD time = 598 us (~8.1x)
16x16: C time = 18528 us, SIMD time = 1793 us (~10x)
32x32: C time = 72998 us, SIMD time = 6400 us (~11x)

AVX2/CFLPredictHBDTest:
4x4: C time = 1436 us, SIMD time = 398 us (~3.6x)
8x8: C time = 4924 us, SIMD time = 644 us (~7.6x)
16x16: C time = 18624 us, SIMD time = 1617 us (~12x)
32x32: C time = 73509 us, SIMD time = 3635 us (~20x)

Change-Id: Icbcfefbf165facdbd77c9b3861af2bbf464254a0
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index 7a04952..7ad7a67 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -50,12 +50,16 @@
 
 typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
 
+typedef cfl_predict_hbd_fn (*get_predict_fn_hbd)(TX_SIZE tx_size);
+
 typedef std::tr1::tuple<int, int, subtract_fn> subtract_param;
 
 typedef std::tr1::tuple<int, int, get_subsample_fn> subsample_param;
 
 typedef std::tr1::tuple<TX_SIZE, get_predict_fn> predict_param;
 
+typedef std::tr1::tuple<TX_SIZE, get_predict_fn_hbd> predict_param_hbd;
+
 static void assertFaster(int ref_elapsed_time, int elapsed_time) {
   EXPECT_GT(ref_elapsed_time, elapsed_time)
       << "Error: CFLSubtractSpeedTest, SIMD slower than C." << std::endl
@@ -151,6 +155,40 @@
   }
 };
 
+// Fixture for cfl_predict_hbd_fn tests, parameterized on (TX_SIZE, getter).
+class CFLPredictHBDTest : public ::testing::TestWithParam<predict_param_hbd> {
+ public:
+  virtual ~CFLPredictHBDTest() {}
+  virtual void SetUp() { predict = GET_PARAM(1); }
+
+ protected:
+  int Width() const { return tx_size_wide[GET_PARAM(0)]; }
+  int Height() const { return tx_size_high[GET_PARAM(0)]; }
+  TX_SIZE Tx_size() const { return GET_PARAM(0); }
+  DECLARE_ALIGNED(32, uint16_t, chroma_pels_ref[CFL_BUF_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, sub_luma_pels_ref[CFL_BUF_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, chroma_pels[CFL_BUF_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, sub_luma_pels[CFL_BUF_SQUARE]);
+  get_predict_fn_hbd predict;
+  int bd;
+  int alpha_q3;
+  uint16_t dc;  // Same width as the chroma_pels elements it is stored into.
+  void init(int width, int height) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    bd = 12;
+    alpha_q3 = rnd(33) - 16;
+    dc = rnd(1 << bd);  // rnd(n) presumably yields [0, n): needs 12 bits.
+    for (int j = 0; j < height; j++) {
+      for (int i = 0; i < width; i++) {
+        const int idx = j * CFL_BUF_LINE + i;
+        chroma_pels[idx] = chroma_pels_ref[idx] = dc;
+        sub_luma_pels[idx] = sub_luma_pels_ref[idx] =
+            rnd(1 << bd) - (1 << (bd - 1));
+      }
+    }
+  }
+};
+
 TEST_P(CFLSubtractTest, SubtractTest) {
   const int width = Width();
   const int height = Height();
@@ -296,6 +334,58 @@
   assertFaster(ref_elapsed_time, elapsed_time);
 }
 
+// Conformance: the getter under test must reproduce the C output exactly.
+TEST_P(CFLPredictHBDTest, PredictHBDTest) {
+  const int width = Width();
+  const int height = Height();
+  const TX_SIZE tx_size = Tx_size();
+  for (int iter = 0; iter < NUM_ITERATIONS; ++iter) {
+    init(width, height);
+    const cfl_predict_hbd_fn test_fn = predict(tx_size);
+    test_fn(sub_luma_pels, chroma_pels, CFL_BUF_LINE, tx_size, alpha_q3, bd);
+    get_predict_hbd_fn_c(tx_size)(sub_luma_pels_ref, chroma_pels_ref,
+                                  CFL_BUF_LINE, tx_size, alpha_q3, bd);
+    for (int row = 0; row < height; ++row) {
+      for (int col = 0; col < width; ++col) {
+        const int pos = row * CFL_BUF_LINE + col;
+        ASSERT_EQ(chroma_pels_ref[pos], chroma_pels[pos]);
+      }
+    }
+  }
+}
+
+// Speed check, disabled by default: times the C and tested implementations.
+TEST_P(CFLPredictHBDTest, DISABLED_PredictHBDSpeedTest) {
+  const int width = Width();
+  const int height = Height();
+  const TX_SIZE tx_size = Tx_size();
+  init(width, height);
+
+  // Reference run: C implementation on the *_ref buffers.
+  const cfl_predict_hbd_fn c_impl = get_predict_hbd_fn_c(tx_size);
+  aom_usec_timer ref_timer;
+  aom_usec_timer_start(&ref_timer);
+  for (int n = 0; n < NUM_ITERATIONS_SPEED; ++n) {
+    c_impl(sub_luma_pels_ref, chroma_pels_ref, CFL_BUF_LINE, tx_size, alpha_q3,
+           bd);
+  }
+  aom_usec_timer_mark(&ref_timer);
+  const int c_usec = static_cast<int>(aom_usec_timer_elapsed(&ref_timer));
+
+  // Tested run: implementation returned by the parameterized getter.
+  const cfl_predict_hbd_fn test_impl = predict(tx_size);
+  aom_usec_timer timer;
+  aom_usec_timer_start(&timer);
+  for (int n = 0; n < NUM_ITERATIONS_SPEED; ++n) {
+    test_impl(sub_luma_pels, chroma_pels, CFL_BUF_LINE, tx_size, alpha_q3, bd);
+  }
+  aom_usec_timer_mark(&timer);
+  const int test_usec = static_cast<int>(aom_usec_timer_elapsed(&timer));
+
+  printSpeed(c_usec, test_usec, width, height);
+  assertFaster(c_usec, test_usec);
+}
+
 #if HAVE_SSE2
 const subtract_param subtract_sizes_sse2[] = { ALL_CFL_SIZES(
     av1_cfl_subtract_sse2) };
@@ -312,11 +402,17 @@
 const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
     get_predict_lbd_fn_ssse3) };
 
+const predict_param_hbd predict_sizes_hbd_ssse3[] = { ALL_CFL_TX_SIZES(
+    get_predict_hbd_fn_ssse3) };  // Params for the SSSE3 HBD predict tests.
+
 INSTANTIATE_TEST_CASE_P(SSSE3, CFLSubsampleTest,
                         ::testing::ValuesIn(subsample_sizes_ssse3));
 
 INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictTest,
                         ::testing::ValuesIn(predict_sizes_ssse3));
+
+INSTANTIATE_TEST_CASE_P(SSSE3, CFLPredictHBDTest,
+                        ::testing::ValuesIn(predict_sizes_hbd_ssse3));
 #endif
 
 #if HAVE_AVX2
@@ -329,6 +425,9 @@
 const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
     get_predict_lbd_fn_avx2) };
 
+const predict_param_hbd predict_sizes_hbd_avx2[] = { ALL_CFL_TX_SIZES(
+    get_predict_hbd_fn_avx2) };  // Params for the AVX2 HBD predict tests.
+
 INSTANTIATE_TEST_CASE_P(AVX2, CFLSubtractTest,
                         ::testing::ValuesIn(subtract_sizes_avx2));
 
@@ -337,5 +436,8 @@
 
 INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictTest,
                         ::testing::ValuesIn(predict_sizes_avx2));
+
+INSTANTIATE_TEST_CASE_P(AVX2, CFLPredictHBDTest,
+                        ::testing::ValuesIn(predict_sizes_hbd_avx2));
 #endif
 }  // namespace