HBD hybrid transform 16x16 SSE4.1 optimization

- Supported tx_type values: DCT_DCT, DCT_ADST, ADST_DCT, ADST_ADST.
- Update vp10_fht16x16_test.cc to run a bit-exact test against the
  latest C version.
- High bit-depth (HBD) encoder speed improves by ~1.8%.

Change-Id: Icfc799a212e5289bcf6cedcae3722032133a2bc6
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index 0e1bfa2..8833250 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc
@@ -25,16 +25,30 @@
 namespace {
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
-
+using std::tr1::tuple;
 using libvpx_test::FhtFunc;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
-Ht16x16Param;
+typedef tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int> Ht16x16Param;
 
 void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                 int tx_type) {
   vp10_fht16x16_c(in, out, stride, tx_type);
 }
 
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                           int tx_type, int bd);  // NOTE(review): unused here?
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);  // type of test and ref fns
+
+// Test parameter: (optimized function under test, tx_type, bit depth).
+typedef tuple<HbdHtFunc, int, int> HighbdHt16x16Param;
+
+void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride,
+                         int tx_type, int bd) {
+  vp10_fwd_txfm2d_16x16_c(in, out, stride, tx_type, bd);  // C reference
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 class VP10Trans16x16HT
     : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Ht16x16Param> {
@@ -70,60 +84,77 @@
   RunCoeffCheck();
 }
 
-#if CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
-TEST(VP10Trans16x16HTSpeedTest, C_version) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 20000;
-    int bit_depth = 8;
-    int mask = (1 << bit_depth) - 1;
-    const int num_coeffs = 256;
-    int16_t *input = new int16_t[num_coeffs];
-    tran_low_t *output = new tran_low_t[num_coeffs];
-    const int stride = 16;
-    int tx_type;
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HighbdTrans16x16HT  // HBD 16x16 bit-exact test fixture
+    : public ::testing::TestWithParam<HighbdHt16x16Param> {
+ public:
+  virtual ~VP10HighbdTrans16x16HT() {}
 
-    for (int i = 0; i < count_test_block; ++i) {
-      for (int j = 0; j < num_coeffs; ++j) {
-        input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
-      }
-      for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
-        vp10_fht16x16_c(input, output, stride, tx_type);
-      }
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);  // function under test
+    fwd_txfm_ref_ = highbd_fht16x16_ref;  // C reference wrapper
+    tx_type_  = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;  // low bit_depth_ bits set
+    num_coeffs_ = 256;  // 16 * 16 coefficients
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));  // 16-byte aligned
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);  // release the buffers allocated in SetUp()
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  HbdHtFunc fwd_txfm_;  // under test, from GET_PARAM(0)
+  HbdHtFunc fwd_txfm_ref_;  // reference implementation
+  int tx_type_;
+  int bit_depth_;
+  int mask_;  // (1 << bit_depth_) - 1
+  int num_coeffs_;  // elements per buffer
+  int16_t *input_;
+  int32_t *output_;  // written by fwd_txfm_
+  int32_t *output_ref_;  // written by fwd_txfm_ref_
+};
+
+void VP10HighbdTrans16x16HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 16;  // passed unchanged to both transforms
+  const int num_tests = 200000;  // random blocks per parameter set
+
+  for (i = 0; i < num_tests; ++i) {  // one random input block per pass
+    for (j = 0; j < num_coeffs_; ++j) {  // each sample in [-mask_, mask_]
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
     }
 
-    delete[] input;
-    delete[] output;
-}
-#endif  // CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_, output_, stride, tx_type_,
+                                       bit_depth_));  // function under test
 
-#if HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
-TEST(VP10Trans16x16HTSpeedTest, SSE2_version) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 20000;
-    int bit_depth = 8;
-    int mask = (1 << bit_depth) - 1;
-    const int num_coeffs = 256;
-    int16_t *input = reinterpret_cast<int16_t *>
-        (vpx_memalign(16, sizeof(int16_t) * num_coeffs));
-    tran_low_t *output = reinterpret_cast<tran_low_t *>
-        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs));
-
-    const int stride = 16;
-    int tx_type;
-
-    for (int i = 0; i < count_test_block; ++i) {
-      for (int j = 0; j < num_coeffs; ++j) {
-        input[j] = (rnd.Rand8() & mask) - (rnd.Rand8() & mask);
-      }
-      for (tx_type = V_DCT; tx_type <= H_FLIPADST; ++tx_type) {
-        vp10_fht16x16_sse2(input, output, stride, tx_type);
-      }
+    for (j = 0; j < num_coeffs_; ++j) {  // compare every coefficient
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j
+          << " at test block: " << i;
     }
-
-    vpx_free(input);
-    vpx_free(output);
+  }
 }
-#endif  // HAVE_SSE2 && CONFIG_EXT_TX && !CONFIG_VP9_HIGHBITDEPTH
+
+TEST_P(VP10HighbdTrans16x16HT, HighbdCoeffCheck) {
+  RunBitexactCheck();  // output must equal the reference exactly
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 using std::tr1::make_tuple;
 
@@ -167,4 +198,20 @@
     ::testing::ValuesIn(kArrayHt16x16Param_sse2));
 #endif  // HAVE_SSE2
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 0, 10),  // fn, tx_type, bd
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 0, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 1, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 1, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12)
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdTrans16x16HT,  // one test per tuple listed above
+    ::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
 }  // namespace