Integrate HBD row/column flip fwd txfm SSE4.1 optimization

- Integrate the 5 flip transform types (FLIPADST_DCT, DCT_FLIPADST,
  FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST) for 4x4, 8x8, and
  16x16 blocks, for the EXT_TX experiment; see the sketch below.
- Encoder speed improves by about 12%-15%.
- Update the unit tests to check bit-exact results against the C
  implementation.
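
The flip variants reuse the existing ADST/DCT SSE4.1 kernels; only the
buffer load changes, mirroring the residual block vertically and/or
horizontally before the column transform. Below is a minimal scalar
sketch of that load step, assuming the extra (1, 0) / (0, 1) arguments
to the load_buffer_NxN() helpers are flip-up-down / flip-left-right
flags; load_flipped() is a hypothetical name used only for illustration:

  static void load_flipped(const int16_t *src, int32_t *dst, int stride,
                           int size, int flipud, int fliplr, int shift) {
    int r, c;
    for (r = 0; r < size; ++r) {
      const int sr = flipud ? size - 1 - r : r;    // vertical mirror (FLIPADST columns)
      for (c = 0; c < size; ++c) {
        const int sc = fliplr ? size - 1 - c : c;  // horizontal mirror (FLIPADST rows)
        // cfg->shift[0] scaling; the SIMD paths presumably use _mm_slli_epi32 here.
        dst[r * size + c] = (int32_t)src[sr * stride + sc] * (1 << shift);
      }
    }
  }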

Change-Id: Idf27c87f1e516ca5b66c7b70142477a115404ccb
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
index 8833250..97f2c02 100644
--- a/test/vp10_fht16x16_test.cc
+++ b/test/vp10_fht16x16_test.cc
@@ -207,7 +207,19 @@
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 10),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
     make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
-    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12)
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VP10HighbdTrans16x16HT,
diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
index 5b81095..1309827 100644
--- a/test/vp10_fht4x4_test.cc
+++ b/test/vp10_fht4x4_test.cc
@@ -38,8 +38,10 @@
 typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                               int tx_type, int bd);
 typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
-                        int tx_type, int bd);
-// Target optimized function, tx_type, bit depth
+                           int tx_type, int bd);
+
+// HighbdHt4x4Param argument list:
+// <Target optimized function, tx_type, bit depth>
 typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
 
 void highbe_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
@@ -96,12 +98,12 @@
     mask_ = (1 << bit_depth_) - 1;
     num_coeffs_ = 16;
 
-    input_ = reinterpret_cast<int16_t *>
-       (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
-    output_ = reinterpret_cast<int32_t *>
-        (vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
-    output_ref_ = reinterpret_cast<int32_t *>
-        (vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
   }
 
   virtual void TearDown() {
@@ -197,9 +199,7 @@
 #endif  // HAVE_SSE2
 
 #if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, VP10HighbdTrans4x4HT,
-    ::testing::Values(
+const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
@@ -207,7 +207,25 @@
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
          make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
-         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12)));
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 12),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 10),
+         make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdTrans4x4HT,
+    ::testing::ValuesIn(kArrayHighbdHt4x4Param));
+
 #endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
 
 }  // namespace
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
index aadd77d..2c33939 100644
--- a/test/vp10_fht8x8_test.cc
+++ b/test/vp10_fht8x8_test.cc
@@ -207,7 +207,19 @@
     make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 10),
     make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 12),
     make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 10),
-    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12)
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
 };
 INSTANTIATE_TEST_CASE_P(
     SSE4_1, VP10HighbdTrans8x8HT,
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
index d876ef4..33bcab4 100644
--- a/vp10/encoder/hybrid_fwd_txfm.c
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -195,7 +195,7 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -211,7 +211,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -232,7 +231,7 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -249,7 +248,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
@@ -270,7 +268,7 @@
     case FLIPADST_FLIPADST:
     case ADST_FLIPADST:
     case FLIPADST_ADST:
-      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
       break;
     case V_DCT:
     case H_DCT:
@@ -287,7 +285,6 @@
 #endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
index ce9089e..2ad59cf 100644
--- a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -239,6 +239,43 @@
       fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
       write_buffer_4x4(in, coeff);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }
@@ -960,6 +997,58 @@
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }
@@ -1806,6 +1895,58 @@
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
   }