Optimize highbd fwd_txfm modules

Added sse4_1 variant for identity tx_type of 4,8,16.

Enabled sse4_1 optimizations for tx_sizes 4x4, 4x8,
8x4, 8x8, 4x16, 16x4, 8x16, 16x8, 16x16.

Module level gains:
Tx_size    Gain w.r.t. C
4x4         7.14x
4x8         4.42x
4x16        4.25x
8x4         5.00x
8x8         4.57x
8x16        3.40x
16x16       3.00x
16x4        4.80x
16x8        3.28x

When tested for 20 frames of crowd_run_360p_10 at 1 mbps
for speed=1 preset, observed ~2.5% reduction in encoder time.

Change-Id: I4f2b2405a4becb02a07d991b491a01926892ec44
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 1b0e90f..35c4dd7 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -87,63 +87,21 @@
     av1_highbd_fwht4x4(src_diff, coeff, diff_stride);
     return;
   }
-  switch (tx_type) {
-    // use the c version for anything including identity for now
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_4x4_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-    default:
-      av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-  }
+  av1_fwd_txfm2d_4x4(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }
 
 static void highbd_fwd_txfm_4x8(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  switch (txfm_param->tx_type) {
-    case IDTX:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      av1_fwd_txfm2d_4x8_c(src_diff, dst_coeff, diff_stride,
-                           txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
-                         txfm_param->bd);
-      break;
-  }
+  av1_fwd_txfm2d_4x8(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                     txfm_param->bd);
 }
 
 static void highbd_fwd_txfm_8x4(const int16_t *src_diff, tran_low_t *coeff,
                                 int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  switch (txfm_param->tx_type) {
-    case IDTX:
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-      av1_fwd_txfm2d_8x4_c(src_diff, dst_coeff, diff_stride,
-                           txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
-                         txfm_param->bd);
-      break;
-  }
+  av1_fwd_txfm2d_8x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                     txfm_param->bd);
 }
 
 static void highbd_fwd_txfm_8x16(const int16_t *src_diff, tran_low_t *coeff,
@@ -151,21 +109,7 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-      // use the c version for anything including identity for now
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_8x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-    default:
-      av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-  }
+  av1_fwd_txfm2d_8x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }
 
 static void highbd_fwd_txfm_16x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -173,21 +117,7 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-      // use the c version for anything including identity for now
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_16x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-    default:
-      av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-  }
+  av1_fwd_txfm2d_16x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }
 
 static void highbd_fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
@@ -225,43 +155,15 @@
 static void highbd_fwd_txfm_16x4(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  switch (txfm_param->tx_type) {
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_16x4_c(src_diff, dst_coeff, diff_stride,
-                            txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
-                          txfm_param->bd);
-      break;
-  }
+  av1_fwd_txfm2d_16x4(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                      txfm_param->bd);
 }
 
 static void highbd_fwd_txfm_4x16(const int16_t *src_diff, tran_low_t *coeff,
                                  int diff_stride, TxfmParam *txfm_param) {
   int32_t *dst_coeff = (int32_t *)coeff;
-  switch (txfm_param->tx_type) {
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_4x16_c(src_diff, dst_coeff, diff_stride,
-                            txfm_param->tx_type, txfm_param->bd);
-      break;
-    default:
-      av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
-                          txfm_param->bd);
-      break;
-  }
+  av1_fwd_txfm2d_4x16(src_diff, dst_coeff, diff_stride, txfm_param->tx_type,
+                      txfm_param->bd);
 }
 
 static void highbd_fwd_txfm_32x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -301,21 +203,7 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-    // use the c version for anything including identity for now
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_8x8_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-    default:
-      av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-  }
+  av1_fwd_txfm2d_8x8(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }
 
 static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
@@ -323,21 +211,7 @@
   int32_t *dst_coeff = (int32_t *)coeff;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int bd = txfm_param->bd;
-  switch (tx_type) {
-    // use the c version for anything including identity for now
-    case V_DCT:
-    case H_DCT:
-    case V_ADST:
-    case H_ADST:
-    case V_FLIPADST:
-    case H_FLIPADST:
-    case IDTX:
-      av1_fwd_txfm2d_16x16_c(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-    default:
-      av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
-      break;
-  }
+  av1_fwd_txfm2d_16x16(src_diff, dst_coeff, diff_stride, tx_type, bd);
 }
 
 static void highbd_fwd_txfm_32x32(const int16_t *src_diff, tran_low_t *coeff,
diff --git a/av1/encoder/x86/highbd_fwd_txfm_sse4.c b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
index 9f28c4a..bef8abc 100644
--- a/av1/encoder/x86/highbd_fwd_txfm_sse4.c
+++ b/av1/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -188,7 +188,30 @@
   out[2] = _mm_unpacklo_epi64(v1, v3);
   out[3] = _mm_unpackhi_epi64(v1, v3);
 }
+static void idtx4x4_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+  __m128i fact = _mm_set1_epi32(NewSqrt2);
+  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m128i a_low;
+  __m128i v[4];
 
+  for (int i = 0; i < 4; i++) {
+    a_low = _mm_mullo_epi32(in[i * col_num], fact);
+    a_low = _mm_add_epi32(a_low, offset);
+    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+  }
+
+  // Transpose for 4x4
+  v[0] = _mm_unpacklo_epi32(out[0], out[1]);
+  v[1] = _mm_unpackhi_epi32(out[0], out[1]);
+  v[2] = _mm_unpacklo_epi32(out[2], out[3]);
+  v[3] = _mm_unpackhi_epi32(out[2], out[3]);
+
+  out[0] = _mm_unpacklo_epi64(v[0], v[2]);
+  out[1] = _mm_unpackhi_epi64(v[0], v[2]);
+  out[2] = _mm_unpacklo_epi64(v[1], v[3]);
+  out[3] = _mm_unpackhi_epi64(v[1], v[3]);
+}
 void av1_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
                                int input_stride, TX_TYPE tx_type, int bd) {
   __m128i in[4];
@@ -251,6 +274,48 @@
       fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
       write_buffer_4x4(in, coeff);
       break;
+    case IDTX:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fdct4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, in, input_stride, 0, 0, shift[0]);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, fwd_cos_bit_col[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 1, 0, shift[0]);
+      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, in, input_stride, 0, 1, shift[0]);
+      idtx4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      fadst4x4_sse4_1(in, in, fwd_cos_bit_row[txw_idx][txh_idx], 1);
+      write_buffer_4x4(in, coeff);
+      break;
     default: assert(0);
   }
   (void)bd;
@@ -748,7 +813,20 @@
     out[col_num * 7 + col] = v0;
   }
 }
+static void idtx8x8_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
 
+  for (int i = 0; i < col_num; i += 1) {
+    out[0 + 8 * i] = _mm_add_epi32(in[0 + 8 * i], in[0 + 8 * i]);
+    out[1 + 8 * i] = _mm_add_epi32(in[1 + 8 * i], in[1 + 8 * i]);
+    out[2 + 8 * i] = _mm_add_epi32(in[2 + 8 * i], in[2 + 8 * i]);
+    out[3 + 8 * i] = _mm_add_epi32(in[3 + 8 * i], in[3 + 8 * i]);
+    out[4 + 8 * i] = _mm_add_epi32(in[4 + 8 * i], in[4 + 8 * i]);
+    out[5 + 8 * i] = _mm_add_epi32(in[5 + 8 * i], in[5 + 8 * i]);
+    out[6 + 8 * i] = _mm_add_epi32(in[6 + 8 * i], in[6 + 8 * i]);
+    out[7 + 8 * i] = _mm_add_epi32(in[7 + 8 * i], in[7 + 8 * i]);
+  }
+}
 void av1_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff, int stride,
                                TX_TYPE tx_type, int bd) {
   __m128i in[16], out[16];
@@ -838,6 +916,69 @@
       transpose_8x8(out, in);
       write_buffer_8x8(in, coeff);
       break;
+    case IDTX:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0, shift[0]);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 0, shift[0]);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1, shift[0]);
+      idtx8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      col_txfm_8x8_rounding(out, -shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], 2);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
     default: assert(0);
   }
   (void)bd;
@@ -1579,7 +1720,19 @@
   output += size_8x8;
   write_buffer_8x8(&in[48], output);
 }
+static void idtx16x16_sse4_1(__m128i *in, __m128i *out, int bit, int col_num) {
+  (void)bit;
+  __m128i fact = _mm_set1_epi32(2 * NewSqrt2);
+  __m128i offset = _mm_set1_epi32(1 << (NewSqrt2Bits - 1));
+  __m128i a_low;
 
+  int num_iters = 16 * col_num;
+  for (int i = 0; i < num_iters; i++) {
+    a_low = _mm_mullo_epi32(in[i], fact);
+    a_low = _mm_add_epi32(a_low, offset);
+    out[i] = _mm_srai_epi32(a_low, NewSqrt2Bits);
+  }
+}
 void av1_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
                                  int stride, TX_TYPE tx_type, int bd) {
   __m128i in[64], out[64];
@@ -1669,6 +1822,69 @@
       transpose_16x16(out, in);
       write_buffer_16x16(in, coeff);
       break;
+    case IDTX:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      fdct16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, in, stride, 0, 0, shift[0]);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case V_FLIPADST:
+      load_buffer_16x16(input, in, stride, 1, 0, shift[0]);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, in, stride, 0, 1, shift[0]);
+      idtx16x16_sse4_1(in, out, fwd_cos_bit_col[txw_idx][txh_idx], col_num);
+      col_txfm_16x16_rounding(out, -shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, fwd_cos_bit_row[txw_idx][txh_idx], col_num);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
     default: assert(0);
   }
   (void)bd;
@@ -1689,13 +1905,13 @@
   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
   fadst8x8_sse4_1,  // ADST_FLIPADST
   fadst8x8_sse4_1,  // FLIPADST_ADST
-  NULL,             // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  idtx8x8_sse4_1,   // IDTX
+  fdct8x8_sse4_1,   // V_DCT
+  idtx8x8_sse4_1,   // H_DCT
+  fadst8x8_sse4_1,  // V_ADST
+  idtx8x8_sse4_1,   // H_ADST
+  fadst8x8_sse4_1,  // V_FLIPADST
+  idtx8x8_sse4_1    // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x8_arr[TX_TYPES] = {
@@ -1708,13 +1924,13 @@
   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
   fadst8x8_sse4_1,  // ADST_FLIPADST
   fadst8x8_sse4_1,  // FLIPADST_ADST
-  NULL,             // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  idtx8x8_sse4_1,   // IDTX
+  fdct4x8_sse4_1,   // V_DCT
+  idtx8x8_sse4_1,   // H_DCT
+  fadst8x8_sse4_1,  // V_ADST
+  idtx8x8_sse4_1,   // H_ADST
+  fadst8x8_sse4_1,  // V_FLIPADST
+  idtx8x8_sse4_1    // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x16_arr[TX_TYPES] = {
@@ -1727,13 +1943,13 @@
   fadst16x16_sse4_1,  // FLIPADST_FLIPADST
   fadst16x16_sse4_1,  // ADST_FLIPADST
   fadst16x16_sse4_1,  // FLIPADST_ADST
-  NULL,               // IDTX
-  NULL,               // V_DCT
-  NULL,               // H_DCT
-  NULL,               // V_ADST
-  NULL,               // H_ADST
-  NULL,               // V_FLIPADST
-  NULL                // H_FLIPADST
+  idtx16x16_sse4_1,   // IDTX
+  idtx16x16_sse4_1,   // V_DCT
+  fdct16x16_sse4_1,   // H_DCT
+  idtx16x16_sse4_1,   // V_ADST
+  fadst16x16_sse4_1,  // H_ADST
+  idtx16x16_sse4_1,   // V_FLIPADST
+  fadst16x16_sse4_1   // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x16_arr[TX_TYPES] = {
@@ -1746,13 +1962,13 @@
   fadst16x16_sse4_1,  // FLIPADST_FLIPADST
   fadst16x16_sse4_1,  // ADST_FLIPADST
   fadst16x16_sse4_1,  // FLIPADST_ADST
-  NULL,               // IDTX
-  NULL,               // V_DCT
-  NULL,               // H_DCT
-  NULL,               // V_ADST
-  NULL,               // H_ADST
-  NULL,               // V_FLIPADST
-  NULL                // H_FLIPADST
+  idtx16x16_sse4_1,   // IDTX
+  fdct16x16_sse4_1,   // V_DCT
+  idtx16x16_sse4_1,   // H_DCT
+  fadst16x16_sse4_1,  // V_ADST
+  idtx16x16_sse4_1,   // H_ADST
+  fadst16x16_sse4_1,  // V_FLIPADST
+  idtx16x16_sse4_1    // H_FLIPADST
 };
 static const fwd_transform_1d_sse4_1 row_highbd_txfm8x8_arr[TX_TYPES] = {
   fdct8x8_sse4_1,   // DCT_DCT
@@ -1764,13 +1980,13 @@
   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
   fadst8x8_sse4_1,  // ADST_FLIPADST
   fadst8x8_sse4_1,  // FLIPADST_ADST
-  NULL,             // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  idtx8x8_sse4_1,   // IDTX
+  idtx8x8_sse4_1,   // V_DCT
+  fdct8x8_sse4_1,   // H_DCT
+  idtx8x8_sse4_1,   // V_ADST
+  fadst8x8_sse4_1,  // H_ADST
+  idtx8x8_sse4_1,   // V_FLIPADST
+  fadst8x8_sse4_1   // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x8_arr[TX_TYPES] = {
@@ -1783,13 +1999,13 @@
   fadst8x8_sse4_1,  // FLIPADST_FLIPADST
   fadst8x8_sse4_1,  // ADST_FLIPADST
   fadst8x8_sse4_1,  // FLIPADST_ADST
-  NULL,             // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  idtx8x8_sse4_1,   // IDTX
+  idtx8x8_sse4_1,   // V_DCT
+  fdct4x8_sse4_1,   // H_DCT
+  idtx8x8_sse4_1,   // V_ADST
+  fadst8x8_sse4_1,  // H_ADST
+  idtx8x8_sse4_1,   // V_FLIPADST
+  fadst8x8_sse4_1   // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 row_highbd_txfm4x4_arr[TX_TYPES] = {
@@ -1802,13 +2018,13 @@
   fadst4x4_sse4_1,  // FLIPADST_FLIPADST
   fadst4x4_sse4_1,  // ADST_FLIPADST
   fadst4x4_sse4_1,  // FLIPADST_ADST
-  NULL,             // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  idtx4x4_sse4_1,   // IDTX
+  idtx4x4_sse4_1,   // V_DCT
+  fdct4x4_sse4_1,   // H_DCT
+  idtx4x4_sse4_1,   // V_ADST
+  fadst4x4_sse4_1,  // H_ADST
+  idtx4x4_sse4_1,   // V_FLIPADST
+  fadst4x4_sse4_1   // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 col_highbd_txfm4x4_arr[TX_TYPES] = {
@@ -1821,13 +2037,13 @@
   fadst4x4_sse4_1,  // FLIPADST_FLIPADST
   fadst4x4_sse4_1,  // ADST_FLIPADST
   fadst4x4_sse4_1,  // FLIPADST_ADST
-  NULL,             // IDTX
-  NULL,             // V_DCT
-  NULL,             // H_DCT
-  NULL,             // V_ADST
-  NULL,             // H_ADST
-  NULL,             // V_FLIPADST
-  NULL              // H_FLIPADST
+  idtx4x4_sse4_1,   // IDTX
+  fdct4x4_sse4_1,   // V_DCT
+  idtx4x4_sse4_1,   // H_DCT
+  fadst4x4_sse4_1,  // V_ADST
+  idtx4x4_sse4_1,   // H_ADST
+  fadst4x4_sse4_1,  // V_FLIPADST
+  idtx4x4_sse4_1    // H_FLIPADST
 };
 
 static const fwd_transform_1d_sse4_1 col_highbd_txfm8x32_arr[TX_TYPES] = {