Optimize highbd inv_txfm modules

Added SSE4_1 variants for vertical identity txfm types.

Module level gains:
Tx_size   Gain w.r.t. C
8x8         6.65x
8x16        6.15x
16x8        9.59x
16x16       10.07x

When tested for multiple test cases observed 1.1%
average reduction in encoder time for speed = 1 preset.

Change-Id: I586db59840646fa6830647996ea4d84c0e6713dc
diff --git a/av1/common/x86/highbd_inv_txfm_avx2.c b/av1/common/x86/highbd_inv_txfm_avx2.c
index 2fe5777..4de31b6 100644
--- a/av1/common/x86/highbd_inv_txfm_avx2.c
+++ b/av1/common/x86/highbd_inv_txfm_avx2.c
@@ -4322,13 +4322,13 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
       break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
     case V_DCT:
     case V_ADST:
     case V_FLIPADST:
@@ -4413,13 +4413,13 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
     case V_DCT:
     case V_ADST:
     case V_FLIPADST:
@@ -4482,13 +4482,13 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 txfm_param->tx_type, txfm_param->bd);
       break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
     case V_DCT:
     case V_ADST:
     case V_FLIPADST:
@@ -4513,13 +4513,13 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 txfm_param->tx_type, txfm_param->bd);
       break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
     case V_DCT:
     case V_ADST:
     case V_FLIPADST:
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index f546adb..b38bbe9 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -5148,13 +5148,13 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_8x8_c(src, CONVERT_TO_SHORTPTR(dest), stride, tx_type,
                                bd);
       break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
     case V_DCT:
     case V_ADST:
     case V_FLIPADST:
@@ -5178,9 +5178,6 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_16x8_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 txfm_param->tx_type, txfm_param->bd);
@@ -5202,9 +5199,6 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_8x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                 txfm_param->tx_type, txfm_param->bd);
@@ -5226,9 +5220,6 @@
   switch (tx_type) {
       // Assembly version doesn't support some transform types, so use C version
       // for those.
-    case H_DCT:
-    case H_ADST:
-    case H_FLIPADST:
     case IDTX:
       av1_inv_txfm2d_add_16x16_c(src, CONVERT_TO_SHORTPTR(dest), stride,
                                  tx_type, bd);
@@ -5464,6 +5455,85 @@
                                    stride, ud_flip, txfm_size_row, bd);
   }
 }
+static void highbd_inv_txfm2d_add_v_identity_ssse41(const int32_t *input,
+                                                    uint16_t *output,
+                                                    int stride, TX_TYPE tx_type,
+                                                    TX_SIZE tx_size, int eob,
+                                                    const int bd) {
+  __m128i buf1[64];
+  int eobx, eoby;
+  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int input_stride = AOMMIN(32, txfm_size_col);
+  const int buf_size_w_div8 = input_stride >> 2;
+  const int row_max = AOMMIN(32, txfm_size_row);
+  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eobx];
+  const transform_1d_sse4_1 row_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx];
+  const transform_1d_sse4_1 col_txfm =
+      highbd_txfm_all_1d_zeros_w8_arr[txh_idx][vitx_1d_tab[tx_type]][0];
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+  for (int i = 0; i < (row_max >> 2); ++i) {
+    __m128i buf0[16];
+    const int32_t *input_row = input + i * input_stride * 4;
+    for (int j = 0; j < (buf_size_nonzero_w_div8 << 1); ++j) {
+      __m128i *buf0_cur = buf0 + j * 4;
+      load_buffer_32bit_input(input_row + j * 4, input_stride, buf0_cur, 4);
+
+      TRANSPOSE_4X4(buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3],
+                    buf0_cur[0], buf0_cur[1], buf0_cur[2], buf0_cur[3]);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      av1_round_shift_rect_array_32_sse4_1(
+          buf0, buf0, (buf_size_nonzero_w_div8 << 3), 0, NewInvSqrt2);
+    }
+    row_txfm(buf0, buf0, inv_cos_bit_row[txw_idx][txh_idx], 0, bd, -shift[0]);
+
+    __m128i *_buf1 = buf1 + i * 4;
+    if (lr_flip) {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(buf0[4 * j + 3], buf0[4 * j + 2], buf0[4 * j + 1],
+                      buf0[4 * j],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 0],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 1],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 2],
+                      _buf1[txfm_size_row * (buf_size_w_div8 - 1 - j) + 3]);
+      }
+    } else {
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        TRANSPOSE_4X4(
+            buf0[j * 4 + 0], buf0[j * 4 + 1], buf0[j * 4 + 2], buf0[j * 4 + 3],
+            _buf1[j * txfm_size_row + 0], _buf1[j * txfm_size_row + 1],
+            _buf1[j * txfm_size_row + 2], _buf1[j * txfm_size_row + 3]);
+      }
+    }
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row,
+             inv_cos_bit_col[txw_idx][txh_idx], 1, bd, 0);
+
+    av1_round_shift_array_32_sse4_1(buf1 + i * txfm_size_row,
+                                    buf1 + i * txfm_size_row, txfm_size_row,
+                                    -shift[1]);
+  }
+
+  // write to buffer
+  {
+    for (int i = 0; i < (txfm_size_col >> 3); i++) {
+      highbd_write_buffer_8xn_sse4_1(buf1 + i * txfm_size_row * 2,
+                                     output + 8 * i, stride, ud_flip,
+                                     txfm_size_row, bd);
+    }
+  }
+}
 static void highbd_inv_txfm2d_add_no_identity_sse41(const int32_t *input,
                                                     uint16_t *output,
                                                     int stride, TX_TYPE tx_type,
@@ -5802,6 +5872,13 @@
           input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
           bd);
       break;
+    case H_DCT:
+    case H_ADST:
+    case H_FLIPADST:
+      highbd_inv_txfm2d_add_v_identity_ssse41(
+          input, CONVERT_TO_SHORTPTR(output), stride, tx_type, tx_size, eob,
+          bd);
+      break;
     default: assert(0); break;
   }
 }