Add round_shift_16bit_ssse3 for inv txfm

Change-Id: I6b3ae6fc4b5694933ab5103470dc0fa7bf6e4a35
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c
index 2ff1e33..25cac55 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -1953,7 +1953,7 @@
     transpose_16bit_4x4(buf, buf);
   }
   col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
 
@@ -2026,7 +2026,7 @@
       round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
     }
     row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit(buf0, txfm_size_col, shift[0]);
+    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
     __m128i *_buf1 = buf1 + i * 8;
     if (lr_flip) {
       for (int j = 0; j < buf_size_w_div8; ++j) {
@@ -2043,7 +2043,7 @@
   }
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
-    round_shift_16bit(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+    round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
   }
 
   if (txfm_size_col >= 16) {
@@ -2134,7 +2134,7 @@
       round_shift_ssse3(buf0, buf0, input_stride);  // rect special code
     }
     row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit(buf0, txfm_size_col, shift[0]);
+    round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
     __m128i *_buf1 = buf1;
     if (lr_flip) {
       for (int j = 0; j < buf_size_w_div8; ++j) {
@@ -2213,7 +2213,7 @@
   transpose_16bit_4x8(buf, buf);
   round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
   row_txfm(buf, buf, cos_bit_row);
-  // round_shift_16bit(buf, txfm_size_col, shift[0]);// shift[0] is 0
+  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
   if (lr_flip) {
     __m128i temp[4];
     flip_buf_sse2(buf, temp, txfm_size_col);
@@ -2222,7 +2222,7 @@
     transpose_16bit_8x4(buf, buf);
   }
   col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
 
@@ -2252,7 +2252,7 @@
   transpose_16bit_8x4(buf, buf);
   round_shift_ssse3(buf, buf, txfm_size_col);  // rect special code
   row_txfm(buf, buf, cos_bit_row);
-  // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0
+  // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
   if (lr_flip) {
     __m128i temp[8];
     flip_buf_sse2(buf, temp, txfm_size_col);
@@ -2261,7 +2261,7 @@
     transpose_16bit_4x8(buf, buf);
   }
   col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
 
@@ -2296,7 +2296,7 @@
                                   row_one_loop);
     transpose_16bit_4x8(buf_cur, buf_cur);
     row_txfm(buf_cur, buf_cur, cos_bit_row);
-    round_shift_16bit(buf_cur, row_one_loop, shift[0]);
+    round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
     if (lr_flip) {
       __m128i temp[8];
       flip_buf_sse2(buf_cur, temp, txfm_size_col);
@@ -2306,7 +2306,7 @@
     }
   }
   col_txfm(buf, buf, cos_bit_col);
-  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
   lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
 }
 
@@ -2342,7 +2342,7 @@
     transpose_16bit_8x4(buf_cur, buf_cur);
   }
   row_txfm(buf, buf, cos_bit_row);
-  round_shift_16bit(buf, txfm_size_col, shift[0]);
+  round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
   if (lr_flip) {
     __m128i temp[16];
     flip_buf_sse2(buf, temp, 16);
@@ -2354,7 +2354,7 @@
   }
   for (int i = 0; i < buf_size_w_div8; i++) {
     col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
-    round_shift_16bit(buf + i * row_one_loop, txfm_size_row, shift[1]);
+    round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
   }
   lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
   lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.h b/av1/common/x86/av1_inv_txfm_ssse3.h
index df7b7f9..96dc0d6 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -53,6 +53,19 @@
     out1 = _mm_subs_epi16(_in0, _in1);                  \
   } while (0)
 
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {
+  if (bit < 0) {  // negative bit: rounding right shift by -bit, in place
+    const __m128i scale = _mm_set1_epi16(1 << (15 + bit));  // 2^(15+bit)
+    for (int i = 0; i < size; ++i) {  // visits in[0] .. in[size - 1]
+      in[i] = _mm_mulhrs_epi16(in[i], scale);  // (x*scale + 2^14) >> 15
+    }
+  } else if (bit > 0) {  // positive bit: plain left shift, no rounding
+    for (int i = 0; i < size; ++i) {
+      in[i] = _mm_slli_epi16(in[i], bit);  // per 16-bit lane
+    }
+  }  // bit == 0: neither branch runs, in[] is left unchanged
+}
+
 #ifdef __cplusplus
 extern "C" {
 #endif