Add round_shift_16bit_ssse3 for inv txfm
Change-Id: I6b3ae6fc4b5694933ab5103470dc0fa7bf6e4a35
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.c b/av1/common/x86/av1_inv_txfm_ssse3.c
index 2ff1e33..25cac55 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.c
+++ b/av1/common/x86/av1_inv_txfm_ssse3.c
@@ -1953,7 +1953,7 @@
transpose_16bit_4x4(buf, buf);
}
col_txfm(buf, buf, cos_bit_col);
- round_shift_16bit(buf, txfm_size_row, shift[1]);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
@@ -2026,7 +2026,7 @@
round_shift_ssse3(buf0, buf0, input_stride); // rect special code
}
row_txfm(buf0, buf0, cos_bit_row);
- round_shift_16bit(buf0, txfm_size_col, shift[0]);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
__m128i *_buf1 = buf1 + i * 8;
if (lr_flip) {
for (int j = 0; j < buf_size_w_div8; ++j) {
@@ -2043,7 +2043,7 @@
}
for (int i = 0; i < buf_size_w_div8; i++) {
col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
- round_shift_16bit(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+ round_shift_16bit_ssse3(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
}
if (txfm_size_col >= 16) {
@@ -2134,7 +2134,7 @@
round_shift_ssse3(buf0, buf0, input_stride); // rect special code
}
row_txfm(buf0, buf0, cos_bit_row);
- round_shift_16bit(buf0, txfm_size_col, shift[0]);
+ round_shift_16bit_ssse3(buf0, txfm_size_col, shift[0]);
__m128i *_buf1 = buf1;
if (lr_flip) {
for (int j = 0; j < buf_size_w_div8; ++j) {
@@ -2213,7 +2213,7 @@
transpose_16bit_4x8(buf, buf);
round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
row_txfm(buf, buf, cos_bit_row);
- // round_shift_16bit(buf, txfm_size_col, shift[0]);// shift[0] is 0
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);// shift[0] is 0
if (lr_flip) {
__m128i temp[4];
flip_buf_sse2(buf, temp, txfm_size_col);
@@ -2222,7 +2222,7 @@
transpose_16bit_8x4(buf, buf);
}
col_txfm(buf, buf, cos_bit_col);
- round_shift_16bit(buf, txfm_size_row, shift[1]);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
@@ -2252,7 +2252,7 @@
transpose_16bit_8x4(buf, buf);
round_shift_ssse3(buf, buf, txfm_size_col); // rect special code
row_txfm(buf, buf, cos_bit_row);
- // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ // round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]); // shift[0] is 0
if (lr_flip) {
__m128i temp[8];
flip_buf_sse2(buf, temp, txfm_size_col);
@@ -2261,7 +2261,7 @@
transpose_16bit_4x8(buf, buf);
}
col_txfm(buf, buf, cos_bit_col);
- round_shift_16bit(buf, txfm_size_row, shift[1]);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
@@ -2296,7 +2296,7 @@
row_one_loop);
transpose_16bit_4x8(buf_cur, buf_cur);
row_txfm(buf_cur, buf_cur, cos_bit_row);
- round_shift_16bit(buf_cur, row_one_loop, shift[0]);
+ round_shift_16bit_ssse3(buf_cur, row_one_loop, shift[0]);
if (lr_flip) {
__m128i temp[8];
flip_buf_sse2(buf_cur, temp, txfm_size_col);
@@ -2306,7 +2306,7 @@
}
}
col_txfm(buf, buf, cos_bit_col);
- round_shift_16bit(buf, txfm_size_row, shift[1]);
+ round_shift_16bit_ssse3(buf, txfm_size_row, shift[1]);
lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
}
@@ -2342,7 +2342,7 @@
transpose_16bit_8x4(buf_cur, buf_cur);
}
row_txfm(buf, buf, cos_bit_row);
- round_shift_16bit(buf, txfm_size_col, shift[0]);
+ round_shift_16bit_ssse3(buf, txfm_size_col, shift[0]);
if (lr_flip) {
__m128i temp[16];
flip_buf_sse2(buf, temp, 16);
@@ -2354,7 +2354,7 @@
}
for (int i = 0; i < buf_size_w_div8; i++) {
col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
- round_shift_16bit(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ round_shift_16bit_ssse3(buf + i * row_one_loop, txfm_size_row, shift[1]);
}
lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
diff --git a/av1/common/x86/av1_inv_txfm_ssse3.h b/av1/common/x86/av1_inv_txfm_ssse3.h
index df7b7f9..96dc0d6 100644
--- a/av1/common/x86/av1_inv_txfm_ssse3.h
+++ b/av1/common/x86/av1_inv_txfm_ssse3.h
@@ -53,6 +53,19 @@
out1 = _mm_subs_epi16(_in0, _in1); \
} while (0)
+static INLINE void round_shift_16bit_ssse3(__m128i *in, int size, int bit) {  // shifts in[0..size) in place; bit == 0 leaves the data untouched
+ if (bit < 0) {  // negative bit: rounding right shift by -bit on each 16-bit lane
+ const __m128i scale = _mm_set1_epi16(1 << (15 + bit));  // 2^(15+bit) in every lane; shift count 15 + bit must be non-negative, i.e. bit >= -15
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_mulhrs_epi16(in[i], scale);  // (x * scale + 2^14) >> 15 == (x + 2^(-bit-1)) >> -bit
+ }
+ } else if (bit > 0) {  // positive bit: plain left shift, no rounding and no saturation
+ for (int i = 0; i < size; ++i) {
+ in[i] = _mm_slli_epi16(in[i], bit);  // bits shifted past the top of each 16-bit lane are dropped
+ }
+ }
+}
+
#ifdef __cplusplus
extern "C" {
#endif