Add inv txfm2d sse2 for tx sizes with dimension 4
Implement av1_lowbd_inv_txfm2d_add_4x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x8_sse2
Implement av1_lowbd_inv_txfm2d_add_8x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x16_sse2
Implement av1_lowbd_inv_txfm2d_add_16x4_sse2
A brief speed test shows that, with the SSE2 functions completed by
this CL, the speed-1 lowbitdepth encoder speeds up by more than 9%
and the lowbitdepth decoder by more than 25%, compared to the
highbitdepth implementation in the baseline.
Change-Id: I0576a2a146c0b1a7b483c9d35c3d21d979e263cd
diff --git a/aom_dsp/x86/transpose_sse2.h b/aom_dsp/x86/transpose_sse2.h
index 445eb01..5edfa71 100644
--- a/aom_dsp/x86/transpose_sse2.h
+++ b/aom_dsp/x86/transpose_sse2.h
@@ -107,10 +107,14 @@
const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
// Unpack 32 bit elements resulting in:
- // out[0]: 00 10 20 30 01 11 21 31
- // out[1]: 02 12 22 32 03 13 23 33
+ // out[0]: 00 10 20 30
+ // out[1]: 01 11 21 31
+ // out[2]: 02 12 22 32
+ // out[3]: 03 13 23 33
out[0] = _mm_unpacklo_epi32(a0, a1);
- out[1] = _mm_unpackhi_epi32(a0, a1);
+ out[1] = _mm_srli_si128(out[0], 8);
+ out[2] = _mm_unpackhi_epi32(a0, a1);
+ out[3] = _mm_srli_si128(out[2], 8);
}
static INLINE void transpose_16bit_4x8(const __m128i *const in,
@@ -155,6 +159,54 @@
out[3] = _mm_unpackhi_epi64(b2, b3);
}
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+ __m128i *const out) {
+ // Unpack 16 bit elements. Goes from:
+ // in[0]: 00 01 02 03 04 05 06 07
+ // in[1]: 10 11 12 13 14 15 16 17
+ // in[2]: 20 21 22 23 24 25 26 27
+ // in[3]: 30 31 32 33 34 35 36 37
+ // to:
+ // a0: 00 10 01 11 02 12 03 13
+ // a1: 20 30 21 31 22 32 23 33
+ // a4: 04 14 05 15 06 16 07 17
+ // a5: 24 34 25 35 26 36 27 37
+ const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+ // Unpack 32 bit elements resulting in:
+ // b0: 00 10 20 30 01 11 21 31
+ // b2: 04 14 24 34 05 15 25 35
+ // b4: 02 12 22 32 03 13 23 33
+ // b6: 06 16 26 36 07 17 27 37
+ const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+ const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+ const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+ const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+ // Unpack 64 bit elements resulting in:
+ // out[0]: 00 10 20 30 XX XX XX XX
+ // out[1]: 01 11 21 31 XX XX XX XX
+ // out[2]: 02 12 22 32 XX XX XX XX
+ // out[3]: 03 13 23 33 XX XX XX XX
+ // out[4]: 04 14 24 34 XX XX XX XX
+ // out[5]: 05 15 25 35 XX XX XX XX
+ // out[6]: 06 16 26 36 XX XX XX XX
+ // out[7]: 07 17 27 37 XX XX XX XX
+ const __m128i zeros = _mm_setzero_si128();
+ out[0] = _mm_unpacklo_epi64(b0, zeros);
+ out[1] = _mm_unpackhi_epi64(b0, zeros);
+ out[2] = _mm_unpacklo_epi64(b4, zeros);
+ out[3] = _mm_unpackhi_epi64(b4, zeros);
+ out[4] = _mm_unpacklo_epi64(b2, zeros);
+ out[5] = _mm_unpackhi_epi64(b2, zeros);
+ out[6] = _mm_unpacklo_epi64(b6, zeros);
+ out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
static INLINE void transpose_16bit_8x8(const __m128i *const in,
__m128i *const out) {
// Unpack 16 bit elements. Goes from:
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
index 50e0c4b..f09f269 100644
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ b/av1/common/x86/av1_inv_txfm_sse2.c
@@ -1327,6 +1327,72 @@
output[32] = _mm_subs_epi16(x10[31], x10[32]);
}
+void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
+ const int32_t *sinpi = sinpi_arr(cos_bit);
+ const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+ const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+ const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+ const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+ const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+ const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+ const __m128i sinpi_0_p02 = pair_set_epi16(0, sinpi[2]);
+ const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+ __m128i x0[4];
+ x0[0] = input[0];
+ x0[1] = input[1];
+ x0[2] = input[2];
+ x0[3] = input[3];
+
+ __m128i u[4];
+ u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+ u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
+ u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
+ u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
+
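+ // _mm_madd_epi16 with a pair_set_epi16(a, b) constant multiplies the even
+ // lanes of the interleaved vectors above by a and the odd lanes by b, so
+ // each product below is a*x0 + b*x2 (u[0]/u[1]) or a*x1 + b*x3 (u[2]/u[3]).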
+ __m128i x1[16];
+ x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04); // x0*sin1 + x2*sin4
+ x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
+ x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01); // x0*sin2 - x2*sin1
+ x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
+ x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02); // x1*sin3 + x3*sin2
+ x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
+ x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04); // x1*sin3 - x3*sin4
+ x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
+ x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03); // x0*sin3 - x2*sin3
+ x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+ x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03); // x3*sin3
+ x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+ x1[12] = _mm_madd_epi16(u[2], sinpi_0_p02); // x3*sin2
+ x1[13] = _mm_madd_epi16(u[3], sinpi_0_p02);
+ x1[14] = _mm_madd_epi16(u[2], sinpi_p03_p04); // x1*sin3 + x3*sin4
+ x1[15] = _mm_madd_epi16(u[3], sinpi_p03_p04);
+
+ __m128i x2[8];
+ x2[0] = _mm_add_epi32(x1[0], x1[4]); // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+ x2[1] = _mm_add_epi32(x1[1], x1[5]);
+ x2[2] = _mm_add_epi32(x1[2], x1[6]); // x0*sin2 - x2*sin1 +x1*sin3 - x3*sin4
+ x2[3] = _mm_add_epi32(x1[3], x1[7]);
+ x2[4] = _mm_add_epi32(x1[8], x1[10]); // x0*sin3 - x2*sin3 +x3*sin3
+ x2[5] = _mm_add_epi32(x1[9], x1[11]);
+ x2[6] = _mm_add_epi32(x1[0], x1[2]); // x0*sin1 + x2*sin4 + x0*sin2 - x2*sin1
+ x2[7] = _mm_add_epi32(x1[1], x1[3]);
+ x2[6] = _mm_add_epi32(
+ x2[6], x1[12]); // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1
+ x2[7] = _mm_add_epi32(x2[7], x1[13]);
+ // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1 - x1*sin3 - x3*sin4
+ x2[6] = _mm_sub_epi32(x2[6], x1[14]);
+ x2[7] = _mm_sub_epi32(x2[7], x1[15]);
+
+ const __m128i rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+ for (int i = 0; i < 4; ++i) {
+ __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+ __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+ out0 = _mm_srai_epi32(out0, cos_bit);
+ out1 = _mm_srai_epi32(out1, cos_bit);
+ output[i] = _mm_packs_epi32(out0, out1);
+ }
+}
+
void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
const int32_t *cospi = cospi_arr(cos_bit);
const __m128i __zero = _mm_setzero_si128();
@@ -1599,6 +1665,24 @@
output[15] = _mm_subs_epi16(__zero, x8[1]);
}
+static void iidentity4_new_sse2(const __m128i *input, __m128i *output,
+ int8_t cos_bit) {
+ (void)cos_bit;
+ const __m128i scale = _mm_set1_epi16(NewSqrt2);
+ const __m128i rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+ const __m128i one = _mm_set1_epi16(1);
+ const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
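+ // Interleaving the input with 1 and taking _mm_madd_epi16 against
+ // (NewSqrt2, 1 << (NewSqrt2Bits - 1)) computes input * NewSqrt2 + rounding
+ // in 32 bits before the shift back down to 16 bits.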
+ for (int i = 0; i < 4; ++i) {
+ __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+ __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+ __m128i b_lo = _mm_madd_epi16(a_lo, scale_rounding);
+ __m128i b_hi = _mm_madd_epi16(a_hi, scale_rounding);
+ __m128i c_lo = _mm_srai_epi32(b_lo, NewSqrt2Bits);
+ __m128i c_hi = _mm_srai_epi32(b_hi, NewSqrt2Bits);
+ output[i] = _mm_packs_epi32(c_lo, c_hi);
+ }
+}
+
static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
int8_t cos_bit) {
(void)cos_bit;
@@ -1661,6 +1745,20 @@
return _mm_packus_epi16(x0, x0);
}
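+// Add the 16-bit rows in |in| to 4-pixel-wide rows of |output| with unsigned
+// saturation; when flipud is set, |in| is read bottom-up.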
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+ int stride, int flipud,
+ const int height) {
+ int j = flipud ? (height - 1) : 0;
+ const int step = flipud ? -1 : 1;
+ const __m128i zero = _mm_setzero_si128();
+ for (int i = 0; i < height; ++i, j += step) {
+ const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+ __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+ u = _mm_packus_epi16(u, zero);
+ *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+ }
+}
+
static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
int stride, int flipud,
const int height) {
@@ -1674,7 +1772,7 @@
}
static const transform_1d_sse2 lowbd_txfm_all_1d_arr[TX_SIZES][TX_TYPES_1D] = {
- { NULL, NULL, NULL, NULL },
+ { idct4_new_sse2, iadst4_new_sse2, iadst4_new_sse2, iidentity4_new_sse2 },
{ idct8_new_sse2, iadst8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
{ idct16_new_sse2, iadst16_new_sse2, iadst16_new_sse2, iidentity16_new_sse2 },
{ idct32_new_sse2, NULL, NULL, iidentity32_new_sse2 },
@@ -1683,16 +1781,18 @@
#endif
};
-void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
+void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
- __m128i buf[8];
- const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
- const int txw_idx = get_txw_idx(TX_8X8);
- const int txh_idx = get_txh_idx(TX_8X8);
+ __m128i buf[4];
+ const TX_SIZE tx_size = TX_4X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
- const int buf_size = 8;
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
const transform_1d_sse2 row_txfm =
lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
@@ -1701,7 +1801,42 @@
int ud_flip, lr_flip;
get_flip_cfg(tx_type, &ud_flip, &lr_flip);
- load_buffer_32bit_to_16bit(input, 8, buf, buf_size);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x4(buf, buf);
+ row_txfm(buf, buf, cos_bit_row);
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x4(temp, buf);
+ } else {
+ transpose_16bit_4x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X8;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_sse2 row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+ const transform_1d_sse2 col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
transpose_16bit_8x8(buf, buf);
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, 8, shift[0]);
@@ -1858,6 +1993,80 @@
}
#endif
+void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_4X8;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_sse2 row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+ const transform_1d_sse2 col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_4x8(buf, buf);
+ round_shift_sse2(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[4];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf);
+ } else {
+ transpose_16bit_8x4(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf[8];
+ const TX_SIZE tx_size = TX_8X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_sse2 row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+ const transform_1d_sse2 col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
+ transpose_16bit_8x4(buf, buf);
+ round_shift_sse2(buf, buf, txfm_size_col); // rect special code
+ row_txfm(buf, buf, cos_bit_row);
+ // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf, temp, txfm_size_col);
+ transpose_16bit_4x8(temp, buf);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
@@ -1912,6 +2121,97 @@
}
#endif
+void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_4X16;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+
+ const transform_1d_sse2 row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+ const transform_1d_sse2 col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
+ const int row_one_loop = 8;
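+ // Process the 16 rows in two batches of 8: each batch is loaded 4 wide,
+ // run through the 4-point row transform, and transposed back so the
+ // 16-point column transform can run over the full height afterward.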
+ for (int i = 0; i < 2; ++i) {
+ const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
+ row_one_loop);
+ transpose_16bit_4x8(buf_cur, buf_cur);
+ row_txfm(buf_cur, buf_cur, cos_bit_row);
+ round_shift_16bit(buf_cur, row_one_loop, shift[0]);
+ if (lr_flip) {
+ __m128i temp[8];
+ flip_buf_sse2(buf_cur, temp, txfm_size_col);
+ transpose_16bit_8x4(temp, buf_cur);
+ } else {
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ }
+ col_txfm(buf, buf, cos_bit_col);
+ round_shift_16bit(buf, txfm_size_row, shift[1]);
+ lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd) {
+ (void)bd;
+ __m128i buf[16];
+ const TX_SIZE tx_size = TX_16X4;
+ const int8_t *shift = inv_txfm_shift_ls[tx_size];
+ const int txw_idx = get_txw_idx(tx_size);
+ const int txh_idx = get_txh_idx(tx_size);
+ const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+ const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+ const int txfm_size_col = tx_size_wide[tx_size];
+ const int txfm_size_row = tx_size_high[tx_size];
+ const int buf_size_w_div8 = txfm_size_col >> 3;
+
+ const transform_1d_sse2 row_txfm =
+ lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+ const transform_1d_sse2 col_txfm =
+ lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ const int row_one_loop = 8;
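+ // The 16-wide rows are split across two groups of 8 registers: each group
+ // is loaded and transposed separately, then the 16-point row transform runs
+ // across the whole buffer.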
+ for (int i = 0; i < buf_size_w_div8; ++i) {
+ const int32_t *input_cur = input + i * row_one_loop;
+ __m128i *buf_cur = buf + i * row_one_loop;
+ load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
+ txfm_size_row);
+ transpose_16bit_8x4(buf_cur, buf_cur);
+ }
+ row_txfm(buf, buf, cos_bit_row);
+ round_shift_16bit(buf, txfm_size_col, shift[0]);
+ if (lr_flip) {
+ __m128i temp[16];
+ flip_buf_sse2(buf, temp, 16);
+ transpose_16bit_4x8(temp, buf);
+ transpose_16bit_4x8(temp + 8, buf + 8);
+ } else {
+ transpose_16bit_4x8(buf, buf);
+ transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+ }
+ for (int i = 0; i < buf_size_w_div8; i++) {
+ col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+ round_shift_16bit(buf + i * row_one_loop, txfm_size_row, shift[1]);
+ }
+ lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+ lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd) {
(void)bd;
@@ -1957,15 +2257,15 @@
TX_TYPE tx_type, int bd);
static inv_txfm_func inv_txfm_func_ls[TX_SIZES_ALL] = {
- NULL, // 4x4
+ av1_lowbd_inv_txfm2d_add_4x4_sse2, // 4x4
av1_lowbd_inv_txfm2d_add_8x8_sse2, // 8x8
av1_lowbd_inv_txfm2d_add_16x16_sse2, // 16x16
av1_lowbd_inv_txfm2d_add_32x32_sse2, // 32x32
#if CONFIG_TX64X64
av1_lowbd_inv_txfm2d_add_64x64_sse2, // 64x64
#endif // CONFIG_TX64X64
- NULL, // 4x8
- NULL, // 8x4
+ av1_lowbd_inv_txfm2d_add_4x8_sse2, // 4x8
+ av1_lowbd_inv_txfm2d_add_8x4_sse2, // 8x4
av1_lowbd_inv_txfm2d_add_8x16_sse2, // 8x16
av1_lowbd_inv_txfm2d_add_16x8_sse2, // 16x8
av1_lowbd_inv_txfm2d_add_16x32_sse2, // 16x32
@@ -1974,8 +2274,8 @@
av1_lowbd_inv_txfm2d_add_32x64_sse2, // 32x64
av1_lowbd_inv_txfm2d_add_64x32_sse2, // 64x32
#endif // CONFIG_TX64X64
- NULL, // 4x16
- NULL, // 16x4
+ av1_lowbd_inv_txfm2d_add_4x16_sse2, // 4x16
+ av1_lowbd_inv_txfm2d_add_16x4_sse2, // 16x4
av1_lowbd_inv_txfm2d_add_8x32_sse2, // 8x32
av1_lowbd_inv_txfm2d_add_32x8_sse2, // 32x8
#if CONFIG_TX64X64
@@ -1988,7 +2288,7 @@
const TxfmParam *txfm_param) {
const TX_TYPE tx_type = txfm_param->tx_type;
const inv_txfm_func inv_func = inv_txfm_func_ls[txfm_param->tx_size];
- if (inv_func != NULL) {
+ if (inv_func != NULL && (!txfm_param->lossless)) {
inv_func(dqcoeff, dst, stride, tx_type, txfm_param->bd);
} else {
av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h
index a924003..efbcae7 100644
--- a/av1/common/x86/av1_txfm_sse2.h
+++ b/av1/common/x86/av1_txfm_sse2.h
@@ -59,6 +59,11 @@
return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
}
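+// Load 4 32-bit integers and pack them, with signed saturation, into the low
+// four 16-bit lanes of the result; the high lanes hold a duplicate copy.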
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+ const __m128i a_low = _mm_load_si128((const __m128i *)a);
+ return _mm_packs_epi32(a_low, a_low);
+}
+
// Store 8 16 bit values. Sign extend the values.
static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
const __m128i a_lo = _mm_unpacklo_epi16(a, a);
@@ -107,6 +112,13 @@
}
}
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+ __m128i *out, int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[i] = load_32bit_to_16bit_w4(in + i * stride);
+ }
+}
+
static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
int stride, __m128i *out,
int out_size) {
@@ -194,6 +206,9 @@
transform_1d_sse2 col, row; // vertical and horizontal
} transform_2d_sse2;
+void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
@@ -208,6 +223,12 @@
int stride, TX_TYPE tx_type, int bd);
#endif
+void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
@@ -228,6 +249,12 @@
int stride, TX_TYPE tx_type, int bd);
#endif
+void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output,
+ int stride, TX_TYPE tx_type, int bd);
+
void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output,
int stride, TX_TYPE tx_type, int bd);
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index 5d56cdc..6e74534 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -329,15 +329,15 @@
#include "av1/common/x86/av1_txfm_sse2.h"
const LbdInvTxfm2dFunc kLbdInvFuncSSE2List[TX_SIZES_ALL] = {
- NULL, // TX_4X4
+ av1_lowbd_inv_txfm2d_add_4x4_sse2, // TX_4X4
av1_lowbd_inv_txfm2d_add_8x8_sse2, // TX_8X8
av1_lowbd_inv_txfm2d_add_16x16_sse2, // TX_16X16
av1_lowbd_inv_txfm2d_add_32x32_sse2, // TX_32X32
#if CONFIG_TX64X64
av1_lowbd_inv_txfm2d_add_64x64_sse2, // 64x64
#endif // CONFIG_TX64X64
- NULL, // TX_4X8
- NULL, // TX_8X4
+ av1_lowbd_inv_txfm2d_add_4x8_sse2, // TX_4X8
+ av1_lowbd_inv_txfm2d_add_8x4_sse2, // TX_8X4
av1_lowbd_inv_txfm2d_add_8x16_sse2, // TX_8X16
av1_lowbd_inv_txfm2d_add_16x8_sse2, // TX_16X8
av1_lowbd_inv_txfm2d_add_16x32_sse2, // TX_16X32
@@ -346,8 +346,8 @@
av1_lowbd_inv_txfm2d_add_32x64_sse2, // TX_32X64
av1_lowbd_inv_txfm2d_add_64x32_sse2, // TX_64X32
#endif // CONFIG_TX64X64
- NULL, // TX_4X16
- NULL, // TX_16X4
+ av1_lowbd_inv_txfm2d_add_4x16_sse2, // TX_4X16
+ av1_lowbd_inv_txfm2d_add_16x4_sse2, // TX_16X4
av1_lowbd_inv_txfm2d_add_8x32_sse2, // 8x32
av1_lowbd_inv_txfm2d_add_32x8_sse2, // 32x8
#if CONFIG_TX64X64