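Implement flip txfm in av1_fwd_txfm2d_8x8_sse2

Enable the FLIPADST transform types in the 8x8 SSE2 forward transform:
rows are loaded in reverse order when ud_flip is set, and the transposed
buffer is reversed before the row transform when lr_flip is set.

Change-Id: I29e224a9d39c734db8f40f4f6dec3540d4945267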
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h index 54fa1ee..1bbcad1 100644 --- a/av1/common/av1_txfm.h +++ b/av1/common/av1_txfm.h
@@ -156,46 +156,50 @@ int stage_num_row; } TXFM_2D_FLIP_CFG; -static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { +static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) { switch (tx_type) { case DCT_DCT: case ADST_DCT: case DCT_ADST: case ADST_ADST: - cfg->ud_flip = 0; - cfg->lr_flip = 0; + *ud_flip = 0; + *lr_flip = 0; break; case IDTX: case V_DCT: case H_DCT: case V_ADST: case H_ADST: - cfg->ud_flip = 0; - cfg->lr_flip = 0; + *ud_flip = 0; + *lr_flip = 0; break; case FLIPADST_DCT: case FLIPADST_ADST: case V_FLIPADST: - cfg->ud_flip = 1; - cfg->lr_flip = 0; + *ud_flip = 1; + *lr_flip = 0; break; case DCT_FLIPADST: case ADST_FLIPADST: case H_FLIPADST: - cfg->ud_flip = 0; - cfg->lr_flip = 1; + *ud_flip = 0; + *lr_flip = 1; break; case FLIPADST_FLIPADST: - cfg->ud_flip = 1; - cfg->lr_flip = 1; + *ud_flip = 1; + *lr_flip = 1; break; default: - cfg->ud_flip = 0; - cfg->lr_flip = 0; + *ud_flip = 0; + *lr_flip = 0; assert(0); } } +static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) { + get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip); +} + static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) { switch (tx_size) { case TX_4X4: return TX_4X4;
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h index 92956a0..9790c44 100644 --- a/av1/common/x86/av1_txfm_sse2.h +++ b/av1/common/x86/av1_txfm_sse2.h
@@ -78,6 +78,14 @@ } } +static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in, + int stride, __m128i *out, + int out_size) { + for (int i = 0; i < out_size; ++i) { + out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride); + } +} + static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride, __m128i *out, int out_size) { for (int i = 0; i < out_size; ++i) { @@ -114,6 +122,12 @@ } } +static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) { + for (int i = 0; i < size; ++i) { + out[size - i - 1] = in[i]; + } +} + void av1_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, int bd);
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c index f8eaf83..4c8bb91 100644 --- a/av1/encoder/x86/av1_fwd_txfm_sse2.c +++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1591,11 +1591,11 @@ { fadst8_new_sse2, fdct8_new_sse2 }, // ADST_DCT { fdct8_new_sse2, fadst8_new_sse2 }, // DCT_ADST { fadst8_new_sse2, fadst8_new_sse2 }, // ADST_ADST - { NULL, NULL }, // FLIPADST_DCT - { NULL, NULL }, // DCT_FLIPADST - { NULL, NULL }, // FLIPADST_FLIPADST - { NULL, NULL }, // ADST_FLIPADST - { NULL, NULL }, // FLIPADST_ADST + { fadst8_new_sse2, fdct8_new_sse2 }, // FLIPADST_DCT + { fdct8_new_sse2, fadst8_new_sse2 }, // DCT_FLIPADST + { fadst8_new_sse2, fadst8_new_sse2 }, // FLIPADST_FLIPADST + { fadst8_new_sse2, fadst8_new_sse2 }, // ADST_FLIPADST + { fadst8_new_sse2, fadst8_new_sse2 }, // FLIPADST_ADST { NULL, NULL }, // IDTX { NULL, NULL }, // V_DCT { NULL, NULL }, // H_DCT @@ -1608,11 +1608,22 @@ const transform_1d_sse2 col_txfm = txfm_arr[tx_type].col; const transform_1d_sse2 row_txfm = txfm_arr[tx_type].row; if (col_txfm != NULL && row_txfm != NULL) { - load_buffer_16bit_to_16bit(input, stride, buf, buf_size); + int ud_flip, lr_flip; + get_flip_cfg(tx_type, &ud_flip, &lr_flip); + if (ud_flip) + load_buffer_16bit_to_16bit_flip(input, stride, buf, buf_size); + else + load_buffer_16bit_to_16bit(input, stride, buf, buf_size); round_shift_16bit(buf, 8, shift[0]); col_txfm(buf, buf, cos_bit_col); round_shift_16bit(buf, 8, shift[1]); - transpose_16bit_8x8(buf, buf); + if (lr_flip) { + __m128i tmp[8]; + transpose_16bit_8x8(buf, tmp); + flip_buf_sse2(tmp, buf, 8); + } else { + transpose_16bit_8x8(buf, buf); + } row_txfm(buf, buf, cos_bit_row); round_shift_16bit(buf, 8, shift[2]); transpose_16bit_8x8(buf, buf);