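Implement flipped transforms (FLIPADST) in av1_fwd_txfm2d_8x8_sse2

Split get_flip_cfg() out of set_flip_cfg() so the ud_flip/lr_flip flags can
be queried without a TXFM_2D_FLIP_CFG, add the
load_buffer_16bit_to_16bit_flip() and flip_buf_sse2() helpers, and replace
the five NULL FLIPADST entries in txfm_arr with the fadst8/fdct8 kernels.
The up/down flip is applied while loading the input rows; the left/right
flip is applied after the first transpose, before the row transform.
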
Change-Id: I29e224a9d39c734db8f40f4f6dec3540d4945267
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index 54fa1ee..1bbcad1 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -156,46 +156,50 @@
int stage_num_row;
} TXFM_2D_FLIP_CFG;
-static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
switch (tx_type) {
case DCT_DCT:
case ADST_DCT:
case DCT_ADST:
case ADST_ADST:
- cfg->ud_flip = 0;
- cfg->lr_flip = 0;
+ *ud_flip = 0;
+ *lr_flip = 0;
break;
case IDTX:
case V_DCT:
case H_DCT:
case V_ADST:
case H_ADST:
- cfg->ud_flip = 0;
- cfg->lr_flip = 0;
+ *ud_flip = 0;
+ *lr_flip = 0;
break;
case FLIPADST_DCT:
case FLIPADST_ADST:
case V_FLIPADST:
- cfg->ud_flip = 1;
- cfg->lr_flip = 0;
+ *ud_flip = 1;
+ *lr_flip = 0;
break;
case DCT_FLIPADST:
case ADST_FLIPADST:
case H_FLIPADST:
- cfg->ud_flip = 0;
- cfg->lr_flip = 1;
+ *ud_flip = 0;
+ *lr_flip = 1;
break;
case FLIPADST_FLIPADST:
- cfg->ud_flip = 1;
- cfg->lr_flip = 1;
+ *ud_flip = 1;
+ *lr_flip = 1;
break;
default:
- cfg->ud_flip = 0;
- cfg->lr_flip = 0;
+ *ud_flip = 0;
+ *lr_flip = 0;
assert(0);
}
}
+static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+ get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
+}
+
static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
switch (tx_size) {
case TX_4X4: return TX_4X4;
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h
index 92956a0..9790c44 100644
--- a/av1/common/x86/av1_txfm_sse2.h
+++ b/av1/common/x86/av1_txfm_sse2.h
@@ -78,6 +78,14 @@
}
}
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+ int stride, __m128i *out,
+ int out_size) {
+ for (int i = 0; i < out_size; ++i) {
+ out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
+ }
+}
+
static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
__m128i *out, int out_size) {
for (int i = 0; i < out_size; ++i) {
@@ -114,6 +122,12 @@
}
}
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+ for (int i = 0; i < size; ++i) {
+ out[size - i - 1] = in[i];
+ }
+}
+
void av1_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd);
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index f8eaf83..4c8bb91 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1591,11 +1591,11 @@
{ fadst8_new_sse2, fdct8_new_sse2 }, // ADST_DCT
{ fdct8_new_sse2, fadst8_new_sse2 }, // DCT_ADST
{ fadst8_new_sse2, fadst8_new_sse2 }, // ADST_ADST
- { NULL, NULL }, // FLIPADST_DCT
- { NULL, NULL }, // DCT_FLIPADST
- { NULL, NULL }, // FLIPADST_FLIPADST
- { NULL, NULL }, // ADST_FLIPADST
- { NULL, NULL }, // FLIPADST_ADST
+ { fadst8_new_sse2, fdct8_new_sse2 }, // FLIPADST_DCT
+ { fdct8_new_sse2, fadst8_new_sse2 }, // DCT_FLIPADST
+ { fadst8_new_sse2, fadst8_new_sse2 }, // FLIPADST_FLIPADST
+ { fadst8_new_sse2, fadst8_new_sse2 }, // ADST_FLIPADST
+ { fadst8_new_sse2, fadst8_new_sse2 }, // FLIPADST_ADST
{ NULL, NULL }, // IDTX
{ NULL, NULL }, // V_DCT
{ NULL, NULL }, // H_DCT
@@ -1608,11 +1608,22 @@
const transform_1d_sse2 col_txfm = txfm_arr[tx_type].col;
const transform_1d_sse2 row_txfm = txfm_arr[tx_type].row;
if (col_txfm != NULL && row_txfm != NULL) {
- load_buffer_16bit_to_16bit(input, stride, buf, buf_size);
+ int ud_flip, lr_flip;
+ get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+ if (ud_flip)
+ load_buffer_16bit_to_16bit_flip(input, stride, buf, buf_size);
+ else
+ load_buffer_16bit_to_16bit(input, stride, buf, buf_size);
round_shift_16bit(buf, 8, shift[0]);
col_txfm(buf, buf, cos_bit_col);
round_shift_16bit(buf, 8, shift[1]);
- transpose_16bit_8x8(buf, buf);
+ if (lr_flip) {
+ __m128i tmp[8];
+ transpose_16bit_8x8(buf, tmp);
+ flip_buf_sse2(tmp, buf, 8);
+ } else {
+ transpose_16bit_8x8(buf, buf);
+ }
row_txfm(buf, buf, cos_bit_row);
round_shift_16bit(buf, 8, shift[2]);
transpose_16bit_8x8(buf, buf);