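Implement flipped transforms in av1_fwd_txfm2d_8x8_sse2

Enable the 2D FLIPADST transform types (FLIPADST_DCT, DCT_FLIPADST,
FLIPADST_FLIPADST, ADST_FLIPADST, FLIPADST_ADST) in the SSE2 8x8
forward transform. An up/down flip is applied by loading the input
rows in reverse order; a left/right flip is applied by reversing the
transposed buffer before the row transform.

Split get_flip_cfg() out of set_flip_cfg() so the flip flags can be
queried without a TXFM_2D_FLIP_CFG.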

Change-Id: I29e224a9d39c734db8f40f4f6dec3540d4945267
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index 54fa1ee..1bbcad1 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -156,46 +156,50 @@
   int stage_num_row;
 } TXFM_2D_FLIP_CFG;
 
-static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+static INLINE void get_flip_cfg(TX_TYPE tx_type, int *ud_flip, int *lr_flip) {
   switch (tx_type) {
     case DCT_DCT:
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 0;
+      *ud_flip = 0;
+      *lr_flip = 0;
       break;
     case IDTX:
     case V_DCT:
     case H_DCT:
     case V_ADST:
     case H_ADST:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 0;
+      *ud_flip = 0;
+      *lr_flip = 0;
       break;
     case FLIPADST_DCT:
     case FLIPADST_ADST:
     case V_FLIPADST:
-      cfg->ud_flip = 1;
-      cfg->lr_flip = 0;
+      *ud_flip = 1;
+      *lr_flip = 0;
       break;
     case DCT_FLIPADST:
     case ADST_FLIPADST:
     case H_FLIPADST:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 1;
+      *ud_flip = 0;
+      *lr_flip = 1;
       break;
     case FLIPADST_FLIPADST:
-      cfg->ud_flip = 1;
-      cfg->lr_flip = 1;
+      *ud_flip = 1;
+      *lr_flip = 1;
       break;
     default:
-      cfg->ud_flip = 0;
-      cfg->lr_flip = 0;
+      *ud_flip = 0;
+      *lr_flip = 0;
       assert(0);
   }
 }
 
+static INLINE void set_flip_cfg(TX_TYPE tx_type, TXFM_2D_FLIP_CFG *cfg) {
+  get_flip_cfg(tx_type, &cfg->ud_flip, &cfg->lr_flip);
+}
+
 static INLINE TX_SIZE av1_rotate_tx_size(TX_SIZE tx_size) {
   switch (tx_size) {
     case TX_4X4: return TX_4X4;
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h
index 92956a0..9790c44 100644
--- a/av1/common/x86/av1_txfm_sse2.h
+++ b/av1/common/x86/av1_txfm_sse2.h
@@ -78,6 +78,14 @@
   }
 }
 
+static INLINE void load_buffer_16bit_to_16bit_flip(const int16_t *in,
+                                                   int stride, __m128i *out,
+                                                   int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[out_size - i - 1] = load_16bit_to_16bit(in + i * stride);
+  }
+}
+
 static INLINE void load_buffer_32bit_to_16bit(const int32_t *in, int stride,
                                               __m128i *out, int out_size) {
   for (int i = 0; i < out_size; ++i) {
@@ -114,6 +122,12 @@
   }
 }
 
+static INLINE void flip_buf_sse2(__m128i *in, __m128i *out, int size) {
+  for (int i = 0; i < size; ++i) {
+    out[size - i - 1] = in[i];
+  }
+}
+
 void av1_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, int stride,
                              TX_TYPE tx_type, int bd);
 
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index f8eaf83..4c8bb91 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1591,11 +1591,11 @@
     { fadst8_new_sse2, fdct8_new_sse2 },   // ADST_DCT
     { fdct8_new_sse2, fadst8_new_sse2 },   // DCT_ADST
     { fadst8_new_sse2, fadst8_new_sse2 },  // ADST_ADST
-    { NULL, NULL },                        // FLIPADST_DCT
-    { NULL, NULL },                        // DCT_FLIPADST
-    { NULL, NULL },                        // FLIPADST_FLIPADST
-    { NULL, NULL },                        // ADST_FLIPADST
-    { NULL, NULL },                        // FLIPADST_ADST
+    { fadst8_new_sse2, fdct8_new_sse2 },   // FLIPADST_DCT
+    { fdct8_new_sse2, fadst8_new_sse2 },   // DCT_FLIPADST
+    { fadst8_new_sse2, fadst8_new_sse2 },  // FLIPADST_FLIPADST
+    { fadst8_new_sse2, fadst8_new_sse2 },  // ADST_FLIPADST
+    { fadst8_new_sse2, fadst8_new_sse2 },  // FLIPADST_ADST
     { NULL, NULL },                        // IDTX
     { NULL, NULL },                        // V_DCT
     { NULL, NULL },                        // H_DCT
@@ -1608,11 +1608,22 @@
   const transform_1d_sse2 col_txfm = txfm_arr[tx_type].col;
   const transform_1d_sse2 row_txfm = txfm_arr[tx_type].row;
   if (col_txfm != NULL && row_txfm != NULL) {
-    load_buffer_16bit_to_16bit(input, stride, buf, buf_size);
+    int ud_flip, lr_flip;
+    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+    if (ud_flip)
+      load_buffer_16bit_to_16bit_flip(input, stride, buf, buf_size);
+    else
+      load_buffer_16bit_to_16bit(input, stride, buf, buf_size);
     round_shift_16bit(buf, 8, shift[0]);
     col_txfm(buf, buf, cos_bit_col);
     round_shift_16bit(buf, 8, shift[1]);
-    transpose_16bit_8x8(buf, buf);
+    if (lr_flip) {
+      __m128i tmp[8];
+      transpose_16bit_8x8(buf, tmp);
+      flip_buf_sse2(tmp, buf, 8);
+    } else {
+      transpose_16bit_8x8(buf, buf);
+    }
     row_txfm(buf, buf, cos_bit_row);
     round_shift_16bit(buf, 8, shift[2]);
     transpose_16bit_8x8(buf, buf);