Implement fidentity{8/16/32}_new_sse2

Change-Id: I2cd5eca68eada9053a229e11e364187e923cbd64
diff --git a/av1/encoder/x86/av1_fwd_txfm_sse2.c b/av1/encoder/x86/av1_fwd_txfm_sse2.c
index b311f74..520bf85 100644
--- a/av1/encoder/x86/av1_fwd_txfm_sse2.c
+++ b/av1/encoder/x86/av1_fwd_txfm_sse2.c
@@ -1575,61 +1575,101 @@
   output[15] = x8[0];
 }
 
+// 8-point forward identity transform: output[k] = 2 * input[k] for each of
+// the 8 vectors, with the doubling done as a saturating 16-bit add. cos_bit
+// is unused; it is accepted so the signature matches the other 1-D kernels.
+static INLINE void fidentity8_new_sse2(const __m128i *input, __m128i *output,
+                                       int8_t cos_bit) {
+  (void)cos_bit;
+  for (int k = 0; k < 8; ++k) {
+    // Read the source vector once; callers in this file pass output == input.
+    const __m128i v = input[k];
+    // v + v with signed saturation, i.e. 2 * v clamped per 16-bit lane.
+    output[k] = _mm_adds_epi16(v, v);
+  }
+}
+
+// 16-point forward identity: each 16-bit lane x of the 16 vectors becomes
+// (x * 2 * NewSqrt2 + (1 << (NewSqrt2Bits - 1))) >> NewSqrt2Bits, saturated
+// to int16 by the final pack. cos_bit is unused.
+static INLINE void fidentity16_new_sse2(const __m128i *input, __m128i *output,
+                                        int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i ones = _mm_set1_epi16(1);
+  // 16-bit pairs (2 * NewSqrt2, round) so madd yields x * scale + 1 * round.
+  const __m128i mul = _mm_unpacklo_epi16(
+      _mm_set1_epi16(2 * NewSqrt2), _mm_set1_epi16(1 << (NewSqrt2Bits - 1)));
+  for (int k = 0; k < 16; ++k) {
+    const __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(input[k], ones), mul);
+    const __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(input[k], ones), mul);
+    output[k] = _mm_packs_epi32(_mm_srai_epi32(lo, NewSqrt2Bits),
+                                _mm_srai_epi32(hi, NewSqrt2Bits));
+  }
+}
+
+// 32-point forward identity: output[k] = input[k] << 2 per 16-bit lane.
+// NOTE(review): the shift does not saturate, unlike fidentity8 -- confirm.
+static INLINE void fidentity32_new_sse2(const __m128i *input, __m128i *output,
+                                        int8_t cos_bit) {
+  (void)cos_bit;
+  for (int k = 0; k < 32; ++k) output[k] = _mm_slli_epi16(input[k], 2);
+}
+
 static const transform_2d_sse2 txfm8_arr[] = {
-  { fdct8_new_sse2, fdct8_new_sse2 },    // DCT_DCT
-  { fadst8_new_sse2, fdct8_new_sse2 },   // ADST_DCT
-  { fdct8_new_sse2, fadst8_new_sse2 },   // DCT_ADST
-  { fadst8_new_sse2, fadst8_new_sse2 },  // ADST_ADST
-  { fadst8_new_sse2, fdct8_new_sse2 },   // FLIPADST_DCT
-  { fdct8_new_sse2, fadst8_new_sse2 },   // DCT_FLIPADST
-  { fadst8_new_sse2, fadst8_new_sse2 },  // FLIPADST_FLIPADST
-  { fadst8_new_sse2, fadst8_new_sse2 },  // ADST_FLIPADST
-  { fadst8_new_sse2, fadst8_new_sse2 },  // FLIPADST_ADST
-  { NULL, NULL },                        // IDTX
-  { NULL, NULL },                        // V_DCT
-  { NULL, NULL },                        // H_DCT
-  { NULL, NULL },                        // V_ADST
-  { NULL, NULL },                        // H_ADST
-  { NULL, NULL },                        // V_FLIPADST
-  { NULL, NULL },                        // H_FLIPADST
+  { fdct8_new_sse2, fdct8_new_sse2 },            // DCT_DCT
+  { fadst8_new_sse2, fdct8_new_sse2 },           // ADST_DCT
+  { fdct8_new_sse2, fadst8_new_sse2 },           // DCT_ADST
+  { fadst8_new_sse2, fadst8_new_sse2 },          // ADST_ADST
+  { fadst8_new_sse2, fdct8_new_sse2 },           // FLIPADST_DCT
+  { fdct8_new_sse2, fadst8_new_sse2 },           // DCT_FLIPADST
+  { fadst8_new_sse2, fadst8_new_sse2 },          // FLIPADST_FLIPADST
+  { fadst8_new_sse2, fadst8_new_sse2 },          // ADST_FLIPADST
+  { fadst8_new_sse2, fadst8_new_sse2 },          // FLIPADST_ADST
+  { fidentity8_new_sse2, fidentity8_new_sse2 },  // IDTX
+  { fdct8_new_sse2, fidentity8_new_sse2 },       // V_DCT
+  { fidentity8_new_sse2, fdct8_new_sse2 },       // H_DCT
+  { fadst8_new_sse2, fidentity8_new_sse2 },      // V_ADST
+  { fidentity8_new_sse2, fadst8_new_sse2 },      // H_ADST
+  { fadst8_new_sse2, fidentity8_new_sse2 },      // V_FLIPADST
+  { fidentity8_new_sse2, fadst8_new_sse2 },      // H_FLIPADST
 };
 
 static const transform_2d_sse2 txfm16_arr[] = {
-  { fdct16_new_sse2, fdct16_new_sse2 },    // DCT_DCT
-  { fadst16_new_sse2, fdct16_new_sse2 },   // ADST_DCT
-  { fdct16_new_sse2, fadst16_new_sse2 },   // DCT_ADST
-  { fadst16_new_sse2, fadst16_new_sse2 },  // ADST_ADST
-  { fadst16_new_sse2, fdct16_new_sse2 },   // FLIPADST_DCT
-  { fdct16_new_sse2, fadst16_new_sse2 },   // DCT_FLIPADST
-  { fadst16_new_sse2, fadst16_new_sse2 },  // FLIPADST_FLIPADST
-  { fadst16_new_sse2, fadst16_new_sse2 },  // ADST_FLIPADST
-  { fadst16_new_sse2, fadst16_new_sse2 },  // FLIPADST_ADST
-  { NULL, NULL },                          // IDTX
-  { NULL, NULL },                          // V_DCT
-  { NULL, NULL },                          // H_DCT
-  { NULL, NULL },                          // V_ADST
-  { NULL, NULL },                          // H_ADST
-  { NULL, NULL },                          // V_FLIPADST
-  { NULL, NULL },                          // H_FLIPADST
+  { fdct16_new_sse2, fdct16_new_sse2 },            // DCT_DCT
+  { fadst16_new_sse2, fdct16_new_sse2 },           // ADST_DCT
+  { fdct16_new_sse2, fadst16_new_sse2 },           // DCT_ADST
+  { fadst16_new_sse2, fadst16_new_sse2 },          // ADST_ADST
+  { fadst16_new_sse2, fdct16_new_sse2 },           // FLIPADST_DCT
+  { fdct16_new_sse2, fadst16_new_sse2 },           // DCT_FLIPADST
+  { fadst16_new_sse2, fadst16_new_sse2 },          // FLIPADST_FLIPADST
+  { fadst16_new_sse2, fadst16_new_sse2 },          // ADST_FLIPADST
+  { fadst16_new_sse2, fadst16_new_sse2 },          // FLIPADST_ADST
+  { fidentity16_new_sse2, fidentity16_new_sse2 },  // IDTX
+  { fdct16_new_sse2, fidentity16_new_sse2 },       // V_DCT
+  { fidentity16_new_sse2, fdct16_new_sse2 },       // H_DCT
+  { fadst16_new_sse2, fidentity16_new_sse2 },      // V_ADST
+  { fidentity16_new_sse2, fadst16_new_sse2 },      // H_ADST
+  { fadst16_new_sse2, fidentity16_new_sse2 },      // V_FLIPADST
+  { fidentity16_new_sse2, fadst16_new_sse2 },      // H_FLIPADST
 };
 
 static const transform_2d_sse2 txfm32_arr[] = {
-  { fdct32_new_sse2, fdct32_new_sse2 },  // DCT_DCT
-  { NULL, NULL },                        // ADST_DCT
-  { NULL, NULL },                        // DCT_ADST
-  { NULL, NULL },                        // ADST_ADST
-  { NULL, NULL },                        // FLIPADST_DCT
-  { NULL, NULL },                        // DCT_FLIPADST
-  { NULL, NULL },                        // FLIPADST_FLIPADST
-  { NULL, NULL },                        // ADST_FLIPADST
-  { NULL, NULL },                        // FLIPADST_ADST
-  { NULL, NULL },                        // IDTX
-  { NULL, NULL },                        // V_DCT
-  { NULL, NULL },                        // H_DCT
-  { NULL, NULL },                        // V_ADST
-  { NULL, NULL },                        // H_ADST
-  { NULL, NULL },                        // V_FLIPADST
-  { NULL, NULL },                        // H_FLIPADST
+  { fdct32_new_sse2, fdct32_new_sse2 },            // DCT_DCT
+  { NULL, NULL },                                  // ADST_DCT
+  { NULL, NULL },                                  // DCT_ADST
+  { NULL, NULL },                                  // ADST_ADST
+  { NULL, NULL },                                  // FLIPADST_DCT
+  { NULL, NULL },                                  // DCT_FLIPADST
+  { NULL, NULL },                                  // FLIPADST_FLIPADST
+  { NULL, NULL },                                  // ADST_FLIPADST
+  { NULL, NULL },                                  // FLIPADST_ADST
+  { fidentity32_new_sse2, fidentity32_new_sse2 },  // IDTX
+  { fdct32_new_sse2, fidentity32_new_sse2 },       // V_DCT
+  { fidentity32_new_sse2, fdct32_new_sse2 },       // H_DCT
+  { NULL, NULL },                                  // V_ADST
+  { NULL, NULL },                                  // H_ADST
+  { NULL, NULL },                                  // V_FLIPADST
+  { NULL, NULL },                                  // H_FLIPADST
 };
 
 void av1_fwd_txfm2d_8x8_sse2(const int16_t *input, int32_t *output, int stride,
@@ -1646,30 +1686,26 @@
 
   const transform_1d_sse2 col_txfm = txfm8_arr[tx_type].col;
   const transform_1d_sse2 row_txfm = txfm8_arr[tx_type].row;
-  if (col_txfm != NULL && row_txfm != NULL) {
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-    if (ud_flip)
-      load_buffer_16bit_to_16bit_flip(input, stride, buf, buf_size);
-    else
-      load_buffer_16bit_to_16bit(input, stride, buf, buf_size);
-    round_shift_16bit(buf, 8, shift[0]);
-    col_txfm(buf, buf, cos_bit_col);
-    round_shift_16bit(buf, 8, shift[1]);
-    if (lr_flip) {
-      __m128i tmp[8];
-      transpose_16bit_8x8(buf, tmp);
-      flip_buf_sse2(tmp, buf, 8);
-    } else {
-      transpose_16bit_8x8(buf, buf);
-    }
-    row_txfm(buf, buf, cos_bit_row);
-    round_shift_16bit(buf, 8, shift[2]);
-    transpose_16bit_8x8(buf, buf);
-    store_buffer_16bit_to_32bit_8x8(buf, output, buf_size);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  if (ud_flip)
+    load_buffer_16bit_to_16bit_flip(input, stride, buf, buf_size);
+  else
+    load_buffer_16bit_to_16bit(input, stride, buf, buf_size);
+  round_shift_16bit(buf, 8, shift[0]);
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit(buf, 8, shift[1]);
+  if (lr_flip) {
+    __m128i tmp[8];
+    transpose_16bit_8x8(buf, tmp);
+    flip_buf_sse2(tmp, buf, 8);
   } else {
-    av1_fwd_txfm2d_8x8_c(input, output, stride, tx_type, bd);
+    transpose_16bit_8x8(buf, buf);
   }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, 8, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_buffer_16bit_to_32bit_8x8(buf, output, buf_size);
 }
 
 void av1_fwd_txfm2d_8x16_sse2(const int16_t *input, int32_t *output, int stride,
@@ -1686,36 +1722,32 @@
   const transform_1d_sse2 col_txfm = txfm16_arr[tx_type].col;
   const transform_1d_sse2 row_txfm = txfm8_arr[tx_type].row;
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
-    if (ud_flip) {
-      load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
-    } else {
-      load_buffer_16bit_to_16bit(input, stride, buf0, height);
-    }
-    round_shift_16bit(buf0, height, shift[0]);
-    col_txfm(buf0, buf0, cos_bit_col);
-    round_shift_16bit(buf0, height, shift[1]);
-    transpose_16bit_8x8(buf0, buf1);
-    transpose_16bit_8x8(buf0 + 8, buf1 + 8);
-
-    for (int i = 0; i < 2; i++) {
-      __m128i *buf;
-      if (lr_flip) {
-        buf = buf0;
-        flip_buf_sse2(buf1 + width * i, buf, width);
-      } else {
-        buf = buf1 + width * i;
-      }
-      row_txfm(buf, buf, cos_bit_row);
-      round_shift_16bit(buf, width, shift[2]);
-      transpose_16bit_8x8(buf, buf);
-      store_rect_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
-    }
+  if (ud_flip) {
+    load_buffer_16bit_to_16bit_flip(input, stride, buf0, height);
   } else {
-    av1_fwd_txfm2d_8x16_c(input, output, stride, tx_type, bd);
+    load_buffer_16bit_to_16bit(input, stride, buf0, height);
+  }
+  round_shift_16bit(buf0, height, shift[0]);
+  col_txfm(buf0, buf0, cos_bit_col);
+  round_shift_16bit(buf0, height, shift[1]);
+  transpose_16bit_8x8(buf0, buf1);
+  transpose_16bit_8x8(buf0 + 8, buf1 + 8);
+
+  for (int i = 0; i < 2; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
+    }
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    transpose_16bit_8x8(buf, buf);
+    store_rect_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
   }
 }
 
@@ -1733,38 +1765,34 @@
   const transform_1d_sse2 col_txfm = txfm8_arr[tx_type].col;
   const transform_1d_sse2 row_txfm = txfm16_arr[tx_type].row;
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    __m128i *buf;
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  __m128i *buf;
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
-    for (int i = 0; i < 2; i++) {
-      if (ud_flip) {
-        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-      } else {
-        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-      }
-      round_shift_16bit(buf0, height, shift[0]);
-      col_txfm(buf0, buf0, cos_bit_col);
-      round_shift_16bit(buf0, height, shift[1]);
-      transpose_16bit_8x8(buf0, buf1 + 8 * i);
-    }
-
-    if (lr_flip) {
-      buf = buf0;
-      flip_buf_sse2(buf1, buf, width);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
     } else {
-      buf = buf1;
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
     }
-    row_txfm(buf, buf, cos_bit_row);
-    round_shift_16bit(buf, width, shift[2]);
-    transpose_16bit_8x8(buf, buf);
-    store_rect_buffer_16bit_to_32bit_8x8(buf, output, width);
-    transpose_16bit_8x8(buf + 8, buf + 8);
-    store_rect_buffer_16bit_to_32bit_8x8(buf + 8, output + 8, width);
-  } else {
-    av1_fwd_txfm2d_16x8_c(input, output, stride, tx_type, bd);
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    transpose_16bit_8x8(buf0, buf1 + 8 * i);
   }
+
+  if (lr_flip) {
+    buf = buf0;
+    flip_buf_sse2(buf1, buf, width);
+  } else {
+    buf = buf1;
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, width, shift[2]);
+  transpose_16bit_8x8(buf, buf);
+  store_rect_buffer_16bit_to_32bit_8x8(buf, output, width);
+  transpose_16bit_8x8(buf + 8, buf + 8);
+  store_rect_buffer_16bit_to_32bit_8x8(buf + 8, output + 8, width);
 }
 
 void av1_fwd_txfm2d_16x16_sse2(const int16_t *input, int32_t *output,
@@ -1781,41 +1809,36 @@
   const transform_1d_sse2 col_txfm = txfm16_arr[tx_type].col;
   const transform_1d_sse2 row_txfm = txfm16_arr[tx_type].row;
 
-  if (col_txfm != NULL && row_txfm != NULL) {
-    int ud_flip, lr_flip;
-    get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
 
-    for (int i = 0; i < 2; i++) {
-      if (ud_flip) {
-        load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
-      } else {
-        load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
-      }
-      round_shift_16bit(buf0, height, shift[0]);
-      col_txfm(buf0, buf0, cos_bit_col);
-      round_shift_16bit(buf0, height, shift[1]);
-      transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
-      transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+  for (int i = 0; i < 2; i++) {
+    if (ud_flip) {
+      load_buffer_16bit_to_16bit_flip(input + 8 * i, stride, buf0, height);
+    } else {
+      load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, height);
     }
+    round_shift_16bit(buf0, height, shift[0]);
+    col_txfm(buf0, buf0, cos_bit_col);
+    round_shift_16bit(buf0, height, shift[1]);
+    transpose_16bit_8x8(buf0, buf1 + 0 * width + 8 * i);
+    transpose_16bit_8x8(buf0 + 8, buf1 + 1 * width + 8 * i);
+  }
 
-    for (int i = 0; i < 2; i++) {
-      __m128i *buf;
-      if (lr_flip) {
-        buf = buf0;
-        flip_buf_sse2(buf1 + width * i, buf, width);
-      } else {
-        buf = buf1 + width * i;
-      }
-      row_txfm(buf, buf, cos_bit_row);
-      round_shift_16bit(buf, width, shift[2]);
-      transpose_16bit_8x8(buf, buf);
-      store_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
-      transpose_16bit_8x8(buf + 8, buf + 8);
-      store_buffer_16bit_to_32bit_8x8(buf + 8, output + 8 * width * i + 8,
-                                      width);
+  for (int i = 0; i < 2; i++) {
+    __m128i *buf;
+    if (lr_flip) {
+      buf = buf0;
+      flip_buf_sse2(buf1 + width * i, buf, width);
+    } else {
+      buf = buf1 + width * i;
     }
-  } else {
-    av1_fwd_txfm2d_16x16_c(input, output, stride, tx_type, bd);
+    row_txfm(buf, buf, cos_bit_row);
+    round_shift_16bit(buf, width, shift[2]);
+    transpose_16bit_8x8(buf, buf);
+    store_buffer_16bit_to_32bit_8x8(buf, output + 8 * width * i, width);
+    transpose_16bit_8x8(buf + 8, buf + 8);
+    store_buffer_16bit_to_32bit_8x8(buf + 8, output + 8 * width * i + 8, width);
   }
 }