Add lowbd inv txfm2d sse2 for block sizes with dimension 4

Implement av1_lowbd_inv_txfm2d_add_4x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x8_sse2
Implement av1_lowbd_inv_txfm2d_add_8x4_sse2
Implement av1_lowbd_inv_txfm2d_add_4x16_sse2
Implement av1_lowbd_inv_txfm2d_add_16x4_sse2

A brief speed test shows that, with the SSE2 functions completed by
this CL, the speed 1 lowbitdepth encoder speeds up by >9% and the
lowbitdepth decoder by >25%, compared to the highbitdepth
implementation in the baseline.

Change-Id: I0576a2a146c0b1a7b483c9d35c3d21d979e263cd
diff --git a/aom_dsp/x86/transpose_sse2.h b/aom_dsp/x86/transpose_sse2.h
index 445eb01..5edfa71 100644
--- a/aom_dsp/x86/transpose_sse2.h
+++ b/aom_dsp/x86/transpose_sse2.h
@@ -107,10 +107,14 @@
   const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
 
   // Unpack 32 bit elements resulting in:
-  // out[0]: 00 10 20 30  01 11 21 31
-  // out[1]: 02 12 22 32  03 13 23 33
+  // out[0]: 00 10 20 30
+  // out[1]: 01 11 21 31
+  // out[2]: 02 12 22 32
+  // out[3]: 03 13 23 33
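+  // Note: each out[i] below carries one transposed 4-sample row in its low
+  // 64 bits, so the 4-point 1-D transforms can consume one row per register;
+  // the upper halves are don't-care for those consumers.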
   out[0] = _mm_unpacklo_epi32(a0, a1);
-  out[1] = _mm_unpackhi_epi32(a0, a1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(a0, a1);
+  out[3] = _mm_srli_si128(out[2], 8);
 }
 
 static INLINE void transpose_16bit_4x8(const __m128i *const in,
@@ -155,6 +159,54 @@
   out[3] = _mm_unpackhi_epi64(b2, b3);
 }
 
+static INLINE void transpose_16bit_8x4(const __m128i *const in,
+                                       __m128i *const out) {
+  // Unpack 16 bit elements. Goes from:
+  // in[0]: 00 01 02 03  04 05 06 07
+  // in[1]: 10 11 12 13  14 15 16 17
+  // in[2]: 20 21 22 23  24 25 26 27
+  // in[3]: 30 31 32 33  34 35 36 37
+
+  // to:
+  // a0:    00 10 01 11  02 12 03 13
+  // a1:    20 30 21 31  22 32 23 33
+  // a4:    04 14 05 15  06 16 07 17
+  // a5:    24 34 25 35  26 36 27 37
+  const __m128i a0 = _mm_unpacklo_epi16(in[0], in[1]);
+  const __m128i a1 = _mm_unpacklo_epi16(in[2], in[3]);
+  const __m128i a4 = _mm_unpackhi_epi16(in[0], in[1]);
+  const __m128i a5 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  // Unpack 32 bit elements resulting in:
+  // b0: 00 10 20 30  01 11 21 31
+  // b2: 04 14 24 34  05 15 25 35
+  // b4: 02 12 22 32  03 13 23 33
+  // b6: 06 16 26 36  07 17 27 37
+  const __m128i b0 = _mm_unpacklo_epi32(a0, a1);
+  const __m128i b2 = _mm_unpacklo_epi32(a4, a5);
+  const __m128i b4 = _mm_unpackhi_epi32(a0, a1);
+  const __m128i b6 = _mm_unpackhi_epi32(a4, a5);
+
+  // Unpack 64 bit elements resulting in:
+  // out[0]: 00 10 20 30  XX XX XX XX
+  // out[1]: 01 11 21 31  XX XX XX XX
+  // out[2]: 02 12 22 32  XX XX XX XX
+  // out[3]: 03 13 23 33  XX XX XX XX
+  // out[4]: 04 14 24 34  XX XX XX XX
+  // out[5]: 05 15 25 35  XX XX XX XX
+  // out[6]: 06 16 26 36  XX XX XX XX
+  // out[7]: 07 17 27 37  XX XX XX XX
+  const __m128i zeros = _mm_setzero_si128();
+  out[0] = _mm_unpacklo_epi64(b0, zeros);
+  out[1] = _mm_unpackhi_epi64(b0, zeros);
+  out[2] = _mm_unpacklo_epi64(b4, zeros);
+  out[3] = _mm_unpackhi_epi64(b4, zeros);
+  out[4] = _mm_unpacklo_epi64(b2, zeros);
+  out[5] = _mm_unpackhi_epi64(b2, zeros);
+  out[6] = _mm_unpacklo_epi64(b6, zeros);
+  out[7] = _mm_unpackhi_epi64(b6, zeros);
+}
+
 static INLINE void transpose_16bit_8x8(const __m128i *const in,
                                        __m128i *const out) {
   // Unpack 16 bit elements. Goes from:
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
index 50e0c4b..f09f269 100644
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ b/av1/common/x86/av1_inv_txfm_sse2.c
@@ -1327,6 +1327,72 @@
   output[32] = _mm_subs_epi16(x10[31], x10[32]);
 }
 
+void iadst4_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
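+  // Per-column outputs computed below (x0..x3 are the 1-D inputs, sinK is
+  // sinpi[K]):
+  //   out0 = sin1*x0 + sin3*x1 + sin4*x2 + sin2*x3
+  //   out1 = sin2*x0 + sin3*x1 - sin1*x2 - sin4*x3
+  //   out2 = sin3*(x0 - x2 + x3)
+  //   out3 = (sin1 + sin2)*x0 - sin3*x1 + (sin4 - sin1)*x2 + (sin2 - sin4)*x3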
+  const int32_t *sinpi = sinpi_arr(cos_bit);
+  const __m128i sinpi_p01_p04 = pair_set_epi16(sinpi[1], sinpi[4]);
+  const __m128i sinpi_p02_m01 = pair_set_epi16(sinpi[2], -sinpi[1]);
+  const __m128i sinpi_p03_p02 = pair_set_epi16(sinpi[3], sinpi[2]);
+  const __m128i sinpi_p03_m04 = pair_set_epi16(sinpi[3], -sinpi[4]);
+  const __m128i sinpi_p03_m03 = pair_set_epi16(sinpi[3], -sinpi[3]);
+  const __m128i sinpi_0_p03 = pair_set_epi16(0, sinpi[3]);
+  const __m128i sinpi_0_p02 = pair_set_epi16(0, sinpi[2]);
+  const __m128i sinpi_p03_p04 = pair_set_epi16(sinpi[3], sinpi[4]);
+  __m128i x0[4];
+  x0[0] = input[0];
+  x0[1] = input[1];
+  x0[2] = input[2];
+  x0[3] = input[3];
+
+  __m128i u[4];
+  u[0] = _mm_unpacklo_epi16(x0[0], x0[2]);
+  u[1] = _mm_unpackhi_epi16(x0[0], x0[2]);
+  u[2] = _mm_unpacklo_epi16(x0[1], x0[3]);
+  u[3] = _mm_unpackhi_epi16(x0[1], x0[3]);
+
+  __m128i x1[16];
+  x1[0] = _mm_madd_epi16(u[0], sinpi_p01_p04);  // x0*sin1 + x2*sin4
+  x1[1] = _mm_madd_epi16(u[1], sinpi_p01_p04);
+  x1[2] = _mm_madd_epi16(u[0], sinpi_p02_m01);  // x0*sin2 - x2*sin1
+  x1[3] = _mm_madd_epi16(u[1], sinpi_p02_m01);
+  x1[4] = _mm_madd_epi16(u[2], sinpi_p03_p02);  // x1*sin3 + x3*sin2
+  x1[5] = _mm_madd_epi16(u[3], sinpi_p03_p02);
+  x1[6] = _mm_madd_epi16(u[2], sinpi_p03_m04);  // x1*sin3 - x3*sin4
+  x1[7] = _mm_madd_epi16(u[3], sinpi_p03_m04);
+  x1[8] = _mm_madd_epi16(u[0], sinpi_p03_m03);  // x0*sin3 - x2*sin3
+  x1[9] = _mm_madd_epi16(u[1], sinpi_p03_m03);
+  x1[10] = _mm_madd_epi16(u[2], sinpi_0_p03);  // x3*sin3
+  x1[11] = _mm_madd_epi16(u[3], sinpi_0_p03);
+  x1[12] = _mm_madd_epi16(u[2], sinpi_0_p02);  // x3*sin2
+  x1[13] = _mm_madd_epi16(u[3], sinpi_0_p02);
+  x1[14] = _mm_madd_epi16(u[2], sinpi_p03_p04);  // x1*sin3 + x3*sin4
+  x1[15] = _mm_madd_epi16(u[3], sinpi_p03_p04);
+
+  __m128i x2[8];
+  x2[0] = _mm_add_epi32(x1[0], x1[4]);  // x0*sin1 + x2*sin4 + x1*sin3 + x3*sin2
+  x2[1] = _mm_add_epi32(x1[1], x1[5]);
+  x2[2] = _mm_add_epi32(x1[2], x1[6]);  // x0*sin2 - x2*sin1 + x1*sin3 - x3*sin4
+  x2[3] = _mm_add_epi32(x1[3], x1[7]);
+  x2[4] = _mm_add_epi32(x1[8], x1[10]);  // x0*sin3 - x2*sin3 + x3*sin3
+  x2[5] = _mm_add_epi32(x1[9], x1[11]);
+  x2[6] = _mm_add_epi32(x1[0], x1[2]);  // x0*sin1 + x2*sin4 + x0*sin2 - x2*sin1
+  x2[7] = _mm_add_epi32(x1[1], x1[3]);
+  x2[6] = _mm_add_epi32(
+      x2[6], x1[12]);  // x0*sin1 + x2*sin4 + x3*sin2 + x0*sin2 - x2*sin1
+  x2[7] = _mm_add_epi32(x2[7], x1[13]);
+  x2[6] = _mm_sub_epi32(
+      x2[6], x1[14]);  // x2[6] - (x1*sin3 + x3*sin4)
+  x2[7] = _mm_sub_epi32(x2[7], x1[15]);
+
+  const __m128i rounding = _mm_set1_epi32(1 << (cos_bit - 1));
+  for (int i = 0; i < 4; ++i) {
+    __m128i out0 = _mm_add_epi32(x2[2 * i], rounding);
+    __m128i out1 = _mm_add_epi32(x2[2 * i + 1], rounding);
+    out0 = _mm_srai_epi32(out0, cos_bit);
+    out1 = _mm_srai_epi32(out1, cos_bit);
+    output[i] = _mm_packs_epi32(out0, out1);
+  }
+}
+
 void iadst8_new_sse2(const __m128i *input, __m128i *output, int8_t cos_bit) {
   const int32_t *cospi = cospi_arr(cos_bit);
   const __m128i __zero = _mm_setzero_si128();
@@ -1599,6 +1665,24 @@
   output[15] = _mm_subs_epi16(__zero, x8[1]);
 }
 
+static void iidentity4_new_sse2(const __m128i *input, __m128i *output,
+                                int8_t cos_bit) {
+  (void)cos_bit;
+  const __m128i scale = _mm_set1_epi16(NewSqrt2);
+  const __m128i rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i scale_rounding = _mm_unpacklo_epi16(scale, rounding);
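+  // Multiply by sqrt(2) with rounding: madd of the pair (x, 1) with
+  // (NewSqrt2, 1 << (NewSqrt2Bits - 1)) gives x * NewSqrt2 + rounding in
+  // 32 bits, which is then shifted right by NewSqrt2Bits.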
+  for (int i = 0; i < 4; ++i) {
+    __m128i a_lo = _mm_unpacklo_epi16(input[i], one);
+    __m128i a_hi = _mm_unpackhi_epi16(input[i], one);
+    __m128i b_lo = _mm_madd_epi16(a_lo, scale_rounding);
+    __m128i b_hi = _mm_madd_epi16(a_hi, scale_rounding);
+    __m128i c_lo = _mm_srai_epi32(b_lo, NewSqrt2Bits);
+    __m128i c_hi = _mm_srai_epi32(b_hi, NewSqrt2Bits);
+    output[i] = _mm_packs_epi32(c_lo, c_hi);
+  }
+}
+
 static void iidentity8_new_sse2(const __m128i *input, __m128i *output,
                                 int8_t cos_bit) {
   (void)cos_bit;
@@ -1661,6 +1745,20 @@
   return _mm_packus_epi16(x0, x0);
 }
 
+static INLINE void lowbd_write_buffer_4xn_sse2(__m128i *in, uint8_t *output,
+                                               int stride, int flipud,
+                                               const int height) {
+  int j = flipud ? (height - 1) : 0;
+  const int step = flipud ? -1 : 1;
+  const __m128i zero = _mm_setzero_si128();
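+  // For each row: load 4 destination pixels, widen to 16 bits, add the
+  // residual with signed saturation, then pack with unsigned saturation
+  // (clamping to [0, 255]) and store the 4 bytes back.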
+  for (int i = 0; i < height; ++i, j += step) {
+    const __m128i v = _mm_cvtsi32_si128(*((uint32_t *)(output + i * stride)));
+    __m128i u = _mm_adds_epi16(in[j], _mm_unpacklo_epi8(v, zero));
+    u = _mm_packus_epi16(u, zero);
+    *((uint32_t *)(output + i * stride)) = _mm_cvtsi128_si32(u);
+  }
+}
+
 static INLINE void lowbd_write_buffer_8xn_sse2(__m128i *in, uint8_t *output,
                                                int stride, int flipud,
                                                const int height) {
@@ -1674,7 +1772,7 @@
 }
 
 static const transform_1d_sse2 lowbd_txfm_all_1d_arr[TX_SIZES][TX_TYPES_1D] = {
-  { NULL, NULL, NULL, NULL },
+  { idct4_new_sse2, iadst4_new_sse2, iadst4_new_sse2, iidentity4_new_sse2 },
   { idct8_new_sse2, iadst8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
   { idct16_new_sse2, iadst16_new_sse2, iadst16_new_sse2, iidentity16_new_sse2 },
   { idct32_new_sse2, NULL, NULL, iidentity32_new_sse2 },
@@ -1683,16 +1781,18 @@
 #endif
 };
 
-void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
+void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output,
                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  __m128i buf[8];
-  const int8_t *shift = inv_txfm_shift_ls[TX_8X8];
-  const int txw_idx = get_txw_idx(TX_8X8);
-  const int txh_idx = get_txh_idx(TX_8X8);
+  __m128i buf[4];
+  const TX_SIZE tx_size = TX_4X4;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
   const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
   const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int buf_size = 8;
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
 
   const transform_1d_sse2 row_txfm =
       lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
@@ -1701,7 +1801,42 @@
 
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  load_buffer_32bit_to_16bit(input, 8, buf, buf_size);
+  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+  transpose_16bit_4x4(buf, buf);
+  row_txfm(buf, buf, cos_bit_row);
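+  // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0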
+  if (lr_flip) {
+    __m128i temp[4];
+    flip_buf_sse2(buf, temp, txfm_size_col);
+    transpose_16bit_4x4(temp, buf);
+  } else {
+    transpose_16bit_4x4(buf, buf);
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf[8];
+  const TX_SIZE tx_size = TX_8X8;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
   transpose_16bit_8x8(buf, buf);
   row_txfm(buf, buf, cos_bit_row);
   round_shift_16bit(buf, 8, shift[0]);
@@ -1858,6 +1993,80 @@
 }
 #endif
 
+void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf[8];
+  const TX_SIZE tx_size = TX_4X8;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  load_buffer_32bit_to_16bit_w4(input, txfm_size_col, buf, txfm_size_row);
+  transpose_16bit_4x8(buf, buf);
+  round_shift_sse2(buf, buf, txfm_size_col);  // rect special code
+  row_txfm(buf, buf, cos_bit_row);
+  // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0
+  if (lr_flip) {
+    __m128i temp[4];
+    flip_buf_sse2(buf, temp, txfm_size_col);
+    transpose_16bit_8x4(temp, buf);
+  } else {
+    transpose_16bit_8x4(buf, buf);
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output,
+                                       int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf[8];
+  const TX_SIZE tx_size = TX_8X4;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  load_buffer_32bit_to_16bit(input, txfm_size_col, buf, txfm_size_row);
+  transpose_16bit_8x4(buf, buf);
+  round_shift_sse2(buf, buf, txfm_size_col);  // rect special code
+  row_txfm(buf, buf, cos_bit_row);
+  // round_shift_16bit(buf, txfm_size_col, shift[0]); // shift[0] is 0
+  if (lr_flip) {
+    __m128i temp[8];
+    flip_buf_sse2(buf, temp, txfm_size_col);
+    transpose_16bit_4x8(temp, buf);
+  } else {
+    transpose_16bit_4x8(buf, buf);
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
 void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
@@ -1912,6 +2121,97 @@
 }
 #endif
 
+void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf[16];
+  const TX_SIZE tx_size = TX_4X16;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+
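+  // Process the 16 rows in two batches of 8: each batch is loaded, transposed
+  // and run through the 4-point row transform with 8 rows packed per register.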
+  const int row_one_loop = 8;
+  for (int i = 0; i < 2; ++i) {
+    const int32_t *input_cur = input + i * txfm_size_col * row_one_loop;
+    __m128i *buf_cur = buf + i * row_one_loop;
+    load_buffer_32bit_to_16bit_w4(input_cur, txfm_size_col, buf_cur,
+                                  row_one_loop);
+    transpose_16bit_4x8(buf_cur, buf_cur);
+    row_txfm(buf_cur, buf_cur, cos_bit_row);
+    round_shift_16bit(buf_cur, row_one_loop, shift[0]);
+    if (lr_flip) {
+      __m128i temp[8];
+      flip_buf_sse2(buf_cur, temp, txfm_size_col);
+      transpose_16bit_8x4(temp, buf_cur);
+    } else {
+      transpose_16bit_8x4(buf_cur, buf_cur);
+    }
+  }
+  col_txfm(buf, buf, cos_bit_col);
+  round_shift_16bit(buf, txfm_size_row, shift[1]);
+  lowbd_write_buffer_4xn_sse2(buf, output, stride, ud_flip, txfm_size_row);
+}
+
+void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  __m128i buf[16];
+  const TX_SIZE tx_size = TX_16X4;
+  const int8_t *shift = inv_txfm_shift_ls[tx_size];
+  const int txw_idx = get_txw_idx(tx_size);
+  const int txh_idx = get_txh_idx(tx_size);
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
+
+  int ud_flip, lr_flip;
+  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
+  const int row_one_loop = 8;
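+  // The 16 columns are handled as two 8-column halves: each half is loaded
+  // and transposed separately, and the 4-point column transform and the
+  // final write-back below also run once per half.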
+  for (int i = 0; i < buf_size_w_div8; ++i) {
+    const int32_t *input_cur = input + i * row_one_loop;
+    __m128i *buf_cur = buf + i * row_one_loop;
+    load_buffer_32bit_to_16bit(input_cur, txfm_size_col, buf_cur,
+                               txfm_size_row);
+    transpose_16bit_8x4(buf_cur, buf_cur);
+  }
+  row_txfm(buf, buf, cos_bit_row);
+  round_shift_16bit(buf, txfm_size_col, shift[0]);
+  if (lr_flip) {
+    __m128i temp[16];
+    flip_buf_sse2(buf, temp, 16);
+    transpose_16bit_4x8(temp, buf);
+    transpose_16bit_4x8(temp + 8, buf + 8);
+  } else {
+    transpose_16bit_4x8(buf, buf);
+    transpose_16bit_4x8(buf + row_one_loop, buf + row_one_loop);
+  }
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf + i * row_one_loop, buf + i * row_one_loop, cos_bit_col);
+    round_shift_16bit(buf + i * row_one_loop, txfm_size_row, shift[1]);
+  }
+  lowbd_write_buffer_8xn_sse2(buf, output, stride, ud_flip, 4);
+  lowbd_write_buffer_8xn_sse2(buf + 8, output + 8, stride, ud_flip, 4);
+}
+
 void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output,
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
@@ -1957,15 +2257,15 @@
                               TX_TYPE tx_type, int bd);
 
 static inv_txfm_func inv_txfm_func_ls[TX_SIZES_ALL] = {
-  NULL,                                 // 4x4
+  av1_lowbd_inv_txfm2d_add_4x4_sse2,    // 4x4
   av1_lowbd_inv_txfm2d_add_8x8_sse2,    // 8x8
   av1_lowbd_inv_txfm2d_add_16x16_sse2,  // 16x16
   av1_lowbd_inv_txfm2d_add_32x32_sse2,  // 32x32
 #if CONFIG_TX64X64
   av1_lowbd_inv_txfm2d_add_64x64_sse2,  // 64x64
 #endif                                  // CONFIG_TX64X64
-  NULL,                                 // 4x8
-  NULL,                                 // 8x4
+  av1_lowbd_inv_txfm2d_add_4x8_sse2,    // 4x8
+  av1_lowbd_inv_txfm2d_add_8x4_sse2,    // 8x4
   av1_lowbd_inv_txfm2d_add_8x16_sse2,   // 8x16
   av1_lowbd_inv_txfm2d_add_16x8_sse2,   // 16x8
   av1_lowbd_inv_txfm2d_add_16x32_sse2,  // 16x32
@@ -1974,8 +2274,8 @@
   av1_lowbd_inv_txfm2d_add_32x64_sse2,  // 32x64
   av1_lowbd_inv_txfm2d_add_64x32_sse2,  // 64x32
 #endif                                  // CONFIG_TX64X64
-  NULL,                                 // 4x16
-  NULL,                                 // 16x4
+  av1_lowbd_inv_txfm2d_add_4x16_sse2,   // 4x16
+  av1_lowbd_inv_txfm2d_add_16x4_sse2,   // 16x4
   av1_lowbd_inv_txfm2d_add_8x32_sse2,   // 8x32
   av1_lowbd_inv_txfm2d_add_32x8_sse2,   // 32x8
 #if CONFIG_TX64X64
@@ -1988,7 +2288,7 @@
                            const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
   const inv_txfm_func inv_func = inv_txfm_func_ls[txfm_param->tx_size];
-  if (inv_func != NULL) {
+  if (inv_func != NULL && (!txfm_param->lossless)) {
     inv_func(dqcoeff, dst, stride, tx_type, txfm_param->bd);
   } else {
     av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h
index a924003..efbcae7 100644
--- a/av1/common/x86/av1_txfm_sse2.h
+++ b/av1/common/x86/av1_txfm_sse2.h
@@ -59,6 +59,11 @@
   return _mm_packs_epi32(a_low, *(const __m128i *)(a + 4));
 }
 
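+// Load 4 32 bit values and pack (saturate) them into the low 4 16 bit lanes;
+// the same values are duplicated into the high lanes. The input must be
+// 16-byte aligned (_mm_load_si128).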
+static INLINE __m128i load_32bit_to_16bit_w4(const int32_t *a) {
+  const __m128i a_low = _mm_load_si128((const __m128i *)a);
+  return _mm_packs_epi32(a_low, a_low);
+}
+
 // Store 8 16 bit values. Sign extend the values.
 static INLINE void store_16bit_to_32bit(__m128i a, int32_t *b) {
   const __m128i a_lo = _mm_unpacklo_epi16(a, a);
@@ -107,6 +112,13 @@
   }
 }
 
+static INLINE void load_buffer_32bit_to_16bit_w4(const int32_t *in, int stride,
+                                                 __m128i *out, int out_size) {
+  for (int i = 0; i < out_size; ++i) {
+    out[i] = load_32bit_to_16bit_w4(in + i * stride);
+  }
+}
+
 static INLINE void load_buffer_32bit_to_16bit_flip(const int32_t *in,
                                                    int stride, __m128i *out,
                                                    int out_size) {
@@ -194,6 +206,9 @@
   transform_1d_sse2 col, row;  // vertical and horizontal
 } transform_2d_sse2;
 
+void av1_lowbd_inv_txfm2d_add_4x4_sse2(const int32_t *input, uint8_t *output,
+                                       int stride, TX_TYPE tx_type, int bd);
+
 void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
                                        int stride, TX_TYPE tx_type, int bd);
 
@@ -208,6 +223,12 @@
                                          int stride, TX_TYPE tx_type, int bd);
 #endif
 
+void av1_lowbd_inv_txfm2d_add_4x8_sse2(const int32_t *input, uint8_t *output,
+                                       int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_inv_txfm2d_add_8x4_sse2(const int32_t *input, uint8_t *output,
+                                       int stride, TX_TYPE tx_type, int bd);
+
 void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
                                         int stride, TX_TYPE tx_type, int bd);
 
@@ -228,6 +249,12 @@
                                          int stride, TX_TYPE tx_type, int bd);
 #endif
 
+void av1_lowbd_inv_txfm2d_add_4x16_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_inv_txfm2d_add_16x4_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd);
+
 void av1_lowbd_inv_txfm2d_add_8x32_sse2(const int32_t *input, uint8_t *output,
                                         int stride, TX_TYPE tx_type, int bd);
 
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index 5d56cdc..6e74534 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -329,15 +329,15 @@
 #include "av1/common/x86/av1_txfm_sse2.h"
 
 const LbdInvTxfm2dFunc kLbdInvFuncSSE2List[TX_SIZES_ALL] = {
-  NULL,                                 // TX_4X4
+  av1_lowbd_inv_txfm2d_add_4x4_sse2,    // TX_4X4
   av1_lowbd_inv_txfm2d_add_8x8_sse2,    // TX_8X8
   av1_lowbd_inv_txfm2d_add_16x16_sse2,  // TX_16X16
   av1_lowbd_inv_txfm2d_add_32x32_sse2,  // TX_32X32
 #if CONFIG_TX64X64
   av1_lowbd_inv_txfm2d_add_64x64_sse2,  // 64x64
 #endif                                  // CONFIG_TX64X64
-  NULL,                                 // TX_4X8
-  NULL,                                 // TX_8X4
+  av1_lowbd_inv_txfm2d_add_4x8_sse2,    // TX_4X8
+  av1_lowbd_inv_txfm2d_add_8x4_sse2,    // TX_8X4
   av1_lowbd_inv_txfm2d_add_8x16_sse2,   // TX_8X16
   av1_lowbd_inv_txfm2d_add_16x8_sse2,   // TX_16X8
   av1_lowbd_inv_txfm2d_add_16x32_sse2,  // TX_16X32
@@ -346,8 +346,8 @@
   av1_lowbd_inv_txfm2d_add_32x64_sse2,  // TX_32X64
   av1_lowbd_inv_txfm2d_add_64x32_sse2,  // TX_64X32
 #endif                                  // CONFIG_TX64X64
-  NULL,                                 // TX_4X16
-  NULL,                                 // TX_16X4
+  av1_lowbd_inv_txfm2d_add_4x16_sse2,   // TX_4X16
+  av1_lowbd_inv_txfm2d_add_16x4_sse2,   // TX_16X4
   av1_lowbd_inv_txfm2d_add_8x32_sse2,   // 8x32
   av1_lowbd_inv_txfm2d_add_32x8_sse2,   // 32x8
 #if CONFIG_TX64X64