Refactor the lowbd_inv_txfm2d_add SSE2 code

1. Reorder the functions to match the TX_SIZE definition order.
2. Merge the per-TX_SIZE functions, which share nearly identical code,
   into a single universal function
   lowbd_inv_txfm2d_add_internal_sse2; the per-size entry points become
   thin wrappers (see the sketch below).
3. No speed impact was observed for any size except 8x8, so the
   specialized 8x8 version stays unchanged.
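
For illustration, each per-size entry point now reduces to a one-line
wrapper around the shared routine; this mirrors the 16x16 wrapper in
the diff below:

  void av1_lowbd_inv_txfm2d_add_16x16_sse2(const int32_t *input,
                                           uint8_t *output, int stride,
                                           TX_TYPE tx_type, int bd) {
    (void)bd;  // bit depth is unused in the low-bitdepth path
    lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type,
                                       TX_16X16);
  }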

Change-Id: Ic896aacd93745906716582af855774807a863231
diff --git a/av1/common/x86/av1_inv_txfm_sse2.c b/av1/common/x86/av1_inv_txfm_sse2.c
index bf595a1..e1106f3 100644
--- a/av1/common/x86/av1_inv_txfm_sse2.c
+++ b/av1/common/x86/av1_inv_txfm_sse2.c
@@ -1653,22 +1653,14 @@
   }
 }
 
-static const transform_1d_sse2 lowbd_txfm8_1d_arr[TX_TYPES_1D] = {
-  idct8_new_sse2, iadst8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2
-};
-
-static const transform_1d_sse2 lowbd_txfm16_1d_arr[TX_TYPES_1D] = {
-  idct16_new_sse2,
-  iadst16_new_sse2,
-  iadst16_new_sse2,
-  iidentity16_new_sse2,
-};
-
-static const transform_1d_sse2 lowbd_txfm32_1d_arr[TX_TYPES_1D] = {
-  idct32_new_sse2,
-  NULL,
-  NULL,
-  iidentity32_new_sse2,
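+// 1D inverse transform kernels, indexed by [size index][1D tx type];
+// NULL marks size/type combinations with no SSE2 kernel.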
+static const transform_1d_sse2 lowbd_txfm_all_1d_arr[TX_SIZES][TX_TYPES_1D] = {
+  { NULL, NULL, NULL, NULL },
+  { idct8_new_sse2, iadst8_new_sse2, iadst8_new_sse2, iidentity8_new_sse2 },
+  { idct16_new_sse2, iadst16_new_sse2, iadst16_new_sse2, iidentity16_new_sse2 },
+  { idct32_new_sse2, NULL, NULL, iidentity32_new_sse2 },
+#if CONFIG_TX64X64
+  { idct64_new_sse2, NULL, NULL, NULL },
+#endif
 };
 
 void av1_lowbd_inv_txfm2d_add_8x8_sse2(const int32_t *input, uint8_t *output,
@@ -1682,8 +1674,10 @@
   const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
   const int buf_size = 8;
 
-  const transform_1d_sse2 row_txfm = lowbd_txfm8_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm8_1d_arr[vtx_tab[tx_type]];
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
 
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
@@ -1724,56 +1718,8 @@
   }
 }
 
-void av1_lowbd_inv_txfm2d_add_16x16_sse2(const int32_t *input, uint8_t *output,
-                                         int stride, TX_TYPE tx_type, int bd) {
-  (void)bd;
-  __m128i buf1[32];
-  const TX_SIZE tx_size = TX_16X16;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int buf_size_w = tx_size_wide[tx_size];
-  const int buf_size_h = tx_size_high[tx_size];
-
-  const transform_1d_sse2 row_txfm = lowbd_txfm16_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm16_1d_arr[vtx_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  // i=0 do up 16x8,i=1 do down 16x8
-  for (int i = 0; i < 2; i++) {
-    __m128i buf0[16];
-    const int32_t *input_row = input + i * buf_size_w * 8;
-    for (int j = 0; j < 2; ++j) {
-      __m128i *buf0_cur = buf0 + 8 * j;
-      load_buffer_32bit_to_16bit(input_row + j * 8, buf_size_w, buf0_cur, 8);
-      transpose_16bit_8x8(buf0_cur, buf0_cur);
-    }
-
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit(buf0, buf_size_w, shift[0]);
-    if (lr_flip) {
-      for (int j = 0; j < 2; ++j) {
-        __m128i temp[8];
-        flip_buf_sse2(buf0 + 8 * j, temp, 8);
-        transpose_16bit_8x8(temp, buf1 + i * 8 + (1 - j) * buf_size_w);
-      }
-    } else {
-      for (int j = 0; j < 2; ++j) {
-        transpose_16bit_8x8(buf0 + j * 8, buf1 + i * 8 + j * buf_size_h);
-      }
-    }
-  }
-  for (int i = 0; i < 2; i++) {
-    col_txfm(buf1 + i * buf_size_h, buf1 + i * buf_size_h, cos_bit_col);
-    round_shift_16bit(buf1 + i * buf_size_h, buf_size_h, shift[1]);
-  }
-  lowbd_write_buffer_16xn_sse2(buf1, output, stride, ud_flip, buf_size_h);
-}
-
-static void round_shift_sse2(const __m128i *input, __m128i *output, int size) {
+static INLINE void round_shift_sse2(const __m128i *input, __m128i *output,
+                                    int size) {
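+  // Multiply each 16-bit element by NewInvSqrt2 and round-shift by
+  // NewSqrt2Bits, i.e. scale by 1/sqrt(2) for rectangular transforms.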
   const __m128i scale = _mm_set1_epi16(NewInvSqrt2);
   const __m128i rounding = _mm_set1_epi16(1 << (NewSqrt2Bits - 1));
   const __m128i one = _mm_set1_epi16(1);
@@ -1789,249 +1735,109 @@
   }
 }
 
-void av1_lowbd_inv_txfm2d_add_16x8_sse2(const int32_t *input, uint8_t *output,
-                                        int stride, TX_TYPE tx_type, int bd) {
-  (void)bd;
-  __m128i buf0[16];
-  const TX_SIZE tx_size = TX_16X8;
+static INLINE void lowbd_inv_txfm2d_add_internal_sse2(const int32_t *input,
+                                                      uint8_t *output,
+                                                      int stride,
+                                                      TX_TYPE tx_type,
+                                                      TX_SIZE tx_size) {
+  __m128i buf1[64 * 8];
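+  // buf1: room for the largest block, 64 * 8 vectors x 8 int16 lanes =
+  // 64x64; it holds the row-transform results in transposed order.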
   const int8_t *shift = inv_txfm_shift_ls[tx_size];
   const int txw_idx = get_txw_idx(tx_size);
   const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
   const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int buf_size_w = tx_size_wide[tx_size];
-  const int buf_size_h = tx_size_high[tx_size];
+  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
+  const int txfm_size_col = tx_size_wide[tx_size];
+  const int txfm_size_row = tx_size_high[tx_size];
+  const int buf_size_w_div8 = txfm_size_col >> 3;
+  const int buf_size_h_div8 = txfm_size_row >> 3;
+  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
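+  // |rect_type| == 1 (2:1 or 1:2 blocks) triggers an extra 1/sqrt(2)
+  // scaling of the loaded coefficients below.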
 
-  const transform_1d_sse2 row_txfm = lowbd_txfm16_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm8_1d_arr[vtx_tab[tx_type]];
+  const transform_1d_sse2 row_txfm =
+      lowbd_txfm_all_1d_arr[txw_idx][htx_tab[tx_type]];
+  const transform_1d_sse2 col_txfm =
+      lowbd_txfm_all_1d_arr[txh_idx][vtx_tab[tx_type]];
 
+  assert(col_txfm != NULL);
+  assert(row_txfm != NULL);
   int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  const int32_t *input_row = input;
-  for (int j = 0; j < 2; ++j) {
-    __m128i *buf0_cur = buf0 + 8 * j;
-    load_buffer_32bit_to_16bit(input_row + j * 8, buf_size_w, buf0_cur, 8);
-    transpose_16bit_8x8(buf0_cur, buf0_cur);
-  }
-
-  round_shift_sse2(buf0, buf0, buf_size_w);  // rect special code
-  row_txfm(buf0, buf0, cos_bit_row);
-  round_shift_16bit(buf0, buf_size_w, shift[0]);
-  if (lr_flip) {
-    __m128i temp[16];
-    flip_buf_sse2(buf0, temp, buf_size_w);
-    transpose_16bit_8x8(temp, buf0);
-    transpose_16bit_8x8(temp + 8, buf0 + 8);
-  } else {
-    transpose_16bit_8x8(buf0, buf0);
-    transpose_16bit_8x8(buf0 + 8, buf0 + 8);
-  }
-
-  for (int i = 0; i < 2; i++) {
-    col_txfm(buf0 + i * buf_size_h, buf0 + i * buf_size_h, cos_bit_col);
-    round_shift_16bit(buf0 + i * buf_size_h, buf_size_h, shift[1]);
-  }
-  lowbd_write_buffer_16xn_sse2(buf0, output, stride, ud_flip, buf_size_h);
-}
-
-void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
-                                        int stride, TX_TYPE tx_type, int bd) {
-  (void)bd;
-  __m128i buf0[16];
-  const TX_SIZE tx_size = TX_8X16;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int buf_size_w = tx_size_wide[tx_size];
-  const int buf_size_h = tx_size_high[tx_size];
-
-  const transform_1d_sse2 row_txfm = lowbd_txfm8_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm16_1d_arr[vtx_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < 2; i++) {
-    __m128i *buf_cur = buf0 + 8 * i;
-    load_buffer_32bit_to_16bit(input + i * buf_size_w * 8, buf_size_w, buf_cur,
-                               8);
-    transpose_16bit_8x8(buf_cur, buf_cur);
-    round_shift_sse2(buf_cur, buf_cur, buf_size_w);  // rect special code
-    row_txfm(buf_cur, buf_cur, cos_bit_row);
-    round_shift_16bit(buf_cur, buf_size_w, shift[0]);
+  for (int i = 0; i < buf_size_h_div8; i++) {
+    __m128i buf0[64];
+    const int32_t *input_row = input + i * txfm_size_col * 8;
+    for (int j = 0; j < buf_size_w_div8; ++j) {
+      __m128i *buf0_cur = buf0 + j * 8;
+      load_buffer_32bit_to_16bit(input_row + j * 8, txfm_size_col, buf0_cur, 8);
+      transpose_16bit_8x8(buf0_cur, buf0_cur);
+    }
+    if (rect_type == 1 || rect_type == -1) {
+      round_shift_sse2(buf0, buf0, txfm_size_col);  // rect special code
+    }
+    row_txfm(buf0, buf0, cos_bit_row);
+    round_shift_16bit(buf0, txfm_size_col, shift[0]);
+    __m128i *_buf1 = buf1 + i * 8;
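+    // Transpose this 8-row strip into buf1, reversing and flipping the
+    // 8x8 tiles when the transform is left/right flipped.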
     if (lr_flip) {
-      __m128i temp[8];
-      flip_buf_sse2(buf_cur, temp, buf_size_w);
-      transpose_16bit_8x8(temp, buf_cur);
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        __m128i temp[8];
+        flip_buf_sse2(buf0 + 8 * j, temp, 8);
+        transpose_16bit_8x8(temp,
+                            _buf1 + txfm_size_row * (buf_size_w_div8 - 1 - j));
+      }
     } else {
-      transpose_16bit_8x8(buf_cur, buf_cur);
+      for (int j = 0; j < buf_size_w_div8; ++j) {
+        transpose_16bit_8x8(buf0 + 8 * j, _buf1 + txfm_size_row * j);
+      }
     }
   }
-  col_txfm(buf0, buf0, cos_bit_col);
-  round_shift_16bit(buf0, buf_size_h, shift[1]);
-  lowbd_write_buffer_8xn_sse2(buf0, output, stride, ud_flip, buf_size_h);
+  for (int i = 0; i < buf_size_w_div8; i++) {
+    col_txfm(buf1 + i * txfm_size_row, buf1 + i * txfm_size_row, cos_bit_col);
+    round_shift_16bit(buf1 + i * txfm_size_row, txfm_size_row, shift[1]);
+  }
+
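+  // Add the inverse-transform result to the destination buffer, in
+  // strips of 16 (or 8) columns.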
+  if (txfm_size_col >= 16) {
+    for (int i = 0; i < (txfm_size_col >> 4); i++) {
+      lowbd_write_buffer_16xn_sse2(buf1 + i * txfm_size_row * 2,
+                                   output + 16 * i, stride, ud_flip,
+                                   txfm_size_row);
+    }
+  } else if (txfm_size_col == 8) {
+    lowbd_write_buffer_8xn_sse2(buf1, output, stride, ud_flip, txfm_size_row);
+  }
+}
+
+void av1_lowbd_inv_txfm2d_add_16x16_sse2(const int32_t *input, uint8_t *output,
+                                         int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type, TX_16X16);
 }
 
 void av1_lowbd_inv_txfm2d_add_32x32_sse2(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  __m128i buf1[32 * 4];
-  const TX_SIZE tx_size = TX_32X32;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int buf_size_w = tx_size_wide[tx_size];
-  const int buf_size_h = tx_size_high[tx_size];
-
-  const transform_1d_sse2 row_txfm = lowbd_txfm32_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm32_1d_arr[vtx_tab[tx_type]];
-
-  assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < 4; i++) {
-    __m128i buf0[32];
-    const int32_t *input_row = input + i * buf_size_w * 8;
-    for (int j = 0; j < 4; ++j) {
-      __m128i *buf0_cur = buf0 + j * 8;
-      load_buffer_32bit_to_16bit(input_row + j * 8, buf_size_w, buf0_cur, 8);
-      transpose_16bit_8x8(buf0_cur, buf0_cur);
-    }
-
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit(buf0, buf_size_w, shift[0]);
-    __m128i *buf1_cur = buf1 + i * 8;
-    if (lr_flip) {
-      for (int j = 0; j < 4; ++j) {
-        __m128i temp[8];
-        flip_buf_sse2(buf0 + 8 * j, temp, 8);
-        transpose_16bit_8x8(temp, buf1_cur + buf_size_w * (3 - j));
-      }
-    } else {
-      for (int j = 0; j < 4; ++j) {
-        transpose_16bit_8x8(buf0 + 8 * j, buf1_cur + buf_size_h * j);
-      }
-    }
-  }
-  for (int i = 0; i < 4; i++) {
-    col_txfm(buf1 + i * buf_size_h, buf1 + i * buf_size_h, cos_bit_col);
-    round_shift_16bit(buf1 + i * buf_size_h, buf_size_h, shift[1]);
-  }
-  lowbd_write_buffer_16xn_sse2(buf1, output, stride, ud_flip, buf_size_h);
-  lowbd_write_buffer_16xn_sse2(buf1 + 64, output + 16, stride, ud_flip,
-                               buf_size_h);
+  lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type, TX_32X32);
 }
 
-void av1_lowbd_inv_txfm2d_add_32x16_sse2(const int32_t *input, uint8_t *output,
-                                         int stride, TX_TYPE tx_type, int bd) {
+void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  __m128i buf1[32 * 2];
-  const TX_SIZE tx_size = TX_32X16;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int buf_size_w = tx_size_wide[tx_size];
-  const int buf_size_h = tx_size_high[tx_size];
-  const int buf_size_w_div8 = buf_size_w >> 3;
-  const int buf_size_h_div8 = buf_size_h >> 3;
+  lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type, TX_8X16);
+}
 
-  const transform_1d_sse2 row_txfm = lowbd_txfm32_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm16_1d_arr[vtx_tab[tx_type]];
-
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_h_div8; i++) {
-    __m128i buf0[32];
-    const int32_t *input_row = input + i * buf_size_w * 8;
-    for (int j = 0; j < buf_size_w_div8; ++j) {
-      __m128i *buf0_cur = buf0 + j * 8;
-      load_buffer_32bit_to_16bit(input_row + j * 8, buf_size_w, buf0_cur, 8);
-      transpose_16bit_8x8(buf0_cur, buf0_cur);
-    }
-    round_shift_sse2(buf0, buf0, buf_size_w);  // rect special code
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit(buf0, buf_size_w, shift[0]);
-    __m128i *buf1_cur = buf1 + i * 8;
-    if (lr_flip) {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        __m128i temp[8];
-        flip_buf_sse2(buf0 + 8 * j, temp, 8);
-        transpose_16bit_8x8(temp,
-                            buf1_cur + buf_size_h * (buf_size_w_div8 - 1 - j));
-      }
-    } else {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        transpose_16bit_8x8(buf0 + 8 * j, buf1_cur + buf_size_h * j);
-      }
-    }
-  }
-  for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * buf_size_h, buf1 + i * buf_size_h, cos_bit_col);
-    round_shift_16bit(buf1 + i * buf_size_h, buf_size_h, shift[1]);
-  }
-  lowbd_write_buffer_16xn_sse2(buf1, output, stride, ud_flip, buf_size_h);
-  lowbd_write_buffer_16xn_sse2(buf1 + 32, output + 16, stride, ud_flip,
-                               buf_size_h);
+void av1_lowbd_inv_txfm2d_add_16x8_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type, TX_16X8);
 }
 
 void av1_lowbd_inv_txfm2d_add_16x32_sse2(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
-  __m128i buf1[32 * 4];
-  const TX_SIZE tx_size = TX_16X32;
-  const int8_t *shift = inv_txfm_shift_ls[tx_size];
-  const int txw_idx = get_txw_idx(tx_size);
-  const int txh_idx = get_txh_idx(tx_size);
-  const int cos_bit_col = inv_cos_bit_col[txw_idx][txh_idx];
-  const int cos_bit_row = inv_cos_bit_row[txw_idx][txh_idx];
-  const int buf_size_w = tx_size_wide[tx_size];
-  const int buf_size_h = tx_size_high[tx_size];
-  const int buf_size_w_div8 = buf_size_w >> 3;
-  const int buf_size_h_div8 = buf_size_h >> 3;
+  lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type, TX_16X32);
+}
 
-  const transform_1d_sse2 row_txfm = lowbd_txfm16_1d_arr[htx_tab[tx_type]];
-  const transform_1d_sse2 col_txfm = lowbd_txfm32_1d_arr[vtx_tab[tx_type]];
-
-  assert(col_txfm != NULL);
-  assert(row_txfm != NULL);
-  int ud_flip, lr_flip;
-  get_flip_cfg(tx_type, &ud_flip, &lr_flip);
-  for (int i = 0; i < buf_size_h_div8; i++) {
-    __m128i buf0[16];  // buffer __m128i with count of buf_size_w
-    const int32_t *input_row = input + i * buf_size_w * 8;
-    for (int j = 0; j < buf_size_w_div8; ++j) {
-      __m128i *buf0_cur = buf0 + j * 8;
-      load_buffer_32bit_to_16bit(input_row + j * 8, buf_size_w, buf0_cur, 8);
-      transpose_16bit_8x8(buf0_cur, buf0_cur);
-    }
-    round_shift_sse2(buf0, buf0, buf_size_w);  // rect special code
-    row_txfm(buf0, buf0, cos_bit_row);
-    round_shift_16bit(buf0, buf_size_w, shift[0]);
-    __m128i *buf1_cur = buf1 + i * 8;
-    if (lr_flip) {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        __m128i temp[8];
-        flip_buf_sse2(buf0 + 8 * j, temp, 8);
-        transpose_16bit_8x8(temp,
-                            buf1_cur + buf_size_h * (buf_size_w_div8 - 1 - j));
-      }
-    } else {
-      for (int j = 0; j < buf_size_w_div8; ++j) {
-        transpose_16bit_8x8(buf0 + 8 * j, buf1_cur + buf_size_h * j);
-      }
-    }
-  }
-  for (int i = 0; i < buf_size_w_div8; i++) {
-    col_txfm(buf1 + i * buf_size_h, buf1 + i * buf_size_h, cos_bit_col);
-    round_shift_16bit(buf1 + i * buf_size_h, buf_size_h, shift[1]);
-  }
-  lowbd_write_buffer_16xn_sse2(buf1, output, stride, ud_flip, buf_size_h);
+void av1_lowbd_inv_txfm2d_add_32x16_sse2(const int32_t *input, uint8_t *output,
+                                         int stride, TX_TYPE tx_type, int bd) {
+  (void)bd;
+  lowbd_inv_txfm2d_add_internal_sse2(input, output, stride, tx_type, TX_32X16);
 }
 
 typedef void (*inv_txfm_func)(const int32_t *input, uint8_t *output, int stride,
diff --git a/av1/common/x86/av1_txfm_sse2.h b/av1/common/x86/av1_txfm_sse2.h
index 8fba6cf..d7f0511 100644
--- a/av1/common/x86/av1_txfm_sse2.h
+++ b/av1/common/x86/av1_txfm_sse2.h
@@ -200,20 +200,21 @@
 void av1_lowbd_inv_txfm2d_add_16x16_sse2(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type, int bd);
 
-void av1_lowbd_inv_txfm2d_add_16x8_sse2(const int32_t *input, uint8_t *output,
-                                        int stride, TX_TYPE tx_type, int bd);
+void av1_lowbd_inv_txfm2d_add_32x32_sse2(const int32_t *input, uint8_t *output,
+                                         int stride, TX_TYPE tx_type, int bd);
 
 void av1_lowbd_inv_txfm2d_add_8x16_sse2(const int32_t *input, uint8_t *output,
                                         int stride, TX_TYPE tx_type, int bd);
 
-void av1_lowbd_inv_txfm2d_add_32x32_sse2(const int32_t *input, uint8_t *output,
+void av1_lowbd_inv_txfm2d_add_16x8_sse2(const int32_t *input, uint8_t *output,
+                                        int stride, TX_TYPE tx_type, int bd);
+
+void av1_lowbd_inv_txfm2d_add_16x32_sse2(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type, int bd);
 
 void av1_lowbd_inv_txfm2d_add_32x16_sse2(const int32_t *input, uint8_t *output,
                                          int stride, TX_TYPE tx_type, int bd);
 
-void av1_lowbd_inv_txfm2d_add_16x32_sse2(const int32_t *input, uint8_t *output,
-                                         int stride, TX_TYPE tx_type, int bd);
 #ifdef __cplusplus
 }
 #endif  // __cplusplus