Merge buffer load/store into transforms in av1_fwd_txfm2d_neon.c The buffer load/store steps in txfm2d currently cannot be inlined into the transforms themselves since the exact kernel called depends on the function pointer out of an array lookup. In practice this means the generated assembly at the start and end of the overall transform is mostly just loads and stores between the stack and the input/output arrays. This commit eliminates this overhead by merging the strided load/store steps into the component transform kernels themselves. Running the txfm2d speed test benchmarks on a Neoverse N2 machine with Clang 16 and GCC 12 gives a geomean 6.8% reduction in the times reported. Change-Id: I8e3c46daf334eb72e26130a3c63d9cdb5815eaa6

diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index ee6c2f4..500d957 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c

@@ -212,46 +212,44 @@
   }
 }
 
-static INLINE void store_output_32bit_w8(int32_t *const out,
-                                         const int32x4_t *const in1,
-                                         const int32x4_t *const in2,
-                                         const int stride, const int out_size) {
+static INLINE void store_buffer_interleaved_s32_x8(int32_t *const out,
+                                                   const int32x4_t *const in1,
+                                                   const int32x4_t *const in2,
+                                                   const int stride,
+                                                   const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     vst1q_s32(out + stride * i, in1[i]);
     vst1q_s32(out + stride * i + 4, in2[i]);
   }
 }
 
-static INLINE void load_buffer_16bit_to_16bit_w4(const int16_t *in,
-                                                 const int stride,
-                                                 int16x4_t *const out,
-                                                 const int out_size) {
+static INLINE void load_buffer_s16_x4(const int16_t *in, const int stride,
+                                      int16x4_t *const out,
+                                      const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     out[i] = vld1_s16(in);
     in += stride;
   }
 }
 
-static INLINE void load_buffer_16bit_to_16bit(const int16_t *in, int stride,
-                                              int16x8_t *out, int out_size) {
+static INLINE void load_buffer_s16_x8(const int16_t *in, int stride,
+                                      int16x8_t *out, int out_size) {
   for (int i = 0; i < out_size; ++i) {
     out[i] = vld1q_s16(in + i * stride);
   }
 }
 
-static INLINE void store_buffer_16bit_to_32bit_4_to_4(const int16x4_t *const in,
-                                                      int32_t *const out,
-                                                      const int stride,
-                                                      const int out_size) {
+static INLINE void store_buffer_s16_x4(const int16x4_t *const in,
+                                       int32_t *const out, const int stride,
+                                       const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     vst1q_s32(out + i * stride, vmovl_s16(in[i]));
   }
 }
 
-static INLINE void store_buffer_16bit_to_32bit_w8(const int16x8_t *const in,
-                                                  int32_t *const out,
-                                                  const int stride,
-                                                  const int out_size) {
+static INLINE void store_buffer_s16_x8(const int16x8_t *const in,
+                                       int32_t *const out, const int stride,
+                                       const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     vst1q_s32(out + i * stride + 0, vmovl_s16(vget_low_s16(in[i])));
     vst1q_s32(out + i * stride + 4, vmovl_s16(vget_high_s16(in[i])));
@@ -311,17 +309,19 @@
 ROUND_SHIFT_SQRT_LOOP_HELPER(round_shift_2sqrt2_s16_s16_8xn_neon, int16x8_t,
                              int16x8_t, round_shift_2sqrt2_s16_s16_8x1_neon)
 
-static INLINE void store_rect_buffer_16bit_to_32bit_w4(
-    const int16x4_t *const in, int32_t *const out, const int stride,
-    const int out_size) {
+static INLINE void store_rect_buffer_s16_x4(const int16x4_t *const in,
+                                            int32_t *const out,
+                                            const int stride,
+                                            const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     vst1q_s32(out + i * stride, round_shift_sqrt2_s16_s32_4x1_neon(in[i]));
   }
 }
 
-static INLINE void store_rect_buffer_16bit_to_32bit_w8(
-    const int16x8_t *const in, int32_t *const out, const int stride,
-    const int out_size) {
+static INLINE void store_rect_buffer_s16_x8(const int16x8_t *const in,
+                                            int32_t *const out,
+                                            const int stride,
+                                            const int out_size) {
   for (int i = 0; i < out_size; ++i) {
     vst1q_s32(out + i * stride + 0,
               round_shift_sqrt2_s16_s32_4x1_neon(vget_low_s16(in[i])));
@@ -348,8 +348,8 @@
 SHIFT_LOOP_HELPER(shift_left_2_s16_x8, int16x8_t, vshlq_n_s16, 2)
 SHIFT_LOOP_HELPER(shift_right_2_round_s32_x4, int32x4_t, vrshrq_n_s32, 2)
 
-static void fadst4x4_neon(const int16x4_t *input, int16x4_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fadst4x4_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
   int32x4_t u[6], v[6];
   const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
   const int16x4_t u01 = vqadd_s16(input[0], input[1]);
@@ -375,8 +375,8 @@
   output[3] = vrshrn_n_s32(u[3], TXFM_COS_BIT_MAX);
 }
 
-static void fadst4x8_neon(const int16x4_t *input, int16x4_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
   const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
@@ -424,8 +424,8 @@
   butterfly_s16_s32_x4_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
 }
 
-static void fadst8x4_neon(const int16x8_t *input, int16x8_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fadst8x4_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
   int32x4_t u_lo[4], u_hi[4];
   const int16x4_t sinpi = vld1_s16(sinpi_arr_q13(cos_bit));
   const int16x8_t u01 = vqaddq_s16(input[0], input[1]);
@@ -476,8 +476,8 @@
                            vrshrn_n_s32(u_hi[3], TXFM_COS_BIT_MAX));
 }
 
-static void fdct4x4_neon(const int16x4_t *input, int16x4_t *output,
-                         int cos_bit) {
+static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
+                                          int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
 
@@ -605,8 +605,8 @@
   }
 }
 
-static void fdct8x4_neon(const int16x8_t *input, int16x8_t *output,
-                         int cos_bit) {
+static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
+                                          int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
   const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
@@ -627,8 +627,8 @@
   output[3] = x2[3];
 }
 
-static void fdct4x8_neon(const int16x4_t *input, int16x4_t *output,
-                         int cos_bit) {
+static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
+                                          int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
   const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
@@ -655,8 +655,8 @@
   butterfly_s16_s32_x4_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
 }
 
-static void fdct8x8_neon(const int16x8_t *input, int16x8_t *output,
-                         int cos_bit) {
+static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
+                                          int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
   const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
@@ -683,8 +683,8 @@
   butterfly_s16_s32_x8_1003_neon(cospi24, x3[6], x3[5], &output[5], &output[3]);
 }
 
-static void fdct4x16_neon(const int16x4_t *input, int16x4_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
+                                           int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
   const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
@@ -739,8 +739,8 @@
                                  &output[3]);
 }
 
-static void fdct8x16_neon(const int16x8_t *input, int16x8_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
   const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
@@ -795,8 +795,8 @@
                                  &output[3]);
 }
 
-static void fdct8x32_neon(const int16x8_t *input, int16x8_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
   const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
@@ -901,8 +901,8 @@
                                  &output[3]);
 }
 
-static void fdct8x64_neon(const int16x8_t *input, int16x8_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi1 = vld1_s16(&cospi[4 * 1]);
   const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
@@ -1136,8 +1136,8 @@
   output[62] = x9[31];
 }
 
-static void fadst8x8_neon(const int16x8_t *input, int16x8_t *output,
-                          int cos_bit) {
+static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
+                                           int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
   const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
@@ -1184,8 +1184,8 @@
   butterfly_s16_s32_x8_0112_neon(cospi12, x5[6], x5[7], &output[6], &output[1]);
 }
 
-static void fadst4x16_neon(const int16x4_t *input, int16x4_t *output,
-                           int cos_bit) {
+static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
+                                            int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
   const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
@@ -1292,8 +1292,8 @@
                                  &output[1]);
 }
 
-static void fadst8x16_neon(const int16x8_t *input, int16x8_t *output,
-                           int cos_bit) {
+static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
+                                            int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
   const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
   const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
@@ -1400,318 +1400,491 @@
                                  &output[1]);
 }
 
-static void fidentity4x4_neon(const int16x4_t *const input,
-                              int16x4_t *const output, const int cos_bit) {
+static AOM_FORCE_INLINE void fidentity4x4_neon(const int16x4_t *const input,
+                                               int16x4_t *const output,
+                                               const int cos_bit) {
   (void)cos_bit;
   round_shift_sqrt2_s16_s16_4xn_neon(input, output, 4);
 }
 
-static INLINE void fidentity8x4_neon(const int16x8_t *const input,
-                                     int16x8_t *const output,
-                                     const int cos_bit) {
+static AOM_FORCE_INLINE void fidentity8x4_neon(const int16x8_t *const input,
+                                               int16x8_t *const output,
+                                               const int cos_bit) {
   (void)cos_bit;
   round_shift_sqrt2_s16_s16_8xn_neon(input, output, 4);
 }
 
-static void fidentity4x8_neon(const int16x4_t *input, int16x4_t *output,
-                              int cos_bit) {
+static AOM_FORCE_INLINE void fidentity4x8_neon(const int16x4_t *input,
+                                               int16x4_t *output, int cos_bit) {
   (void)cos_bit;
   shift_left_1_s16_x4(input, output, 8);
 }
 
-static void fidentity8x8_neon(const int16x8_t *input, int16x8_t *output,
-                              int cos_bit) {
+static AOM_FORCE_INLINE void fidentity8x8_neon(const int16x8_t *input,
+                                               int16x8_t *output, int cos_bit) {
   (void)cos_bit;
   shift_left_1_s16_x8(input, output, 8);
 }
 
-static INLINE void fidentity4x16_neon(const int16x4_t *input, int16x4_t *output,
-                                      int cos_bit) {
+static AOM_FORCE_INLINE void fidentity4x16_neon(const int16x4_t *input,
+                                                int16x4_t *output,
+                                                int cos_bit) {
   (void)cos_bit;
   round_shift_2sqrt2_s16_s16_4xn_neon(input, output, 16);
 }
 
-static INLINE void fidentity8x16_neon(const int16x8_t *input, int16x8_t *output,
-                                      int cos_bit) {
+static AOM_FORCE_INLINE void fidentity8x16_neon(const int16x8_t *input,
+                                                int16x8_t *output,
+                                                int cos_bit) {
   (void)cos_bit;
   round_shift_2sqrt2_s16_s16_8xn_neon(input, output, 16);
 }
 
-static INLINE void fidentity8x32_neon(const int16x8_t *input, int16x8_t *output,
-                                      int cos_bit) {
+static AOM_FORCE_INLINE void fidentity8x32_neon(const int16x8_t *input,
+                                                int16x8_t *output,
+                                                int cos_bit) {
   (void)cos_bit;
   shift_left_2_s16_x8(input, output, 32);
 }
 
+#define TRANSFORM_COL(name, tw, n)                                          \
+  static void name##_col_neon(const int16_t *input, int16x##tw##_t *output, \
+                              int stride, int cos_bit) {                    \
+    int16x##tw##_t buf0[n];                                                 \
+    load_buffer_s16_x##tw(input, stride, buf0, n);                          \
+    shift_left_2_s16_x##tw(buf0, buf0, n);                                  \
+    name##_neon(buf0, output, cos_bit);                                     \
+  }
+
+TRANSFORM_COL(fadst4x4, 4, 4)
+TRANSFORM_COL(fadst4x8, 4, 8)
+TRANSFORM_COL(fadst4x16, 4, 16)
+TRANSFORM_COL(fadst8x4, 8, 4)
+TRANSFORM_COL(fadst8x8, 8, 8)
+TRANSFORM_COL(fadst8x16, 8, 16)
+TRANSFORM_COL(fdct4x4, 4, 4)
+TRANSFORM_COL(fdct4x8, 4, 8)
+TRANSFORM_COL(fdct4x16, 4, 16)
+TRANSFORM_COL(fdct8x4, 8, 4)
+TRANSFORM_COL(fdct8x8, 8, 8)
+TRANSFORM_COL(fdct8x16, 8, 16)
+TRANSFORM_COL(fdct8x32, 8, 32)
+TRANSFORM_COL(fidentity4x4, 4, 4)
+TRANSFORM_COL(fidentity4x8, 4, 8)
+TRANSFORM_COL(fidentity4x16, 4, 16)
+TRANSFORM_COL(fidentity8x4, 8, 4)
+TRANSFORM_COL(fidentity8x8, 8, 8)
+TRANSFORM_COL(fidentity8x16, 8, 16)
+TRANSFORM_COL(fidentity8x32, 8, 32)
+
+#define TRANSFORM_ROW(name, tw, n)                                          \
+  static void name##_row_neon(const int16x##tw##_t *input, int32_t *output, \
+                              int stride, int cos_bit) {                    \
+    int16x##tw##_t buf0[n];                                                 \
+    name##_neon(input, buf0, cos_bit);                                      \
+    store_buffer_s16_x##tw(buf0, output, stride, n);                        \
+  }
+
+#define TRANSFORM_ROW_RECT(name, tw, n)                                        \
+  static void name##_row_rect_neon(const int16x##tw##_t *input,                \
+                                   int32_t *output, int stride, int cos_bit) { \
+    int16x##tw##_t buf0[n];                                                    \
+    name##_neon(input, buf0, cos_bit);                                         \
+    store_rect_buffer_s16_x##tw(buf0, output, stride, n);                      \
+  }
+
+TRANSFORM_ROW(fadst4x4, 4, 4)
+TRANSFORM_ROW(fadst4x16, 4, 16)
+TRANSFORM_ROW(fadst8x4, 8, 4)
+TRANSFORM_ROW(fadst8x8, 8, 8)
+TRANSFORM_ROW(fadst8x16, 8, 16)
+TRANSFORM_ROW(fdct4x4, 4, 4)
+TRANSFORM_ROW(fdct4x16, 4, 16)
+TRANSFORM_ROW(fdct8x4, 8, 4)
+TRANSFORM_ROW(fdct8x8, 8, 8)
+TRANSFORM_ROW(fdct8x16, 8, 16)
+TRANSFORM_ROW(fdct8x32, 8, 32)
+TRANSFORM_ROW(fidentity4x4, 4, 4)
+TRANSFORM_ROW(fidentity4x16, 4, 16)
+TRANSFORM_ROW(fidentity8x4, 8, 4)
+TRANSFORM_ROW(fidentity8x8, 8, 8)
+TRANSFORM_ROW(fidentity8x16, 8, 16)
+TRANSFORM_ROW(fidentity8x32, 8, 32)
+
+TRANSFORM_ROW_RECT(fadst4x8, 4, 8)
+TRANSFORM_ROW_RECT(fadst8x4, 8, 4)
+TRANSFORM_ROW_RECT(fadst8x8, 8, 8)
+TRANSFORM_ROW_RECT(fadst8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct4x8, 4, 8)
+TRANSFORM_ROW_RECT(fdct8x4, 8, 4)
+TRANSFORM_ROW_RECT(fdct8x8, 8, 8)
+TRANSFORM_ROW_RECT(fdct8x16, 8, 16)
+TRANSFORM_ROW_RECT(fdct8x32, 8, 32)
+TRANSFORM_ROW_RECT(fidentity4x8, 4, 8)
+TRANSFORM_ROW_RECT(fidentity8x4, 8, 4)
+TRANSFORM_ROW_RECT(fidentity8x8, 8, 8)
+TRANSFORM_ROW_RECT(fidentity8x16, 8, 16)
+TRANSFORM_ROW_RECT(fidentity8x32, 8, 32)
+
 typedef void (*transform_1d_lbd_4_neon)(const int16x4_t *input,
                                         int16x4_t *output, int cos_bit);
 typedef void (*transform_1d_lbd_8_neon)(const int16x8_t *input,
                                         int16x8_t *output, int cos_bit);
 
-static const transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = {
-  fdct4x4_neon,       // DCT_DCT
-  fadst4x4_neon,      // ADST_DCT
-  fdct4x4_neon,       // DCT_ADST
-  fadst4x4_neon,      // ADST_ADST
-  fadst4x4_neon,      // FLIPADST_DCT
-  fdct4x4_neon,       // DCT_FLIPADST
-  fadst4x4_neon,      // FLIPADST_FLIPADST
-  fadst4x4_neon,      // ADST_FLIPADST
-  fadst4x4_neon,      // FLIPADST_ADST
-  fidentity4x4_neon,  // IDTX
-  fdct4x4_neon,       // V_DCT
-  fidentity4x4_neon,  // H_DCT
-  fadst4x4_neon,      // V_ADST
-  fidentity4x4_neon,  // H_ADST
-  fadst4x4_neon,      // V_FLIPADST
-  fidentity4x4_neon   // H_FLIPADST
+typedef void (*col_transform_1d_lbd_4_neon)(const int16_t *input,
+                                            int16x4_t *output, int stride,
+                                            int cos_bit);
+typedef void (*col_transform_1d_lbd_8_neon)(const int16_t *input,
+                                            int16x8_t *output, int stride,
+                                            int cos_bit);
+
+typedef void (*row_transform_1d_lbd_4_neon)(const int16x4_t *input,
+                                            int32_t *output, int stride,
+                                            int cos_bit);
+typedef void (*row_transform_1d_lbd_8_neon)(const int16x8_t *input,
+                                            int32_t *output, int stride,
+                                            int cos_bit);
+
+static const col_transform_1d_lbd_4_neon col_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_col_neon,       // DCT_DCT
+  fadst4x4_col_neon,      // ADST_DCT
+  fdct4x4_col_neon,       // DCT_ADST
+  fadst4x4_col_neon,      // ADST_ADST
+  fadst4x4_col_neon,      // FLIPADST_DCT
+  fdct4x4_col_neon,       // DCT_FLIPADST
+  fadst4x4_col_neon,      // FLIPADST_FLIPADST
+  fadst4x4_col_neon,      // ADST_FLIPADST
+  fadst4x4_col_neon,      // FLIPADST_ADST
+  fidentity4x4_col_neon,  // IDTX
+  fdct4x4_col_neon,       // V_DCT
+  fidentity4x4_col_neon,  // H_DCT
+  fadst4x4_col_neon,      // V_ADST
+  fidentity4x4_col_neon,  // H_ADST
+  fadst4x4_col_neon,      // V_FLIPADST
+  fidentity4x4_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = {
-  fdct4x4_neon,       // DCT_DCT
-  fdct4x4_neon,       // ADST_DCT
-  fadst4x4_neon,      // DCT_ADST
-  fadst4x4_neon,      // ADST_ADST
-  fdct4x4_neon,       // FLIPADST_DCT
-  fadst4x4_neon,      // DCT_FLIPADST
-  fadst4x4_neon,      // FLIPADST_FLIPADST
-  fadst4x4_neon,      // ADST_FLIPADST
-  fadst4x4_neon,      // FLIPADST_ADST
-  fidentity4x4_neon,  // IDTX
-  fidentity4x4_neon,  // V_DCT
-  fdct4x4_neon,       // H_DCT
-  fidentity4x4_neon,  // V_ADST
-  fadst4x4_neon,      // H_ADST
-  fidentity4x4_neon,  // V_FLIPADST
-  fadst4x4_neon       // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_txfm4x4_arr[TX_TYPES] = {
+  fdct4x4_row_neon,       // DCT_DCT
+  fdct4x4_row_neon,       // ADST_DCT
+  fadst4x4_row_neon,      // DCT_ADST
+  fadst4x4_row_neon,      // ADST_ADST
+  fdct4x4_row_neon,       // FLIPADST_DCT
+  fadst4x4_row_neon,      // DCT_FLIPADST
+  fadst4x4_row_neon,      // FLIPADST_FLIPADST
+  fadst4x4_row_neon,      // ADST_FLIPADST
+  fadst4x4_row_neon,      // FLIPADST_ADST
+  fidentity4x4_row_neon,  // IDTX
+  fidentity4x4_row_neon,  // V_DCT
+  fdct4x4_row_neon,       // H_DCT
+  fidentity4x4_row_neon,  // V_ADST
+  fadst4x4_row_neon,      // H_ADST
+  fidentity4x4_row_neon,  // V_FLIPADST
+  fadst4x4_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
-  fdct4x8_neon,       // DCT_DCT
-  fadst4x8_neon,      // ADST_DCT
-  fdct4x8_neon,       // DCT_ADST
-  fadst4x8_neon,      // ADST_ADST
-  fadst4x8_neon,      // FLIPADST_DCT
-  fdct4x8_neon,       // DCT_FLIPADST
-  fadst4x8_neon,      // FLIPADST_FLIPADST
-  fadst4x8_neon,      // ADST_FLIPADST
-  fadst4x8_neon,      // FLIPADST_ADST
-  fidentity4x8_neon,  // IDTX
-  fdct4x8_neon,       // V_DCT
-  fidentity4x8_neon,  // H_DCT
-  fadst4x8_neon,      // V_ADST
-  fidentity4x8_neon,  // H_ADST
-  fadst4x8_neon,      // V_FLIPADST
-  fidentity4x8_neon   // H_FLIPADST
+static const col_transform_1d_lbd_4_neon col_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_col_neon,       // DCT_DCT
+  fadst4x8_col_neon,      // ADST_DCT
+  fdct4x8_col_neon,       // DCT_ADST
+  fadst4x8_col_neon,      // ADST_ADST
+  fadst4x8_col_neon,      // FLIPADST_DCT
+  fdct4x8_col_neon,       // DCT_FLIPADST
+  fadst4x8_col_neon,      // FLIPADST_FLIPADST
+  fadst4x8_col_neon,      // ADST_FLIPADST
+  fadst4x8_col_neon,      // FLIPADST_ADST
+  fidentity4x8_col_neon,  // IDTX
+  fdct4x8_col_neon,       // V_DCT
+  fidentity4x8_col_neon,  // H_DCT
+  fadst4x8_col_neon,      // V_ADST
+  fidentity4x8_col_neon,  // H_ADST
+  fadst4x8_col_neon,      // V_FLIPADST
+  fidentity4x8_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
-  fdct8x4_neon,       // DCT_DCT
-  fdct8x4_neon,       // ADST_DCT
-  fadst8x4_neon,      // DCT_ADST
-  fadst8x4_neon,      // ADST_ADST
-  fdct8x4_neon,       // FLIPADST_DCT
-  fadst8x4_neon,      // DCT_FLIPADST
-  fadst8x4_neon,      // FLIPADST_FLIPADST
-  fadst8x4_neon,      // ADST_FLIPADST
-  fadst8x4_neon,      // FLIPADST_ADST
-  fidentity8x4_neon,  // IDTX
-  fidentity8x4_neon,  // V_DCT
-  fdct8x4_neon,       // H_DCT
-  fidentity8x4_neon,  // V_ADST
-  fadst8x4_neon,      // H_ADST
-  fidentity8x4_neon,  // V_FLIPADST
-  fadst8x4_neon       // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_row_neon,       // DCT_DCT
+  fdct8x4_row_neon,       // ADST_DCT
+  fadst8x4_row_neon,      // DCT_ADST
+  fadst8x4_row_neon,      // ADST_ADST
+  fdct8x4_row_neon,       // FLIPADST_DCT
+  fadst8x4_row_neon,      // DCT_FLIPADST
+  fadst8x4_row_neon,      // FLIPADST_FLIPADST
+  fadst8x4_row_neon,      // ADST_FLIPADST
+  fadst8x4_row_neon,      // FLIPADST_ADST
+  fidentity8x4_row_neon,  // IDTX
+  fidentity8x4_row_neon,  // V_DCT
+  fdct8x4_row_neon,       // H_DCT
+  fidentity8x4_row_neon,  // V_ADST
+  fadst8x4_row_neon,      // H_ADST
+  fidentity8x4_row_neon,  // V_FLIPADST
+  fadst8x4_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
-  fdct8x4_neon,       // DCT_DCT
-  fadst8x4_neon,      // ADST_DCT
-  fdct8x4_neon,       // DCT_ADST
-  fadst8x4_neon,      // ADST_ADST
-  fadst8x4_neon,      // FLIPADST_DCT
-  fdct8x4_neon,       // DCT_FLIPADST
-  fadst8x4_neon,      // FLIPADST_FLIPADST
-  fadst8x4_neon,      // ADST_FLIPADST
-  fadst8x4_neon,      // FLIPADST_ADST
-  fidentity8x4_neon,  // IDTX
-  fdct8x4_neon,       // V_DCT
-  fidentity8x4_neon,  // H_DCT
-  fadst8x4_neon,      // V_ADST
-  fidentity8x4_neon,  // H_ADST
-  fadst8x4_neon,      // V_FLIPADST
-  fidentity8x4_neon   // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_row_rect_neon,       // DCT_DCT
+  fdct8x4_row_rect_neon,       // ADST_DCT
+  fadst8x4_row_rect_neon,      // DCT_ADST
+  fadst8x4_row_rect_neon,      // ADST_ADST
+  fdct8x4_row_rect_neon,       // FLIPADST_DCT
+  fadst8x4_row_rect_neon,      // DCT_FLIPADST
+  fadst8x4_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst8x4_row_rect_neon,      // ADST_FLIPADST
+  fadst8x4_row_rect_neon,      // FLIPADST_ADST
+  fidentity8x4_row_rect_neon,  // IDTX
+  fidentity8x4_row_rect_neon,  // V_DCT
+  fdct8x4_row_rect_neon,       // H_DCT
+  fidentity8x4_row_rect_neon,  // V_ADST
+  fadst8x4_row_rect_neon,      // H_ADST
+  fidentity8x4_row_rect_neon,  // V_FLIPADST
+  fadst8x4_row_rect_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_4_neon row_txfm4x8_arr[TX_TYPES] = {
-  fdct4x8_neon,       // DCT_DCT
-  fdct4x8_neon,       // ADST_DCT
-  fadst4x8_neon,      // DCT_ADST
-  fadst4x8_neon,      // ADST_ADST
-  fdct4x8_neon,       // FLIPADST_DCT
-  fadst4x8_neon,      // DCT_FLIPADST
-  fadst4x8_neon,      // FLIPADST_FLIPADST
-  fadst4x8_neon,      // ADST_FLIPADST
-  fadst4x8_neon,      // FLIPADST_ADST
-  fidentity4x8_neon,  // IDTX
-  fidentity4x8_neon,  // V_DCT
-  fdct4x8_neon,       // H_DCT
-  fidentity4x8_neon,  // V_ADST
-  fadst4x8_neon,      // H_ADST
-  fidentity4x8_neon,  // V_FLIPADST
-  fadst4x8_neon       // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x4_arr[TX_TYPES] = {
+  fdct8x4_col_neon,       // DCT_DCT
+  fadst8x4_col_neon,      // ADST_DCT
+  fdct8x4_col_neon,       // DCT_ADST
+  fadst8x4_col_neon,      // ADST_ADST
+  fadst8x4_col_neon,      // FLIPADST_DCT
+  fdct8x4_col_neon,       // DCT_FLIPADST
+  fadst8x4_col_neon,      // FLIPADST_FLIPADST
+  fadst8x4_col_neon,      // ADST_FLIPADST
+  fadst8x4_col_neon,      // FLIPADST_ADST
+  fidentity8x4_col_neon,  // IDTX
+  fdct8x4_col_neon,       // V_DCT
+  fidentity8x4_col_neon,  // H_DCT
+  fadst8x4_col_neon,      // V_ADST
+  fidentity8x4_col_neon,  // H_ADST
+  fadst8x4_col_neon,      // V_FLIPADST
+  fidentity8x4_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
-  fdct8x8_neon,       // DCT_DCT
-  fadst8x8_neon,      // ADST_DCT
-  fdct8x8_neon,       // DCT_ADST
-  fadst8x8_neon,      // ADST_ADST
-  fadst8x8_neon,      // FLIPADST_DCT
-  fdct8x8_neon,       // DCT_FLIPADST
-  fadst8x8_neon,      // FLIPADST_FLIPADST
-  fadst8x8_neon,      // ADST_FLIPADST
-  fadst8x8_neon,      // FLIPADST_ADST
-  fidentity8x8_neon,  // IDTX
-  fdct8x8_neon,       // V_DCT
-  fidentity8x8_neon,  // H_DCT
-  fadst8x8_neon,      // V_ADST
-  fidentity8x8_neon,  // H_ADST
-  fadst8x8_neon,      // V_FLIPADST
-  fidentity8x8_neon,  // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_rect_txfm4x8_arr[TX_TYPES] = {
+  fdct4x8_row_rect_neon,       // DCT_DCT
+  fdct4x8_row_rect_neon,       // ADST_DCT
+  fadst4x8_row_rect_neon,      // DCT_ADST
+  fadst4x8_row_rect_neon,      // ADST_ADST
+  fdct4x8_row_rect_neon,       // FLIPADST_DCT
+  fadst4x8_row_rect_neon,      // DCT_FLIPADST
+  fadst4x8_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst4x8_row_rect_neon,      // ADST_FLIPADST
+  fadst4x8_row_rect_neon,      // FLIPADST_ADST
+  fidentity4x8_row_rect_neon,  // IDTX
+  fidentity4x8_row_rect_neon,  // V_DCT
+  fdct4x8_row_rect_neon,       // H_DCT
+  fidentity4x8_row_rect_neon,  // V_ADST
+  fadst4x8_row_rect_neon,      // H_ADST
+  fidentity4x8_row_rect_neon,  // V_FLIPADST
+  fadst4x8_row_rect_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
-  fdct8x8_neon,       // DCT_DCT
-  fdct8x8_neon,       // ADST_DCT
-  fadst8x8_neon,      // DCT_ADST
-  fadst8x8_neon,      // ADST_ADST
-  fdct8x8_neon,       // FLIPADST_DCT
-  fadst8x8_neon,      // DCT_FLIPADST
-  fadst8x8_neon,      // FLIPADST_FLIPADST
-  fadst8x8_neon,      // ADST_FLIPADST
-  fadst8x8_neon,      // FLIPADST_ADST
-  fidentity8x8_neon,  // IDTX
-  fidentity8x8_neon,  // V_DCT
-  fdct8x8_neon,       // H_DCT
-  fidentity8x8_neon,  // V_ADST
-  fadst8x8_neon,      // H_ADST
-  fidentity8x8_neon,  // V_FLIPADST
-  fadst8x8_neon       // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_col_neon,       // DCT_DCT
+  fadst8x8_col_neon,      // ADST_DCT
+  fdct8x8_col_neon,       // DCT_ADST
+  fadst8x8_col_neon,      // ADST_ADST
+  fadst8x8_col_neon,      // FLIPADST_DCT
+  fdct8x8_col_neon,       // DCT_FLIPADST
+  fadst8x8_col_neon,      // FLIPADST_FLIPADST
+  fadst8x8_col_neon,      // ADST_FLIPADST
+  fadst8x8_col_neon,      // FLIPADST_ADST
+  fidentity8x8_col_neon,  // IDTX
+  fdct8x8_col_neon,       // V_DCT
+  fidentity8x8_col_neon,  // H_DCT
+  fadst8x8_col_neon,      // V_ADST
+  fidentity8x8_col_neon,  // H_ADST
+  fadst8x8_col_neon,      // V_FLIPADST
+  fidentity8x8_col_neon,  // H_FLIPADST
 };
 
-static const transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
-  fdct4x16_neon,       // DCT_DCT
-  fadst4x16_neon,      // ADST_DCT
-  fdct4x16_neon,       // DCT_ADST
-  fadst4x16_neon,      // ADST_ADST
-  fadst4x16_neon,      // FLIPADST_DCT
-  fdct4x16_neon,       // DCT_FLIPADST
-  fadst4x16_neon,      // FLIPADST_FLIPADST
-  fadst4x16_neon,      // ADST_FLIPADST
-  fadst4x16_neon,      // FLIPADST_ADST
-  fidentity4x16_neon,  // IDTX
-  fdct4x16_neon,       // V_DCT
-  fidentity4x16_neon,  // H_DCT
-  fadst4x16_neon,      // V_ADST
-  fidentity4x16_neon,  // H_ADST
-  fadst4x16_neon,      // V_FLIPADST
-  fidentity4x16_neon   // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_row_neon,       // DCT_DCT
+  fdct8x8_row_neon,       // ADST_DCT
+  fadst8x8_row_neon,      // DCT_ADST
+  fadst8x8_row_neon,      // ADST_ADST
+  fdct8x8_row_neon,       // FLIPADST_DCT
+  fadst8x8_row_neon,      // DCT_FLIPADST
+  fadst8x8_row_neon,      // FLIPADST_FLIPADST
+  fadst8x8_row_neon,      // ADST_FLIPADST
+  fadst8x8_row_neon,      // FLIPADST_ADST
+  fidentity8x8_row_neon,  // IDTX
+  fidentity8x8_row_neon,  // V_DCT
+  fdct8x8_row_neon,       // H_DCT
+  fidentity8x8_row_neon,  // V_ADST
+  fadst8x8_row_neon,      // H_ADST
+  fidentity8x8_row_neon,  // V_FLIPADST
+  fadst8x8_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
-  fdct4x16_neon,       // DCT_DCT
-  fdct4x16_neon,       // ADST_DCT
-  fadst4x16_neon,      // DCT_ADST
-  fadst4x16_neon,      // ADST_ADST
-  fdct4x16_neon,       // FLIPADST_DCT
-  fadst4x16_neon,      // DCT_FLIPADST
-  fadst4x16_neon,      // FLIPADST_FLIPADST
-  fadst4x16_neon,      // ADST_FLIPADST
-  fadst4x16_neon,      // FLIPADST_ADST
-  fidentity4x16_neon,  // IDTX
-  fidentity4x16_neon,  // V_DCT
-  fdct4x16_neon,       // H_DCT
-  fidentity4x16_neon,  // V_ADST
-  fadst4x16_neon,      // H_ADST
-  fidentity4x16_neon,  // V_FLIPADST
-  fadst4x16_neon       // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x8_arr[TX_TYPES] = {
+  fdct8x8_row_rect_neon,       // DCT_DCT
+  fdct8x8_row_rect_neon,       // ADST_DCT
+  fadst8x8_row_rect_neon,      // DCT_ADST
+  fadst8x8_row_rect_neon,      // ADST_ADST
+  fdct8x8_row_rect_neon,       // FLIPADST_DCT
+  fadst8x8_row_rect_neon,      // DCT_FLIPADST
+  fadst8x8_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst8x8_row_rect_neon,      // ADST_FLIPADST
+  fadst8x8_row_rect_neon,      // FLIPADST_ADST
+  fidentity8x8_row_rect_neon,  // IDTX
+  fidentity8x8_row_rect_neon,  // V_DCT
+  fdct8x8_row_rect_neon,       // H_DCT
+  fidentity8x8_row_rect_neon,  // V_ADST
+  fadst8x8_row_rect_neon,      // H_ADST
+  fidentity8x8_row_rect_neon,  // V_FLIPADST
+  fadst8x8_row_rect_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
-  fdct8x16_neon,       // DCT_DCT
-  fadst8x16_neon,      // ADST_DCT
-  fdct8x16_neon,       // DCT_ADST
-  fadst8x16_neon,      // ADST_ADST
-  fadst8x16_neon,      // FLIPADST_DCT
-  fdct8x16_neon,       // DCT_FLIPADST
-  fadst8x16_neon,      // FLIPADST_FLIPADST
-  fadst8x16_neon,      // ADST_FLIPADST
-  fadst8x16_neon,      // FLIPADST_ADST
-  fidentity8x16_neon,  // IDTX
-  fdct8x16_neon,       // V_DCT
-  fidentity8x16_neon,  // H_DCT
-  fadst8x16_neon,      // V_ADST
-  fidentity8x16_neon,  // H_ADST
-  fadst8x16_neon,      // V_FLIPADST
-  fidentity8x16_neon   // H_FLIPADST
+static const col_transform_1d_lbd_4_neon col_txfm4x16_arr[TX_TYPES] = {
+  fdct4x16_col_neon,       // DCT_DCT
+  fadst4x16_col_neon,      // ADST_DCT
+  fdct4x16_col_neon,       // DCT_ADST
+  fadst4x16_col_neon,      // ADST_ADST
+  fadst4x16_col_neon,      // FLIPADST_DCT
+  fdct4x16_col_neon,       // DCT_FLIPADST
+  fadst4x16_col_neon,      // FLIPADST_FLIPADST
+  fadst4x16_col_neon,      // ADST_FLIPADST
+  fadst4x16_col_neon,      // FLIPADST_ADST
+  fidentity4x16_col_neon,  // IDTX
+  fdct4x16_col_neon,       // V_DCT
+  fidentity4x16_col_neon,  // H_DCT
+  fadst4x16_col_neon,      // V_ADST
+  fidentity4x16_col_neon,  // H_ADST
+  fadst4x16_col_neon,      // V_FLIPADST
+  fidentity4x16_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
-  fdct8x16_neon,       // DCT_DCT
-  fdct8x16_neon,       // ADST_DCT
-  fadst8x16_neon,      // DCT_ADST
-  fadst8x16_neon,      // ADST_ADST
-  fdct8x16_neon,       // FLIPADST_DCT
-  fadst8x16_neon,      // DCT_FLIPADST
-  fadst8x16_neon,      // FLIPADST_FLIPADST
-  fadst8x16_neon,      // ADST_FLIPADST
-  fadst8x16_neon,      // FLIPADST_ADST
-  fidentity8x16_neon,  // IDTX
-  fidentity8x16_neon,  // V_DCT
-  fdct8x16_neon,       // H_DCT
-  fidentity8x16_neon,  // V_ADST
-  fadst8x16_neon,      // H_ADST
-  fidentity8x16_neon,  // V_FLIPADST
-  fadst8x16_neon       // H_FLIPADST
+static const row_transform_1d_lbd_4_neon row_txfm4x16_arr[TX_TYPES] = {
+  fdct4x16_row_neon,       // DCT_DCT
+  fdct4x16_row_neon,       // ADST_DCT
+  fadst4x16_row_neon,      // DCT_ADST
+  fadst4x16_row_neon,      // ADST_ADST
+  fdct4x16_row_neon,       // FLIPADST_DCT
+  fadst4x16_row_neon,      // DCT_FLIPADST
+  fadst4x16_row_neon,      // FLIPADST_FLIPADST
+  fadst4x16_row_neon,      // ADST_FLIPADST
+  fadst4x16_row_neon,      // FLIPADST_ADST
+  fidentity4x16_row_neon,  // IDTX
+  fidentity4x16_row_neon,  // V_DCT
+  fdct4x16_row_neon,       // H_DCT
+  fidentity4x16_row_neon,  // V_ADST
+  fadst4x16_row_neon,      // H_ADST
+  fidentity4x16_row_neon,  // V_FLIPADST
+  fadst4x16_row_neon       // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
-  fdct8x32_neon,       // DCT_DCT
-  NULL,                // ADST_DCT
-  NULL,                // DCT_ADST
-  NULL,                // ADST_ADST
-  NULL,                // FLIPADST_DCT
-  NULL,                // DCT_FLIPADST
-  NULL,                // FLIPADST_FLIPADST
-  NULL,                // ADST_FLIPADST
-  NULL,                // FLIPADST_ADST
-  fidentity8x32_neon,  // IDTX
-  fidentity8x32_neon,  // V_DCT
-  fdct8x32_neon,       // H_DCT
-  NULL,                // V_ADST
-  NULL,                // H_ADST
-  NULL,                // V_FLIPADST
-  NULL                 // H_FLIPADST
+static const col_transform_1d_lbd_8_neon col_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_col_neon,       // DCT_DCT
+  fadst8x16_col_neon,      // ADST_DCT
+  fdct8x16_col_neon,       // DCT_ADST
+  fadst8x16_col_neon,      // ADST_ADST
+  fadst8x16_col_neon,      // FLIPADST_DCT
+  fdct8x16_col_neon,       // DCT_FLIPADST
+  fadst8x16_col_neon,      // FLIPADST_FLIPADST
+  fadst8x16_col_neon,      // ADST_FLIPADST
+  fadst8x16_col_neon,      // FLIPADST_ADST
+  fidentity8x16_col_neon,  // IDTX
+  fdct8x16_col_neon,       // V_DCT
+  fidentity8x16_col_neon,  // H_DCT
+  fadst8x16_col_neon,      // V_ADST
+  fidentity8x16_col_neon,  // H_ADST
+  fadst8x16_col_neon,      // V_FLIPADST
+  fidentity8x16_col_neon   // H_FLIPADST
 };
 
-static const transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
-  fdct8x32_neon,       // DCT_DCT
-  NULL,                // ADST_DCT
-  NULL,                // DCT_ADST
-  NULL,                // ADST_ADST
-  NULL,                // FLIPADST_DCT
-  NULL,                // DCT_FLIPADST
-  NULL,                // FLIPADST_FLIPADST
-  NULL,                // ADST_FLIPADST
-  NULL,                // FLIPADST_ADST
-  fidentity8x32_neon,  // IDTX
-  fdct8x32_neon,       // V_DCT
-  fidentity8x32_neon,  // H_DCT
-  NULL,                // V_ADST
-  NULL,                // H_ADST
-  NULL,                // V_FLIPADST
-  NULL                 // H_FLIPADST
+static const row_transform_1d_lbd_8_neon row_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_row_neon,       // DCT_DCT
+  fdct8x16_row_neon,       // ADST_DCT
+  fadst8x16_row_neon,      // DCT_ADST
+  fadst8x16_row_neon,      // ADST_ADST
+  fdct8x16_row_neon,       // FLIPADST_DCT
+  fadst8x16_row_neon,      // DCT_FLIPADST
+  fadst8x16_row_neon,      // FLIPADST_FLIPADST
+  fadst8x16_row_neon,      // ADST_FLIPADST
+  fadst8x16_row_neon,      // FLIPADST_ADST
+  fidentity8x16_row_neon,  // IDTX
+  fidentity8x16_row_neon,  // V_DCT
+  fdct8x16_row_neon,       // H_DCT
+  fidentity8x16_row_neon,  // V_ADST
+  fadst8x16_row_neon,      // H_ADST
+  fidentity8x16_row_neon,  // V_FLIPADST
+  fadst8x16_row_neon       // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x16_arr[TX_TYPES] = {
+  fdct8x16_row_rect_neon,       // DCT_DCT
+  fdct8x16_row_rect_neon,       // ADST_DCT
+  fadst8x16_row_rect_neon,      // DCT_ADST
+  fadst8x16_row_rect_neon,      // ADST_ADST
+  fdct8x16_row_rect_neon,       // FLIPADST_DCT
+  fadst8x16_row_rect_neon,      // DCT_FLIPADST
+  fadst8x16_row_rect_neon,      // FLIPADST_FLIPADST
+  fadst8x16_row_rect_neon,      // ADST_FLIPADST
+  fadst8x16_row_rect_neon,      // FLIPADST_ADST
+  fidentity8x16_row_rect_neon,  // IDTX
+  fidentity8x16_row_rect_neon,  // V_DCT
+  fdct8x16_row_rect_neon,       // H_DCT
+  fidentity8x16_row_rect_neon,  // V_ADST
+  fadst8x16_row_rect_neon,      // H_ADST
+  fidentity8x16_row_rect_neon,  // V_FLIPADST
+  fadst8x16_row_rect_neon       // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_row_neon,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_row_neon,  // IDTX
+  fidentity8x32_row_neon,  // V_DCT
+  fdct8x32_row_neon,       // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
+};
+
+static const row_transform_1d_lbd_8_neon row_rect_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_row_rect_neon,       // DCT_DCT
+  NULL,                         // ADST_DCT
+  NULL,                         // DCT_ADST
+  NULL,                         // ADST_ADST
+  NULL,                         // FLIPADST_DCT
+  NULL,                         // DCT_FLIPADST
+  NULL,                         // FLIPADST_FLIPADST
+  NULL,                         // ADST_FLIPADST
+  NULL,                         // FLIPADST_ADST
+  fidentity8x32_row_rect_neon,  // IDTX
+  fidentity8x32_row_rect_neon,  // V_DCT
+  fdct8x32_row_rect_neon,       // H_DCT
+  NULL,                         // V_ADST
+  NULL,                         // H_ADST
+  NULL,                         // V_FLIPADST
+  NULL                          // H_FLIPADST
+};
+
+static const col_transform_1d_lbd_8_neon col_txfm8x32_arr[TX_TYPES] = {
+  fdct8x32_col_neon,       // DCT_DCT
+  NULL,                    // ADST_DCT
+  NULL,                    // DCT_ADST
+  NULL,                    // ADST_ADST
+  NULL,                    // FLIPADST_DCT
+  NULL,                    // DCT_FLIPADST
+  NULL,                    // FLIPADST_FLIPADST
+  NULL,                    // ADST_FLIPADST
+  NULL,                    // FLIPADST_ADST
+  fidentity8x32_col_neon,  // IDTX
+  fdct8x32_col_neon,       // V_DCT
+  fidentity8x32_col_neon,  // H_DCT
+  NULL,                    // V_ADST
+  NULL,                    // H_ADST
+  NULL,                    // V_FLIPADST
+  NULL                     // H_FLIPADST
 };
 
 static void ud_adjust_input_and_stride(int ud_flip, const int16_t **input,
@@ -1726,24 +1899,20 @@
                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x4_t buf0[4], buf1[4];
-  const transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type];
-  const transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type];
+  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x4_arr[tx_type];
+  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x4_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
-  load_buffer_16bit_to_16bit_w4(input, stride, buf0, 4);
-  shift_left_2_s16_x4(buf0, buf0, 4);
-  col_txfm(buf0, buf0, 13);
+  col_txfm(input, buf0, stride, 13);
   transpose_arrays_s16_4x4(buf0, buf1);
 
   if (lr_flip) {
     flip_buf_4_neon(buf1, buf0, 4);
-    row_txfm(buf0, buf0, 13);
-    store_buffer_16bit_to_32bit_4_to_4(buf0, output, 4, 4);
+    row_txfm(buf0, output, 4, 13);
   } else {
-    row_txfm(buf1, buf1, 13);
-    store_buffer_16bit_to_32bit_4_to_4(buf1, output, 4, 4);
+    row_txfm(buf1, output, 4, 13);
   }
 }
 
@@ -1752,26 +1921,22 @@
   (void)bd;
   int16x4_t buf0[8];
   int16x8_t buf1[8];
-  const transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
-  int ud_flip, lr_flip;
+  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x4_arr[tx_type];
 
+  int ud_flip, lr_flip;
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
-  load_buffer_16bit_to_16bit_w4(input, stride, buf0, 8);
-  shift_left_2_s16_x4(buf0, buf0, 8);
-  col_txfm(buf0, buf0, 13);
+  col_txfm(input, buf0, stride, 13);
   shift_right_1_round_s16_x4(buf0, buf0, 8);
   transpose_arrays_s16_4x8(buf0, buf1);
 
   if (lr_flip) {
     int16x8_t buf2[8];
     flip_buf_8_neon(buf1, buf2, 4);
-    row_txfm(buf2, buf2, 13);
-    store_rect_buffer_16bit_to_32bit_w8(buf2, output, 8, 4);
+    row_txfm(buf2, output, 8, 13);
   } else {
-    row_txfm(buf1, buf1, 13);
-    store_rect_buffer_16bit_to_32bit_w8(buf1, output, 8, 4);
+    row_txfm(buf1, output, 8, 13);
   }
 }
 
@@ -1780,16 +1945,13 @@
   (void)bd;
   int16x4_t buf0[16];
   int16x8_t buf1[16];
-  const transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
+  const col_transform_1d_lbd_4_neon col_txfm = col_txfm4x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x4_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
-  load_buffer_16bit_to_16bit_w4(input, stride, buf0, 16);
-  shift_left_2_s16_x4(buf0, buf0, 16);
-  col_txfm(buf0, buf0, 13);
-
+  col_txfm(input, buf0, stride, 13);
   shift_right_1_round_s16_x4(buf0, buf0, 16);
   transpose_arrays_s16_4x8(buf0, buf1);
   transpose_arrays_s16_4x8(buf0 + 8, buf1 + 8);
@@ -1798,12 +1960,10 @@
     if (lr_flip) {
       int16x8_t buf2[16];
       flip_buf_8_neon(buf1 + 8 * i, buf2, 4);
-      row_txfm(buf2, buf2, 12);
-      store_buffer_16bit_to_32bit_w8(buf2, output + 8 * i, 16, 4);
+      row_txfm(buf2, output + 8 * i, 16, 12);
     } else {
       int16x8_t *buf = buf1 + 8 * i;
-      row_txfm(buf, buf, 12);
-      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 4);
+      row_txfm(buf, output + 8 * i, 16, 12);
     }
   }
 }
@@ -1813,26 +1973,22 @@
   (void)bd;
   int16x8_t buf0[8];
   int16x4_t buf1[8];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
-  const transform_1d_lbd_4_neon row_txfm = row_txfm4x8_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const row_transform_1d_lbd_4_neon row_txfm = row_rect_txfm4x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
-  load_buffer_16bit_to_16bit(input, stride, buf0, 4);
-  shift_left_2_s16_x8(buf0, buf0, 4);
-  col_txfm(buf0, buf0, 13);
+  col_txfm(input, buf0, stride, 13);
   shift_right_1_round_s16_x8(buf0, buf0, 4);
   transpose_arrays_s16_8x4(buf0, buf1);
 
   if (lr_flip) {
     int16x4_t buf2[8];
     flip_buf_4_neon(buf1, buf2, 8);
-    row_txfm(buf2, buf1, 13);
-    store_rect_buffer_16bit_to_32bit_w4(buf1, output, 4, 8);
+    row_txfm(buf2, output, 4, 13);
   } else {
-    row_txfm(buf1, buf1, 13);
-    store_rect_buffer_16bit_to_32bit_w4(buf1, output, 4, 8);
+    row_txfm(buf1, output, 4, 13);
   }
 }
 
@@ -1840,25 +1996,21 @@
                                       int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[8], buf1[8];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
-  load_buffer_16bit_to_16bit(input, stride, buf0, 8);
-  shift_left_2_s16_x8(buf0, buf0, 8);
-  col_txfm(buf0, buf0, 13);
+  col_txfm(input, buf0, stride, 13);
   shift_right_1_round_s16_x8(buf0, buf0, 8);
   transpose_arrays_s16_8x8(buf0, buf1);
 
   if (lr_flip) {
     flip_buf_8_neon(buf1, buf0, 8);
-    row_txfm(buf0, buf0, 13);
-    store_buffer_16bit_to_32bit_w8(buf0, output, 8, 8);
+    row_txfm(buf0, output, 8, 13);
   } else {
-    row_txfm(buf1, buf1, 13);
-    store_buffer_16bit_to_32bit_w8(buf1, output, 8, 8);
+    row_txfm(buf1, output, 8, 13);
   }
 }
 
@@ -1866,15 +2018,13 @@
                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[16], buf1[16];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
-  load_buffer_16bit_to_16bit(input, stride, buf0, 16);
-  shift_left_2_s16_x8(buf0, buf0, 16);
-  col_txfm(buf0, buf0, 13);
+  col_txfm(input, buf0, stride, 13);
   shift_right_2_round_s16_x8(buf0, buf0, 16);
   transpose_arrays_s16_8x8(buf0, buf1);
   transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
@@ -1882,12 +2032,10 @@
   for (int i = 0; i < 2; i++) {
     if (lr_flip) {
       flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
-      row_txfm(buf0, buf0, 13);
-      store_rect_buffer_16bit_to_32bit_w8(buf0, output + 8 * i, 16, 8);
+      row_txfm(buf0, output + 8 * i, 16, 13);
     } else {
       int16x8_t *buf = buf1 + 8 * i;
-      row_txfm(buf, buf, 13);
-      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 8);
+      row_txfm(buf, output + 8 * i, 16, 13);
     }
   }
 }
@@ -1896,15 +2044,13 @@
                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[32];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x8_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
-  load_buffer_16bit_to_16bit(input, stride, buf0, 32);
-  shift_left_2_s16_x8(buf0, buf0, 32);
-  col_txfm(buf0, buf0, 12);
+  col_txfm(input, buf0, stride, 12);
   shift_right_2_round_s16_x8(buf0, buf0, 32);
   transpose_arrays_s16_8x8(buf0, buf1);
   transpose_arrays_s16_8x8(buf0 + 8, buf1 + 8);
@@ -1914,12 +2060,10 @@
   for (int i = 0; i < 4; i++) {
     if (lr_flip) {
       flip_buf_8_neon(buf1 + 8 * i, buf0, 8);
-      row_txfm(buf0, buf0, 12);
-      store_buffer_16bit_to_32bit_w8(buf0, output + 8 * i, 32, 8);
+      row_txfm(buf0, output + 8 * i, 32, 12);
     } else {
       int16x8_t *buf = buf1 + 8 * i;
-      row_txfm(buf, buf, 12);
-      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 8);
+      row_txfm(buf, output + 8 * i, 32, 12);
     }
   }
 }
@@ -1930,27 +2074,23 @@
   int16x8_t buf0[16];
   int16x4_t buf1[16];
   int16x4_t buf2[16];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
-  const transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x4_arr[tx_type];
+  const row_transform_1d_lbd_4_neon row_txfm = row_txfm4x16_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 4);
   for (int i = 0; i < 2; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 4);
-    shift_left_2_s16_x8(buf0, buf0, 4);
-    col_txfm(buf0, buf0, 13);
+    col_txfm(input + 8 * i, buf0, stride, 13);
     shift_right_1_round_s16_x8(buf0, buf0, 4);
     transpose_arrays_s16_8x4(buf0, buf1 + 8 * i);
   }
 
   if (lr_flip) {
     flip_buf_4_neon(buf1, buf2, 16);
-    row_txfm(buf2, buf2, 13);
-    store_buffer_16bit_to_32bit_4_to_4(buf2, output, 4, 16);
+    row_txfm(buf2, output, 4, 13);
   } else {
-    row_txfm(buf1, buf1, 13);
-    store_buffer_16bit_to_32bit_4_to_4(buf1, output, 4, 16);
+    row_txfm(buf1, output, 4, 13);
   }
 }
 
@@ -1958,27 +2098,23 @@
                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[16], buf1[16];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   for (int i = 0; i < 2; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 8);
-    shift_left_2_s16_x8(buf0, buf0, 8);
-    col_txfm(buf0, buf0, 13);
+    col_txfm(input + 8 * i, buf0, stride, 13);
     shift_right_2_round_s16_x8(buf0, buf0, 8);
     transpose_arrays_s16_8x8(buf0, buf1 + 8 * i);
   }
 
   if (lr_flip) {
     flip_buf_8_neon(buf1, buf0, 16);
-    row_txfm(buf0, buf0, 13);
-    store_rect_buffer_16bit_to_32bit_w8(buf0, output, 8, 16);
+    row_txfm(buf0, output, 8, 13);
   } else {
-    row_txfm(buf1, buf1, 13);
-    store_rect_buffer_16bit_to_32bit_w8(buf1, output, 8, 16);
+    row_txfm(buf1, output, 8, 13);
   }
 }
 
@@ -1986,16 +2122,14 @@
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[16], buf1[32];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
   int ud_flip, lr_flip;
 
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   for (int i = 0; i < 2; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 16);
-    shift_left_2_s16_x8(buf0, buf0, 16);
-    col_txfm(buf0, buf0, 13);
+    col_txfm(input + 8 * i, buf0, stride, 13);
     shift_right_2_round_s16_x8(buf0, buf0, 16);
     transpose_arrays_s16_8x8(buf0, buf1 + 0 * 16 + 8 * i);
     transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 16 + 8 * i);
@@ -2004,12 +2138,10 @@
   for (int i = 0; i < 2; i++) {
     if (lr_flip) {
       flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
-      row_txfm(buf0, buf0, 12);
-      store_buffer_16bit_to_32bit_w8(buf0, output + 8 * i, 16, 16);
+      row_txfm(buf0, output + 8 * i, 16, 12);
     } else {
       int16x8_t *buf = buf1 + 16 * i;
-      row_txfm(buf, buf, 12);
-      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 16);
+      row_txfm(buf, output + 8 * i, 16, 12);
     }
   }
 }
@@ -2018,8 +2150,8 @@
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[64];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x16_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x16_arr[tx_type];
 
   if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_16x32_c(input, output, stride, tx_type, bd);
@@ -2030,9 +2162,7 @@
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
   for (int i = 0; i < 2; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 32);
-    shift_left_2_s16_x8(buf0, buf0, 32);
-    col_txfm(buf0, buf0, 12);
+    col_txfm(input + 8 * i, buf0, stride, 12);
     shift_right_4_round_s16_x8(buf0, buf0, 32);
     transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 16 + 8 * i);
     transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 16 + 8 * i);
@@ -2043,12 +2173,10 @@
   for (int i = 0; i < 4; i++) {
     if (lr_flip) {
       flip_buf_8_neon(buf1 + 16 * i, buf0, 16);
-      row_txfm(buf0, buf0, 13);
-      store_rect_buffer_16bit_to_32bit_w8(buf0, output + 8 * i, 32, 16);
+      row_txfm(buf0, output + 8 * i, 32, 13);
     } else {
       int16x8_t *buf = buf1 + 16 * i;
-      row_txfm(buf, buf, 13);
-      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
+      row_txfm(buf, output + 8 * i, 32, 13);
     }
   }
 }
@@ -2057,8 +2185,8 @@
                                        int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[32];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x8_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
 
   if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
@@ -2069,20 +2197,16 @@
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 8);
   for (int i = 0; i < 4; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 8);
-    shift_left_2_s16_x8(buf0, buf0, 8);
-    col_txfm(buf0, buf0, 13);
+    col_txfm(input + 8 * i, buf0, stride, 13);
     shift_right_2_round_s16_x8(buf0, buf0, 8);
     transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
   }
 
   if (lr_flip) {
     flip_buf_8_neon(buf1, buf0, 32);
-    row_txfm(buf0, buf0, 12);
-    store_buffer_16bit_to_32bit_w8(buf0, output, 8, 32);
+    row_txfm(buf0, output, 8, 12);
   } else {
-    row_txfm(buf1, buf1, 12);
-    store_buffer_16bit_to_32bit_w8(buf1, output, 8, 32);
+    row_txfm(buf1, output, 8, 12);
   }
 }
 
@@ -2090,8 +2214,8 @@
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[64];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x16_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_rect_txfm8x32_arr[tx_type];
 
   if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_32x16_c(input, output, stride, tx_type, bd);
@@ -2102,9 +2226,7 @@
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 16);
   for (int i = 0; i < 4; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 16);
-    shift_left_2_s16_x8(buf0, buf0, 16);
-    col_txfm(buf0, buf0, 13);
+    col_txfm(input + 8 * i, buf0, stride, 13);
     shift_right_4_round_s16_x8(buf0, buf0, 16);
     transpose_arrays_s16_8x8(buf0, buf1 + 0 * 32 + 8 * i);
     transpose_arrays_s16_8x8(buf0 + 8, buf1 + 1 * 32 + 8 * i);
@@ -2113,12 +2235,10 @@
   for (int i = 0; i < 2; i++) {
     if (lr_flip) {
       flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
-      row_txfm(buf0, buf0, 13);
-      store_rect_buffer_16bit_to_32bit_w8(buf0, output + 8 * i, 16, 32);
+      row_txfm(buf0, output + 8 * i, 16, 13);
     } else {
       int16x8_t *buf = buf1 + 32 * i;
-      row_txfm(buf, buf, 13);
-      store_rect_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32);
+      row_txfm(buf, output + 8 * i, 16, 13);
     }
   }
 }
@@ -2127,8 +2247,8 @@
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[32], buf1[128];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
-  const transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const row_transform_1d_lbd_8_neon row_txfm = row_txfm8x32_arr[tx_type];
 
   if (col_txfm == NULL || row_txfm == NULL) {
     av1_fwd_txfm2d_32x32_c(input, output, stride, tx_type, bd);
@@ -2139,9 +2259,7 @@
   get_flip_cfg(tx_type, &ud_flip, &lr_flip);
   ud_adjust_input_and_stride(ud_flip, &input, &stride, 32);
   for (int i = 0; i < 4; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 32);
-    shift_left_2_s16_x8(buf0, buf0, 32);
-    col_txfm(buf0, buf0, 12);
+    col_txfm(input + 8 * i, buf0, stride, 12);
     shift_right_4_round_s16_x8(buf0, buf0, 32);
     transpose_arrays_s16_8x8(buf0 + 0 * 8, buf1 + 0 * 32 + 8 * i);
     transpose_arrays_s16_8x8(buf0 + 1 * 8, buf1 + 1 * 32 + 8 * i);
@@ -2152,12 +2270,10 @@
   for (int i = 0; i < 4; i++) {
     if (lr_flip) {
       flip_buf_8_neon(buf1 + 32 * i, buf0, 32);
-      row_txfm(buf0, buf0, 12);
-      store_buffer_16bit_to_32bit_w8(buf0, output + 8 * i, 32, 32);
+      row_txfm(buf0, output + 8 * i, 32, 12);
     } else {
       int16x8_t *buf = buf1 + 32 * i;
-      row_txfm(buf, buf, 12);
-      store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 32);
+      row_txfm(buf, output + 8 * i, 32, 12);
     }
   }
 }
@@ -2172,7 +2288,7 @@
   const transform_1d_lbd_8_neon row_txfm = fdct8x64_neon;
 
   for (int i = 0; i < 8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 16);
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 16);
     shift_left_2_s16_x8(buf0, buf0, 16);
     col_txfm(buf0, buf0, 13);
     shift_right_4_round_s16_x8(buf0, buf0, 16);
@@ -2184,7 +2300,7 @@
   for (int i = 0; i < 2; i++) {
     int16x8_t *buf = buf1 + 64 * i;
     row_txfm(buf, buf, 12);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 16, 32);
+    store_buffer_s16_x8(buf, output + 8 * i, 16, 32);
   }
   // Zero out the bottom 16x32 area.
   memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
@@ -2200,7 +2316,7 @@
   const transform_1d_lbd_8_neon row_txfm = fdct8x16_neon;
 
   for (int i = 0; i < 2; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 64);
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
     col_txfm(buf0, buf0, 13);
     shift_right_2_round_s16_x8(buf0, buf0, 64);
     for (int j = 0; j < 8; ++j) {
@@ -2211,7 +2327,7 @@
   for (int i = 0; i < 4; i++) {
     int16x8_t *buf = buf1 + 16 * i;
     row_txfm(buf, buf, 12);
-    store_buffer_16bit_to_32bit_w8(buf, output + 8 * i, 32, 16);
+    store_buffer_s16_x8(buf, output + 8 * i, 32, 16);
   }
 }
 
@@ -2647,7 +2763,7 @@
   const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
 
   for (int i = 0; i < 8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 64);
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
     col_txfm(buf0, buf0, 13);
     shift_right_2_round_s16_x8(buf0, buf0, 64);
     for (int j = 0; j < 4; ++j) {
@@ -2666,7 +2782,7 @@
     fdct64_new_neon(bufB, bufB, 10);
     shift_right_2_round_s32_x4(bufA, bufA, 32);
     shift_right_2_round_s32_x4(bufB, bufB, 32);
-    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   }
 }
 
@@ -2674,12 +2790,10 @@
                                         int stride, TX_TYPE tx_type, int bd) {
   (void)bd;
   int16x8_t buf0[64], buf1[256];
-  const transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
+  const col_transform_1d_lbd_8_neon col_txfm = col_txfm8x32_arr[tx_type];
 
   for (int i = 0; i < 8; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 32);
-    shift_left_2_s16_x8(buf0, buf0, 32);
-    col_txfm(buf0, buf0, 12);
+    col_txfm(input + 8 * i, buf0, stride, 12);
     shift_right_4_round_s16_x8(buf0, buf0, 32);
     for (int j = 0; j < 4; ++j) {
       transpose_arrays_s16_8x8(buf0 + j * 8, buf1 + j * 64 + 8 * i);
@@ -2700,7 +2814,7 @@
     shift_right_2_round_s32_x4(bufB, bufB, 32);
     round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
     round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
-    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   }
 }
 
@@ -2713,7 +2827,7 @@
   const transform_1d_lbd_8_neon col_txfm = fdct8x64_neon;
 
   for (int i = 0; i < 4; i++) {
-    load_buffer_16bit_to_16bit(input + 8 * i, stride, buf0, 64);
+    load_buffer_s16_x8(input + 8 * i, stride, buf0, 64);
     col_txfm(buf0, buf0, 13);
     shift_right_2_round_s16_x8(buf0, buf0, 64);
     for (int j = 0; j < 4; ++j) {
@@ -2735,7 +2849,7 @@
     shift_right_2_round_s32_x4(bufB, bufB, 32);
     round_shift_sqrt2_s32_s32_4xn_neon(bufA, bufA, 32);
     round_shift_sqrt2_s32_s32_4xn_neon(bufB, bufB, 32);
-    store_output_32bit_w8(output + i * 8, bufA, bufB, 32, 32);
+    store_buffer_interleaved_s32_x8(output + i * 8, bufA, bufB, 32, 32);
   }
 }