Re-order av1_cospi_arr_q13_data for av1_fwd_txfm2d_neon.c

The txfm2d fdct and adst kernels only access particular parts of the
cospi arrays depending on the subdivision they are working with: for
example the fdct8x4_neon kernel only accesses elements originally at
indices 16 and 32 (storing cos(pi*16/128) and cos(pi*32/128)
respectively).

Since the smaller kernels only ever use a small number of these
constants it is beneficial to store the constants formed from smaller
subdivisions of 64 closer together. This allows us to make use of wider
vector load instructions: loading a pair of (64 bits each) 4-tuple
constants as a single 128-bit vector.

Working through increasing subdivisions of 64, we end up with the
following indices:

    32,
    16, 48,
    8, 24, 40, 56,
    4, 12, 20, 28, 36, 44, 52, 60,
    ...

Running the txfm2d speed tests with Clang 16 and GCC 12, this change
brings a geomean ~1% reduction in the times reported, primarily weighted
towards the smaller problem sizes where we can effectively exploit this
new data layout.

Change-Id: I919e4ddd123cf3ae5bfe5894c68c0f84db7a6293
diff --git a/av1/common/av1_txfm.c b/av1/common/av1_txfm.c
index 5facd33..5add3ab 100644
--- a/av1/common/av1_txfm.c
+++ b/av1/common/av1_txfm.c
@@ -55,70 +55,84 @@
 // cospi array are stored adjacent in memory, followed immediately by the same
 // constants but negated, i.e.:
 //   f(i,j) = (int)round(cos(PI*j/128) * (1<<(cos_bit_min+i))) << (3-i)
-//   av1_cospi_arr_q13_data[i][4*j+0] = f(i,j)
-//   av1_cospi_arr_q13_data[i][4*j+1] = f(i,63-j)
-//   av1_cospi_arr_q13_data[i][4*j+2] = -av1_cospi_arr_q13_data[i][4*j+0]
-//   av1_cospi_arr_q13_data[i][4*j+3] = -av1_cospi_arr_q13_data[i][4*j+1]
-// See also: https://en.wikipedia.org/wiki/Q_(number_format)
-const int16_t
-    av1_cospi_arr_q13_data[4][132] = {
-      { 8192,  0,     -8192, 0,     8192,  200,   -8192, -200,  8184,  400,
-        -8184, -400,  8168,  600,   -8168, -600,  8152,  800,   -8152, -800,
-        8128,  1000,  -8128, -1000, 8104,  1200,  -8104, -1200, 8072,  1400,
-        -8072, -1400, 8032,  1600,  -8032, -1600, 7992,  1792,  -7992, -1792,
-        7944,  1992,  -7944, -1992, 7896,  2184,  -7896, -2184, 7840,  2376,
-        -7840, -2376, 7776,  2568,  -7776, -2568, 7712,  2760,  -7712, -2760,
-        7640,  2952,  -7640, -2952, 7568,  3136,  -7568, -3136, 7488,  3320,
-        -7488, -3320, 7408,  3504,  -7408, -3504, 7320,  3680,  -7320, -3680,
-        7224,  3864,  -7224, -3864, 7128,  4040,  -7128, -4040, 7024,  4208,
-        -7024, -4208, 6920,  4384,  -6920, -4384, 6808,  4552,  -6808, -4552,
-        6696,  4720,  -6696, -4720, 6576,  4880,  -6576, -4880, 6456,  5040,
-        -6456, -5040, 6336,  5200,  -6336, -5200, 6200,  5352,  -6200, -5352,
-        6072,  5504,  -6072, -5504, 5936,  5648,  -5936, -5648, 5792,  5792,
-        -5792, -5792 },
-      { 8192,  0,     -8192, 0,     8188,  200,   -8188, -200,  8184,  400,
-        -8184, -400,  8168,  604,   -8168, -604,  8152,  804,   -8152, -804,
-        8132,  1004,  -8132, -1004, 8104,  1204,  -8104, -1204, 8072,  1400,
-        -8072, -1400, 8036,  1600,  -8036, -1600, 7992,  1796,  -7992, -1796,
-        7948,  1992,  -7948, -1992, 7896,  2184,  -7896, -2184, 7840,  2380,
-        -7840, -2380, 7780,  2568,  -7780, -2568, 7712,  2760,  -7712, -2760,
-        7644,  2948,  -7644, -2948, 7568,  3136,  -7568, -3136, 7488,  3320,
-        -7488, -3320, 7404,  3504,  -7404, -3504, 7316,  3684,  -7316, -3684,
-        7224,  3860,  -7224, -3860, 7128,  4036,  -7128, -4036, 7028,  4212,
-        -7028, -4212, 6920,  4384,  -6920, -4384, 6812,  4552,  -6812, -4552,
-        6696,  4716,  -6696, -4716, 6580,  4880,  -6580, -4880, 6460,  5040,
-        -6460, -5040, 6332,  5196,  -6332, -5196, 6204,  5352,  -6204, -5352,
-        6068,  5500,  -6068, -5500, 5932,  5648,  -5932, -5648, 5792,  5792,
-        -5792, -5792 },
-      { 8192,  0,     -8192, 0,     8190,  202,   -8190, -202,  8182,  402,
-        -8182, -402,  8170,  602,   -8170, -602,  8152,  802,   -8152, -802,
-        8130,  1002,  -8130, -1002, 8104,  1202,  -8104, -1202, 8072,  1400,
-        -8072, -1400, 8034,  1598,  -8034, -1598, 7992,  1794,  -7992, -1794,
-        7946,  1990,  -7946, -1990, 7896,  2184,  -7896, -2184, 7840,  2378,
-        -7840, -2378, 7778,  2570,  -7778, -2570, 7714,  2760,  -7714, -2760,
-        7644,  2948,  -7644, -2948, 7568,  3134,  -7568, -3134, 7490,  3320,
-        -7490, -3320, 7406,  3502,  -7406, -3502, 7318,  3684,  -7318, -3684,
-        7224,  3862,  -7224, -3862, 7128,  4038,  -7128, -4038, 7026,  4212,
-        -7026, -4212, 6922,  4382,  -6922, -4382, 6812,  4552,  -6812, -4552,
-        6698,  4718,  -6698, -4718, 6580,  4880,  -6580, -4880, 6458,  5040,
-        -6458, -5040, 6332,  5196,  -6332, -5196, 6204,  5350,  -6204, -5350,
-        6070,  5502,  -6070, -5502, 5934,  5648,  -5934, -5648, 5792,  5792,
-        -5792, -5792 },
-      { 8192,  0,     -8192, 0,     8190,  201,   -8190, -201,  8182,  402,
-        -8182, -402,  8170,  603,   -8170, -603,  8153,  803,   -8153, -803,
-        8130,  1003,  -8130, -1003, 8103,  1202,  -8103, -1202, 8071,  1401,
-        -8071, -1401, 8035,  1598,  -8035, -1598, 7993,  1795,  -7993, -1795,
-        7946,  1990,  -7946, -1990, 7895,  2185,  -7895, -2185, 7839,  2378,
-        -7839, -2378, 7779,  2570,  -7779, -2570, 7713,  2760,  -7713, -2760,
-        7643,  2948,  -7643, -2948, 7568,  3135,  -7568, -3135, 7489,  3320,
-        -7489, -3320, 7405,  3503,  -7405, -3503, 7317,  3683,  -7317, -3683,
-        7225,  3862,  -7225, -3862, 7128,  4038,  -7128, -4038, 7027,  4212,
-        -7027, -4212, 6921,  4383,  -6921, -4383, 6811,  4551,  -6811, -4551,
-        6698,  4717,  -6698, -4717, 6580,  4880,  -6580, -4880, 6458,  5040,
-        -6458, -5040, 6333,  5197,  -6333, -5197, 6203,  5351,  -6203, -5351,
-        6070,  5501,  -6070, -5501, 5933,  5649,  -5933, -5649, 5793,  5793,
-        -5793, -5793 },
-    };
+// and then in memory we store 4-tuples of constants together as:
+//   f4(i,j) = [ f(i,j), f(i,64-j), -f(i,j), -f(i,64-j) ]
+//
+// Constants are stored in Q2.13 format, see:
+// https://en.wikipedia.org/wiki/Q_(number_format)
+//
+// The order of the constants is such that increasing subdivisions of 64 store
+// f4 tuples contiguously:
+// av1_cospi_arr_q13_data[i] = {
+//   f4(i,32),  // f(i,32) twice
+//   f4(i,16),  // f(i,16) and f(i,48), f4(i,32) skipped since present above.
+//   f4(i,8), f4(i,24), // f4(i,16) and f4(i,32) skipped since present above.
+//   f4(i,4), f4(i,12), f4(i,20), f4(i,28),
+//   f4(i,2), f4(i,6), f4(i,10), f4(i,14), f4(i,18), ...
+//   f4(i,1), f4(i,3), f4(i,5), f4(i,7), f4(i,9), f4(i,11), ...
+// }
+const int16_t av1_cospi_arr_q13_data[4][128] = {
+  {
+      5792,  5792,  -5792, -5792, 7568,  3136,  -7568, -3136, 8032,  1600,
+      -8032, -1600, 6808,  4552,  -6808, -4552, 8152,  800,   -8152, -800,
+      7840,  2376,  -7840, -2376, 7224,  3864,  -7224, -3864, 6336,  5200,
+      -6336, -5200, 8184,  400,   -8184, -400,  8104,  1200,  -8104, -1200,
+      7944,  1992,  -7944, -1992, 7712,  2760,  -7712, -2760, 7408,  3504,
+      -7408, -3504, 7024,  4208,  -7024, -4208, 6576,  4880,  -6576, -4880,
+      6072,  5504,  -6072, -5504, 8192,  200,   -8192, -200,  8168,  600,
+      -8168, -600,  8128,  1000,  -8128, -1000, 8072,  1400,  -8072, -1400,
+      7992,  1792,  -7992, -1792, 7896,  2184,  -7896, -2184, 7776,  2568,
+      -7776, -2568, 7640,  2952,  -7640, -2952, 7488,  3320,  -7488, -3320,
+      7320,  3680,  -7320, -3680, 7128,  4040,  -7128, -4040, 6920,  4384,
+      -6920, -4384, 6696,  4720,  -6696, -4720, 6456,  5040,  -6456, -5040,
+      6200,  5352,  -6200, -5352, 5936,  5648,  -5936, -5648,
+  },
+  {
+      5792,  5792,  -5792, -5792, 7568,  3136,  -7568, -3136, 8036,  1600,
+      -8036, -1600, 6812,  4552,  -6812, -4552, 8152,  804,   -8152, -804,
+      7840,  2380,  -7840, -2380, 7224,  3860,  -7224, -3860, 6332,  5196,
+      -6332, -5196, 8184,  400,   -8184, -400,  8104,  1204,  -8104, -1204,
+      7948,  1992,  -7948, -1992, 7712,  2760,  -7712, -2760, 7404,  3504,
+      -7404, -3504, 7028,  4212,  -7028, -4212, 6580,  4880,  -6580, -4880,
+      6068,  5500,  -6068, -5500, 8188,  200,   -8188, -200,  8168,  604,
+      -8168, -604,  8132,  1004,  -8132, -1004, 8072,  1400,  -8072, -1400,
+      7992,  1796,  -7992, -1796, 7896,  2184,  -7896, -2184, 7780,  2568,
+      -7780, -2568, 7644,  2948,  -7644, -2948, 7488,  3320,  -7488, -3320,
+      7316,  3684,  -7316, -3684, 7128,  4036,  -7128, -4036, 6920,  4384,
+      -6920, -4384, 6696,  4716,  -6696, -4716, 6460,  5040,  -6460, -5040,
+      6204,  5352,  -6204, -5352, 5932,  5648,  -5932, -5648,
+  },
+  {
+      5792,  5792,  -5792, -5792, 7568,  3134,  -7568, -3134, 8034,  1598,
+      -8034, -1598, 6812,  4552,  -6812, -4552, 8152,  802,   -8152, -802,
+      7840,  2378,  -7840, -2378, 7224,  3862,  -7224, -3862, 6332,  5196,
+      -6332, -5196, 8182,  402,   -8182, -402,  8104,  1202,  -8104, -1202,
+      7946,  1990,  -7946, -1990, 7714,  2760,  -7714, -2760, 7406,  3502,
+      -7406, -3502, 7026,  4212,  -7026, -4212, 6580,  4880,  -6580, -4880,
+      6070,  5502,  -6070, -5502, 8190,  202,   -8190, -202,  8170,  602,
+      -8170, -602,  8130,  1002,  -8130, -1002, 8072,  1400,  -8072, -1400,
+      7992,  1794,  -7992, -1794, 7896,  2184,  -7896, -2184, 7778,  2570,
+      -7778, -2570, 7644,  2948,  -7644, -2948, 7490,  3320,  -7490, -3320,
+      7318,  3684,  -7318, -3684, 7128,  4038,  -7128, -4038, 6922,  4382,
+      -6922, -4382, 6698,  4718,  -6698, -4718, 6458,  5040,  -6458, -5040,
+      6204,  5350,  -6204, -5350, 5934,  5648,  -5934, -5648,
+  },
+  {
+      5793,  5793,  -5793, -5793, 7568,  3135,  -7568, -3135, 8035,  1598,
+      -8035, -1598, 6811,  4551,  -6811, -4551, 8153,  803,   -8153, -803,
+      7839,  2378,  -7839, -2378, 7225,  3862,  -7225, -3862, 6333,  5197,
+      -6333, -5197, 8182,  402,   -8182, -402,  8103,  1202,  -8103, -1202,
+      7946,  1990,  -7946, -1990, 7713,  2760,  -7713, -2760, 7405,  3503,
+      -7405, -3503, 7027,  4212,  -7027, -4212, 6580,  4880,  -6580, -4880,
+      6070,  5501,  -6070, -5501, 8190,  201,   -8190, -201,  8170,  603,
+      -8170, -603,  8130,  1003,  -8130, -1003, 8071,  1401,  -8071, -1401,
+      7993,  1795,  -7993, -1795, 7895,  2185,  -7895, -2185, 7779,  2570,
+      -7779, -2570, 7643,  2948,  -7643, -2948, 7489,  3320,  -7489, -3320,
+      7317,  3683,  -7317, -3683, 7128,  4038,  -7128, -4038, 6921,  4383,
+      -6921, -4383, 6698,  4717,  -6698, -4717, 6458,  5040,  -6458, -5040,
+      6203,  5351,  -6203, -5351, 5933,  5649,  -5933, -5649,
+  }
+};
 
 // av1_sinpi_arr_q13_data[i][j] =
 //   round((sqrt2 * sin((j+1)*Pi/9) * 2/3) * (1 << (cos_bit_min + i))) << (3-i)
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index 5c02896..8b7e174 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -57,7 +57,7 @@
 #if HAVE_NEON
 // Store cospi/sinpi costants in Q2.13 format.
 // See: https://en.wikipedia.org/wiki/Q_(number_format)
-extern const int16_t av1_cospi_arr_q13_data[4][132];
+extern const int16_t av1_cospi_arr_q13_data[4][128];
 extern const int16_t av1_sinpi_arr_q13_data[4][4];
 
 static INLINE const int16_t *cospi_arr_q13(int n) {
diff --git a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
index 500d957..72f88c7 100644
--- a/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
+++ b/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c
@@ -378,12 +378,17 @@
 static AOM_FORCE_INLINE void fadst4x8_neon(const int16x4_t *input,
                                            int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 1-2
   int16x4_t x2[8];
@@ -479,15 +484,15 @@
 static AOM_FORCE_INLINE void fdct4x4_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
+  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 1]);
 
   int16x4_t in12a = vadd_s16(input[1], input[2]);
   int16x4_t in12s = vsub_s16(input[1], input[2]);
   int16x4_t in03a = vadd_s16(input[0], input[3]);
   int16x4_t in03s = vsub_s16(input[0], input[3]);
 
-  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 32]);
-  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 32]);
+  int32x4_t u0ad1 = vmull_n_s16(in12a, cospi[4 * 0]);
+  int32x4_t u0ad2 = vmull_n_s16(in03a, cospi[4 * 0]);
 
   int32x4_t u[4];
   u[0] = vaddq_s32(u0ad1, u0ad2);
@@ -608,8 +613,11 @@
 static AOM_FORCE_INLINE void fdct8x4_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
 
   // stage 1
   int16x8_t x1[4];
@@ -630,10 +638,14 @@
 static AOM_FORCE_INLINE void fdct4x8_neon(const int16x4_t *input,
                                           int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
 
   // stage 1
   int16x4_t x1[8];
@@ -658,10 +670,14 @@
 static AOM_FORCE_INLINE void fdct8x8_neon(const int16x8_t *input,
                                           int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
 
   // stage 1
   int16x8_t x1[8];
@@ -686,14 +702,20 @@
 static AOM_FORCE_INLINE void fdct4x16_neon(const int16x4_t *input,
                                            int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 1
   int16x4_t x1[16];
@@ -742,14 +764,20 @@
 static AOM_FORCE_INLINE void fdct8x16_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 1
   int16x8_t x1[16];
@@ -798,22 +826,32 @@
 static AOM_FORCE_INLINE void fdct8x32_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi10 = vld1_s16(&cospi[4 * 10]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi14 = vld1_s16(&cospi[4 * 14]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi18 = vld1_s16(&cospi[4 * 18]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi22 = vld1_s16(&cospi[4 * 22]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi26 = vld1_s16(&cospi[4 * 26]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi30 = vld1_s16(&cospi[4 * 30]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
 
   // stage 1
   int16x8_t x1[32];
@@ -904,38 +942,56 @@
 static AOM_FORCE_INLINE void fdct8x64_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi1 = vld1_s16(&cospi[4 * 1]);
-  const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
-  const int16x4_t cospi3 = vld1_s16(&cospi[4 * 3]);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi5 = vld1_s16(&cospi[4 * 5]);
-  const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
-  const int16x4_t cospi7 = vld1_s16(&cospi[4 * 7]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi9 = vld1_s16(&cospi[4 * 9]);
-  const int16x4_t cospi10 = vld1_s16(&cospi[4 * 10]);
-  const int16x4_t cospi11 = vld1_s16(&cospi[4 * 11]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi13 = vld1_s16(&cospi[4 * 13]);
-  const int16x4_t cospi14 = vld1_s16(&cospi[4 * 14]);
-  const int16x4_t cospi15 = vld1_s16(&cospi[4 * 15]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi17 = vld1_s16(&cospi[4 * 17]);
-  const int16x4_t cospi18 = vld1_s16(&cospi[4 * 18]);
-  const int16x4_t cospi19 = vld1_s16(&cospi[4 * 19]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi21 = vld1_s16(&cospi[4 * 21]);
-  const int16x4_t cospi22 = vld1_s16(&cospi[4 * 22]);
-  const int16x4_t cospi23 = vld1_s16(&cospi[4 * 23]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi25 = vld1_s16(&cospi[4 * 25]);
-  const int16x4_t cospi26 = vld1_s16(&cospi[4 * 26]);
-  const int16x4_t cospi27 = vld1_s16(&cospi[4 * 27]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi29 = vld1_s16(&cospi[4 * 29]);
-  const int16x4_t cospi30 = vld1_s16(&cospi[4 * 30]);
-  const int16x4_t cospi31 = vld1_s16(&cospi[4 * 31]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+  const int16x4_t cospi31 = vget_high_s16(cospi29_31);
 
   // stage 1
   int16x8_t x1[64];
@@ -1139,12 +1195,17 @@
 static AOM_FORCE_INLINE void fadst8x8_neon(const int16x8_t *input,
                                            int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
 
   // stage 2
   int16x8_t x2[8];
@@ -1187,18 +1248,26 @@
 static AOM_FORCE_INLINE void fadst4x16_neon(const int16x4_t *input,
                                             int16x4_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
-  const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi10 = vld1_s16(&cospi[4 * 10]);
-  const int16x4_t cospi14 = vld1_s16(&cospi[4 * 14]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi18 = vld1_s16(&cospi[4 * 18]);
-  const int16x4_t cospi22 = vld1_s16(&cospi[4 * 22]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi26 = vld1_s16(&cospi[4 * 26]);
-  const int16x4_t cospi30 = vld1_s16(&cospi[4 * 30]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
 
   // stage 2
   int16x4_t x2[8];
@@ -1295,18 +1364,26 @@
 static AOM_FORCE_INLINE void fadst8x16_neon(const int16x8_t *input,
                                             int16x8_t *output, int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
-  const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi10 = vld1_s16(&cospi[4 * 10]);
-  const int16x4_t cospi14 = vld1_s16(&cospi[4 * 14]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi18 = vld1_s16(&cospi[4 * 18]);
-  const int16x4_t cospi22 = vld1_s16(&cospi[4 * 22]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi26 = vld1_s16(&cospi[4 * 26]);
-  const int16x4_t cospi30 = vld1_s16(&cospi[4 * 30]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
 
   // stage 2
   int16x8_t x2[8];
@@ -2331,24 +2408,35 @@
   }
 }
 
-static void fdct32_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit) {
+static void fdct32_new_neon(const int32x4_t *input, int32x4_t *output,
+                            int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi10 = vld1_s16(&cospi[4 * 10]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi14 = vld1_s16(&cospi[4 * 14]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi18 = vld1_s16(&cospi[4 * 18]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi22 = vld1_s16(&cospi[4 * 22]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi26 = vld1_s16(&cospi[4 * 26]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi30 = vld1_s16(&cospi[4 * 30]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
 
   int32x4_t buf0[32];
   int32x4_t buf1[32];
@@ -2542,40 +2630,59 @@
   output[31] = buf0[31];
 }
 
-static void fdct64_new_neon(int32x4_t *input, int32x4_t *output, int cos_bit) {
+static void fdct64_new_neon(const int32x4_t *input, int32x4_t *output,
+                            int cos_bit) {
   const int16_t *cospi = cospi_arr_q13(cos_bit);
-  const int16x4_t cospi1 = vld1_s16(&cospi[4 * 1]);
-  const int16x4_t cospi2 = vld1_s16(&cospi[4 * 2]);
-  const int16x4_t cospi3 = vld1_s16(&cospi[4 * 3]);
-  const int16x4_t cospi4 = vld1_s16(&cospi[4 * 4]);
-  const int16x4_t cospi5 = vld1_s16(&cospi[4 * 5]);
-  const int16x4_t cospi6 = vld1_s16(&cospi[4 * 6]);
-  const int16x4_t cospi7 = vld1_s16(&cospi[4 * 7]);
-  const int16x4_t cospi8 = vld1_s16(&cospi[4 * 8]);
-  const int16x4_t cospi9 = vld1_s16(&cospi[4 * 9]);
-  const int16x4_t cospi10 = vld1_s16(&cospi[4 * 10]);
-  const int16x4_t cospi11 = vld1_s16(&cospi[4 * 11]);
-  const int16x4_t cospi12 = vld1_s16(&cospi[4 * 12]);
-  const int16x4_t cospi13 = vld1_s16(&cospi[4 * 13]);
-  const int16x4_t cospi14 = vld1_s16(&cospi[4 * 14]);
-  const int16x4_t cospi15 = vld1_s16(&cospi[4 * 15]);
-  const int16x4_t cospi16 = vld1_s16(&cospi[4 * 16]);
-  const int16x4_t cospi17 = vld1_s16(&cospi[4 * 17]);
-  const int16x4_t cospi18 = vld1_s16(&cospi[4 * 18]);
-  const int16x4_t cospi19 = vld1_s16(&cospi[4 * 19]);
-  const int16x4_t cospi20 = vld1_s16(&cospi[4 * 20]);
-  const int16x4_t cospi21 = vld1_s16(&cospi[4 * 21]);
-  const int16x4_t cospi22 = vld1_s16(&cospi[4 * 22]);
-  const int16x4_t cospi23 = vld1_s16(&cospi[4 * 23]);
-  const int16x4_t cospi24 = vld1_s16(&cospi[4 * 24]);
-  const int16x4_t cospi25 = vld1_s16(&cospi[4 * 25]);
-  const int16x4_t cospi26 = vld1_s16(&cospi[4 * 26]);
-  const int16x4_t cospi27 = vld1_s16(&cospi[4 * 27]);
-  const int16x4_t cospi28 = vld1_s16(&cospi[4 * 28]);
-  const int16x4_t cospi29 = vld1_s16(&cospi[4 * 29]);
-  const int16x4_t cospi30 = vld1_s16(&cospi[4 * 30]);
-  const int16x4_t cospi31 = vld1_s16(&cospi[4 * 31]);
-  const int16x4_t cospi32 = vld1_s16(&cospi[4 * 32]);
+
+  const int16x8_t cospi32_16 = vld1q_s16(&cospi[4 * 0]);
+  const int16x8_t cospi8_24 = vld1q_s16(&cospi[4 * 2]);
+  const int16x8_t cospi4_12 = vld1q_s16(&cospi[4 * 4]);
+  const int16x8_t cospi20_28 = vld1q_s16(&cospi[4 * 6]);
+  const int16x8_t cospi2_6 = vld1q_s16(&cospi[4 * 8]);
+  const int16x8_t cospi10_14 = vld1q_s16(&cospi[4 * 10]);
+  const int16x8_t cospi18_22 = vld1q_s16(&cospi[4 * 12]);
+  const int16x8_t cospi26_30 = vld1q_s16(&cospi[4 * 14]);
+  const int16x8_t cospi1_3 = vld1q_s16(&cospi[4 * 16]);
+  const int16x8_t cospi5_7 = vld1q_s16(&cospi[4 * 18]);
+  const int16x8_t cospi9_11 = vld1q_s16(&cospi[4 * 20]);
+  const int16x8_t cospi13_15 = vld1q_s16(&cospi[4 * 22]);
+  const int16x8_t cospi17_19 = vld1q_s16(&cospi[4 * 24]);
+  const int16x8_t cospi21_23 = vld1q_s16(&cospi[4 * 26]);
+  const int16x8_t cospi25_27 = vld1q_s16(&cospi[4 * 28]);
+  const int16x8_t cospi29_31 = vld1q_s16(&cospi[4 * 30]);
+
+  const int16x4_t cospi32 = vget_low_s16(cospi32_16);
+  const int16x4_t cospi16 = vget_high_s16(cospi32_16);
+  const int16x4_t cospi8 = vget_low_s16(cospi8_24);
+  const int16x4_t cospi24 = vget_high_s16(cospi8_24);
+  const int16x4_t cospi4 = vget_low_s16(cospi4_12);
+  const int16x4_t cospi12 = vget_high_s16(cospi4_12);
+  const int16x4_t cospi20 = vget_low_s16(cospi20_28);
+  const int16x4_t cospi28 = vget_high_s16(cospi20_28);
+  const int16x4_t cospi2 = vget_low_s16(cospi2_6);
+  const int16x4_t cospi6 = vget_high_s16(cospi2_6);
+  const int16x4_t cospi10 = vget_low_s16(cospi10_14);
+  const int16x4_t cospi14 = vget_high_s16(cospi10_14);
+  const int16x4_t cospi18 = vget_low_s16(cospi18_22);
+  const int16x4_t cospi22 = vget_high_s16(cospi18_22);
+  const int16x4_t cospi26 = vget_low_s16(cospi26_30);
+  const int16x4_t cospi30 = vget_high_s16(cospi26_30);
+  const int16x4_t cospi1 = vget_low_s16(cospi1_3);
+  const int16x4_t cospi3 = vget_high_s16(cospi1_3);
+  const int16x4_t cospi5 = vget_low_s16(cospi5_7);
+  const int16x4_t cospi7 = vget_high_s16(cospi5_7);
+  const int16x4_t cospi9 = vget_low_s16(cospi9_11);
+  const int16x4_t cospi11 = vget_high_s16(cospi9_11);
+  const int16x4_t cospi13 = vget_low_s16(cospi13_15);
+  const int16x4_t cospi15 = vget_high_s16(cospi13_15);
+  const int16x4_t cospi17 = vget_low_s16(cospi17_19);
+  const int16x4_t cospi19 = vget_high_s16(cospi17_19);
+  const int16x4_t cospi21 = vget_low_s16(cospi21_23);
+  const int16x4_t cospi23 = vget_high_s16(cospi21_23);
+  const int16x4_t cospi25 = vget_low_s16(cospi25_27);
+  const int16x4_t cospi27 = vget_high_s16(cospi25_27);
+  const int16x4_t cospi29 = vget_low_s16(cospi29_31);
+  const int16x4_t cospi31 = vget_high_s16(cospi29_31);
 
   // stage 1
   int32x4_t x1[64];