/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/arm/transpose_neon.h"
#include "av1/common/av1_inv_txfm1d.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#include "av1/common/arm/av1_inv_txfm_neon.h"

// 1D itx types
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,
} ITX_TYPE_1D;

static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};

static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
};
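
// vitx_1d_tab selects the 1D column (vertical) transform and hitx_1d_tab the
// 1D row (horizontal) transform for each 2D TX_TYPE. The TX_TYPE names read
// <vertical>_<horizontal>: e.g. ADST_DCT maps to IADST_1D in vitx_1d_tab and
// IDCT_1D in hitx_1d_tab. The flipped variants reuse the IADST kernels
// (IFLIPADST_1D aliases IADST_1D above), with the flip itself handled when
// the residual buffer is read or written.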

// 1D functions
static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
  { av1_idct4, av1_iadst4, av1_iidentity4_c },
  { av1_idct8, av1_iadst8, av1_iidentity8_c },
  { av1_idct16, av1_iadst16, av1_iidentity16_c },
  { av1_idct32, NULL, NULL },
  { av1_idct64, NULL, NULL },
};

static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
                                                  uint8_t *output, int stride,
                                                  int flipud,
                                                  const int height) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  int16x8_t temp_output;
  for (int i = 0; i < height; ++i, j += step) {
    temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
    temp_output = vaddq_s16(temp_output, in[j]);
    vst1_u8(output, vqmovun_s16(temp_output));
    output += stride;
  }
}
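
// A scalar sketch of the helper above: each iteration widens 8 predicted
// pixels to 16 bits, adds the residual row in[j], then saturates back to
// uint8_t and stores. With flipud set, residual rows are consumed bottom to
// top, which provides the vertical flip needed by the FLIPADST transforms.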

static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
                                                    int16x8_t res0,
                                                    int16x8_t res1) {
  int16x8_t temp_output[2];
  uint8x16_t temp_output_8q;
  temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
  temp_output[0] = vaddq_s16(temp_output[0], res0);
  temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
  temp_output[1] = vaddq_s16(temp_output[1], res1);
  temp_output_8q =
      vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
  return temp_output_8q;
}

static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
                                                   uint8_t *output, int stride,
                                                   int flipud, int height) {
  uint8x16_t temp_output_8q;
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    temp_output_8q = vld1q_u8(output + i * stride);
    temp_output_8q =
        lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]);
    vst1q_u8((output + i * stride), temp_output_8q);
  }
}

static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
                                                int value) {
  for (int i = 0; i < size; i++) {
    a[i] = vdupq_n_s16((int16_t)value);
  }
}

static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}
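
// Scalar equivalent of the butterfly above, where r(v) denotes a rounding
// right-shift of v by INV_COS_BIT:
//   *t0 = r(in0 * c[0] + in1 * c[1])
//   *t1 = r(in0 * c[1] - in1 * c[0])
// The _lane_1_0/_2_3/_3_2 variants below are identical up to which lanes of
// c supply the two cosine constants (and thus which rotation is applied).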

static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0_l, s0_h, s1_l, s1_h;
  int16x4_t v0[2], v1[2];

  s0_l = vmull_n_s16(vget_low_s16(in0), coef1);
  s0_h = vmull_n_s16(vget_high_s16(in0), coef1);
  s1_l = vmull_n_s16(vget_low_s16(in0), coef2);
  s1_h = vmull_n_s16(vget_high_s16(in0), coef2);

  v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}
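
// btf_16_neon is the single-input form of the butterfly: with in1 known to be
// zero, the pair reduces to two scaled copies of in0,
//   *t0 = r(in0 * coef1),  *t1 = r(in0 * coef2),
// which is why the low-coefficient paths below (idct*_low*, iadst*_low*) call
// it instead of the two-input variants.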

static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
  int32x4_t t0[2], t1[2];
  int16x4_t v0[2], v1[2];

  // Don't add/sub before multiply, which will overflow in iadst8.
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);

  t0[0] = vaddq_s32(x0_lo, x1_lo);
  t0[1] = vaddq_s32(x0_hi, x1_hi);
  t1[0] = vsubq_s32(x0_lo, x1_lo);
  t1[1] = vsubq_s32(x0_hi, x1_hi);

  v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);

  x[0] = vcombine_s16(v0[0], v0[1]);
  x[1] = vcombine_s16(v1[0], v1[1]);
}
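
// btf_16_half_neon applies the rotation with both constants equal to c[0]
// (cospi[32] at every call site in this file):
//   x[0]' = r((x[0] + x[1]) * c[0]),  x[1]' = r((x[0] - x[1]) * c[0])
// The sums are formed on the 32-bit products rather than on the 16-bit
// inputs so that x[0] + x[1] cannot overflow int16_t (see the comment above).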

static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
                                       const int16_t c2, const int16_t c3) {
  int16x4_t val = vdup_n_s16(c0);
  val = vset_lane_s16(c1, val, 1);
  val = vset_lane_s16(c2, val, 2);
  val = vset_lane_s16(c3, val, 3);
  return val;
}

static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
                               int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[20], (int16_t)cospi[44]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Stage 1
  x[0] = in[7];
  x[1] = in[0];
  x[2] = in[5];
  x[3] = in[2];
  x[4] = in[3];
  x[5] = in[4];
  x[6] = in[1];
  x[7] = in[6];

  // Stage 2
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);

  // Stage 3
  x[0] = vqaddq_s16(s0, s4);
  x[1] = vqaddq_s16(s1, s5);
  x[2] = vqaddq_s16(s2, s6);
  x[3] = vqaddq_s16(s3, s7);
  x[4] = vqsubq_s16(s0, s4);
  x[5] = vqsubq_s16(s1, s5);
  x[6] = vqsubq_s16(s2, s6);
  x[7] = vqsubq_s16(s3, s7);

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  s2 = x[2];
  s3 = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);

  // Stage 5
  x[0] = vqaddq_s16(s0, s2);
  x[1] = vqaddq_s16(s1, s3);
  x[2] = vqsubq_s16(s0, s2);
  x[3] = vqsubq_s16(s1, s3);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);

  // Stage 6
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
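
// iadst8_neon mirrors the stage structure of the C reference (av1_iadst8):
// Stage 1 reorders the inputs, Stages 2-6 are butterfly rotations combined
// with saturating add/sub recombination, and Stage 7 writes the outputs with
// the alternating sign flips (vqnegq_s16) that the ADST basis requires.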

static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s4, s5;

  // Stage 1
  x[1] = in[0];

  // Stage 2
  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);

  // Stage 3
  x[0] = s0;
  x[1] = s1;
  x[4] = s0;
  x[5] = s1;

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);

  // Stage 5
  x[0] = s0;
  x[1] = s1;
  x[2] = s0;
  x[3] = s1;
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;

  // Stage 6
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}

static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[8], step2[8];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  // stage 2
  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);

  // stage 3
  btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);

  // stage 4
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);

  // stage 5
  out[0] = vqaddq_s16(step1[0], step2[7]);
  out[1] = vqaddq_s16(step1[1], step1[6]);
  out[2] = vqaddq_s16(step1[2], step1[5]);
  out[3] = vqaddq_s16(step1[3], step2[4]);
  out[4] = vqsubq_s16(step1[3], step2[4]);
  out[5] = vqsubq_s16(step1[2], step1[5]);
  out[6] = vqsubq_s16(step1[1], step1[6]);
  out[7] = vqsubq_s16(step1[0], step2[7]);
}
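
// idct8_neon follows the usual fast-IDCT lattice: stage 2 rotates the odd
// inputs (1, 7, 5, 3), stage 3 rotates the even ones and starts the add/sub
// recombination, stage 4 finishes the even half and rotates the middle pair
// by cospi[32], and stage 5 folds the two halves together.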

static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
                                   int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 1
  // stage 2
  // stage 3
  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);

  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 4
  // stage 5
  out[0] = step1;
  out[1] = step1;
  out[2] = step1;
  out[3] = step1;
  out[4] = step1;
  out[5] = step1;
  out[6] = step1;
  out[7] = step1;
}
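
// DC-only fast path: when only in[0] is nonzero, the whole 8-point IDCT
// collapses to a single scaling, out[k] = r(in[0] * cospi[32]) for every k,
// so the stages marked with empty comments above have nothing to compute.
// The idct16/idct32 _low1 variants below follow the same pattern.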

void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
  assert(!(size % 4));
  if (!bit) return;
  const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
  for (int i = 0; i < size; i++) {
    arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8);
  }
}

static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
  int16x8_t temp[8];
  for (int i = 0; i < size; ++i) {
    temp[i] = input[size - 1 - i];
  }
  for (int i = 0; i < size; ++i) {
    input[i] = temp[i];
  }
}

static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
                                                   int stride,
                                                   int16x8_t *const a,
                                                   int out_size) {
  for (int i = 0; i < out_size; ++i) {
    a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
                        vmovn_s32(vld1q_s32(input + 4)));
    input += stride;
  }
}

static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
                                         4 * 5793 };

static INLINE void identity_txfm_round_neon(int16x8_t *input,
                                            int16x8_t *output, int txw_idx,
                                            int8_t size, int bit) {
  const int32x4_t dup_bits_n_32x4 = vdupq_n_s32((int32_t)(-bit));
  int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
  int16x4_t low_i16, high_i16;
  int32x4_t low_i32, high_i32;
  for (int i = 0; i < size; i++) {
    int32x4_t temp_out_low = vmull_s16(vget_low_s16(input[i]), scale);
    int32x4_t temp_out_high = vmull_s16(vget_high_s16(input[i]), scale);
    low_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_low, 12), dup_bits_n_32x4);
    high_i32 = vrshlq_s32(vrshrq_n_s32(temp_out_high, 12), dup_bits_n_32x4);
    low_i16 = vqmovn_s32(low_i32);
    high_i16 = vqmovn_s32(high_i32);
    output[i] = vcombine_s16(low_i16, high_i16);
  }
}
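
// sqrt_2_list holds the extra gain of the identity transform per transform
// width in Q12 fixed point: sqrt(2) (5793 ~= 4096 * sqrt(2)) for 4-point, 2
// for 8-point, 2 * sqrt(2) for 16-point, 4 for 32-point and 4 * sqrt(2) for
// 64-point. identity_txfm_round_neon multiplies by that scale, drops the 12
// fraction bits with rounding, then applies the per-stage rounding shift
// 'bit'.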

static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
                                        int size) {
  int32x4_t out_low, out_high;
  int16x4_t low, high;

  for (int z = 0; z < size; ++z) {
    out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
    out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);

    low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
    high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);

    output[z] = vcombine_s16(low, high);
  }
}
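
// Rectangular (non-square) transforms in AV1 are scaled by 1 / sqrt(2), and
// round_shift_for_rect applies that factor in fixed point. NewInvSqrt2 and
// NewSqrt2Bits come from av1/common/av1_txfm.h (2896 and 12 respectively at
// the time of writing, i.e. 2896 / 4096 ~= 0.7071); vqrshrn_n_s32 provides
// the rounded, saturating narrow back to 16 bits.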

static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 4

  t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 6
  // stage 7
  out[0] = step1;
  out[1] = step1;
  out[2] = step1;
  out[3] = step1;
  out[4] = step1;
  out[5] = step1;
  out[6] = step1;
  out[7] = step1;
  out[8] = step1;
  out[9] = step1;
  out[10] = step1;
  out[11] = step1;
  out[12] = step1;
  out[13] = step1;
  out[14] = step1;
  out[15] = step1;
}

static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
  // stage 2

  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);

  step2[0] = in[0];
  step2[1] = in[8];
  step2[2] = in[4];
  step2[3] = in[12];
  step2[4] = in[2];
  step2[5] = in[10];
  step2[6] = in[6];
  step2[7] = in[14];

  // stage 3

  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);

  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}

static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c1 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[2] = in[4];
  step2[4] = in[2];
  step2[6] = in[6];

  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);

  // stage 3

  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6
  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7

  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}

static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
                                int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[10], (int16_t)cospi[54]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[26], (int16_t)cospi[38]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1
  x[0] = in[15];
  x[1] = in[0];
  x[2] = in[13];
  x[3] = in[2];
  x[4] = in[11];
  x[5] = in[4];
  x[6] = in[9];
  x[7] = in[6];
  x[8] = in[7];
  x[9] = in[8];
  x[10] = in[5];
  x[11] = in[10];
  x[12] = in[3];
  x[13] = in[12];
  x[14] = in[1];
  x[15] = in[14];

  // Stage 2
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);

  // Stage 3
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);

  // Stage 5
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // Stage 6
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);

  // Stage 7
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8
  btf_16_half_neon(x + 2, c5);
  btf_16_half_neon(x + 6, c5);
  btf_16_half_neon(x + 10, c5);
  btf_16_half_neon(x + 14, c5);

  // Stage 9
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}

static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[10];
  int16x8_t s0, s1, s4, s5;
  int16x8_t s8, s9, s12, s13;

  // Stage 1
  x[1] = in[0];

  // Stage 2
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);

  // Stage 3
  x[0] = s0;
  x[1] = s1;
  x[8] = s0;
  x[9] = s1;

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);

  // Stage 5
  x[0] = t[0];
  x[1] = t[1];
  x[4] = t[0];
  x[5] = t[1];
  x[8] = s8;
  x[9] = s9;
  x[12] = s8;
  x[13] = s9;

  // Stage 6
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  t[8] = x[8];
  t[9] = x[9];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);

  // Stage 7
  x[0] = t[0];
  x[1] = t[1];
  x[2] = t[0];
  x[3] = t[1];
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;
  x[8] = t[8];
  x[9] = t[9];
  x[10] = t[8];
  x[11] = t[9];
  x[12] = s12;
  x[13] = s13;
  x[14] = s12;
  x[15] = s13;

  // Stage 8
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}

static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1
  x[1] = in[0];
  x[3] = in[2];
  x[5] = in[4];
  x[7] = in[6];
  x[8] = in[7];
  x[10] = in[5];
  x[12] = in[3];
  x[14] = in[1];

  // Stage 2
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);

  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);

  // Stage 3
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);

  // Stage 5
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // Stage 6
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);

  // Stage 7
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}

static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[34], (int16_t)cospi[30]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[50], (int16_t)cospi[14]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c8 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c9 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 2

  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);

  step2[0] = in[0];
  step2[1] = in[16];
  step2[2] = in[8];
  step2[3] = in[24];
  step2[4] = in[4];
  step2[5] = in[20];
  step2[6] = in[12];
  step2[7] = in[28];
  step2[8] = in[2];
  step2[9] = in[18];
  step2[10] = in[10];
  step2[11] = in[26];
  step2[12] = in[6];
  step2[13] = in[22];
  step2[14] = in[14];
  step2[15] = in[30];

  // stage 3

  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[4] = step2[4];
  step1[5] = step2[5];
  step1[6] = step2[6];
  step1[7] = step2[7];

  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4

  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[1], step1[2]);
  step2[2] = vqsubq_s16(step1[1], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
1452
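// The stage-9 output butterflies above rely on the saturating semantics of
// vqaddq_s16()/vqsubq_s16(): intermediate sums can exceed 16 bits, and the
// q-forms clamp instead of wrapping. A minimal per-lane scalar model of one
// output pair (an illustrative sketch with a hypothetical name, not part of
// the library):
static INLINE void idct_output_butterfly_scalar_sketch(int16_t a, int16_t b,
                                                       int16_t *sum,
                                                       int16_t *diff) {
  const int32_t s = (int32_t)a + (int32_t)b;
  const int32_t d = (int32_t)a - (int32_t)b;
  // Saturate to [-32768, 32767], matching vqaddq_s16()/vqsubq_s16() per lane.
  *sum = (int16_t)(s > 32767 ? 32767 : (s < -32768 ? -32768 : s));
  *diff = (int16_t)(d > 32767 ? 32767 : (d < -32768 ? -32768 : d));
}
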
1453 static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
1454 int8_t cos_bit) {
1455 const int32_t *cospi = cospi_arr(cos_bit);
1456 int16x8_t step1;
1457 int32x4_t t32[2];
1458
1459 // stage 1
1460 // stage 2
1461 // stage 3
1462 // stage 4
1463 // stage 5
1464
1465 t32[0] = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
1466 t32[1] = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
1467 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
1468 vrshrn_n_s32(t32[1], INV_COS_BIT));
1469
1470 // stage 6
1471 // stage 7
1472 // stage 8
1473 // stage 9
1474
1475 out[0] = step1;
1476 out[1] = step1;
1477 out[2] = step1;
1478 out[3] = step1;
1479 out[4] = step1;
1480 out[5] = step1;
1481 out[6] = step1;
1482 out[7] = step1;
1483 out[8] = step1;
1484 out[9] = step1;
1485 out[10] = step1;
1486 out[11] = step1;
1487 out[12] = step1;
1488 out[13] = step1;
1489 out[14] = step1;
1490 out[15] = step1;
1491 out[16] = step1;
1492 out[17] = step1;
1493 out[18] = step1;
1494 out[19] = step1;
1495 out[20] = step1;
1496 out[21] = step1;
1497 out[22] = step1;
1498 out[23] = step1;
1499 out[24] = step1;
1500 out[25] = step1;
1501 out[26] = step1;
1502 out[27] = step1;
1503 out[28] = step1;
1504 out[29] = step1;
1505 out[30] = step1;
1506 out[31] = step1;
1507}
1508
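// Scalar model of the DC-only path above: with only in[0] nonzero, every
// stage collapses to one rounded multiply, and all 32 output rows are copies
// of the result. A sketch for illustration (hypothetical helper); the NEON
// code computes eight lanes at a time with vmull_n_s16() plus
// vrshrn_n_s32(), which rounds and narrows without saturating:
static INLINE int16_t idct_dc_only_scalar_sketch(int16_t dc, int32_t cospi32) {
  const int32_t t = (int32_t)dc * cospi32;
  return (int16_t)((t + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT);
}
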
1509 static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
1510 int8_t cos_bit) {
1511 const int32_t *cospi = cospi_arr(cos_bit);
1512 int16x8_t step1[32], step2[32];
1513 int32x4_t t32[16];
1514 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
1515 (int16_t)cospi[40], (int16_t)cospi[24]);
1516 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
1517 (int16_t)cospi[16], (int16_t)cospi[48]);
1518 const int16x4_t c2 =
1519 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
1520 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
1521 const int16x4_t c3 =
1522 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
1523 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
1524 // stage 1
1525 // stage 2
1526
1527 step2[0] = in[0];
1528 step2[4] = in[4];
1529 step2[8] = in[2];
1530 step2[12] = in[6];
1531
1532 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
1533 btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
1534 btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
1535 btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
1536
1537 // stage 3
1538 step1[0] = step2[0];
1539 step1[4] = step2[4];
1540
1541 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
1542 btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
1543
1544 step1[16] = step2[16];
1545 step1[17] = step2[16];
1546 step1[18] = step2[19];
1547 step1[19] = step2[19];
1548 step1[20] = step2[20];
1549 step1[21] = step2[20];
1550 step1[22] = step2[23];
1551 step1[23] = step2[23];
1552 step1[24] = step2[24];
1553 step1[25] = step2[24];
1554 step1[26] = step2[27];
1555 step1[27] = step2[27];
1556 step1[28] = step2[28];
1557 step1[29] = step2[28];
1558 step1[30] = step2[31];
1559 step1[31] = step2[31];
1560
1561 // stage 4
1562
1563 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
1564 btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
1565 btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
1566 btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
1567 btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
1568
1569 step2[0] = step1[0];
1570 step2[8] = step1[8];
1571 step2[9] = step1[8];
1572 step2[10] = step1[11];
1573 step2[11] = step1[11];
1574 step2[12] = step1[12];
1575 step2[13] = step1[12];
1576 step2[14] = step1[15];
1577 step2[15] = step1[15];
1578 step2[16] = step1[16];
1579 step2[19] = step1[19];
1580 step2[20] = step1[20];
1581 step2[23] = step1[23];
1582 step2[24] = step1[24];
1583 step2[27] = step1[27];
1584 step2[28] = step1[28];
1585 step2[31] = step1[31];
1586
1587 // stage 5
1588
1589 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
1590 t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
1591 step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
1592 vrshrn_n_s32(t32[1], INV_COS_BIT));
1593
1594 btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
1595 btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
1596
1597 step1[4] = step2[4];
1598 step1[5] = step2[4];
1599 step1[6] = step2[7];
1600 step1[7] = step2[7];
1601 step1[8] = step2[8];
1602 step1[11] = step2[11];
1603 step1[12] = step2[12];
1604 step1[15] = step2[15];
1605 step1[16] = vqaddq_s16(step2[16], step2[19]);
1606 step1[17] = vqaddq_s16(step2[17], step2[18]);
1607 step1[18] = vqsubq_s16(step2[17], step2[18]);
1608 step1[19] = vqsubq_s16(step2[16], step2[19]);
1609 step1[20] = vqsubq_s16(step2[23], step2[20]);
1610 step1[21] = vqsubq_s16(step2[22], step2[21]);
1611 step1[22] = vqaddq_s16(step2[22], step2[21]);
1612 step1[23] = vqaddq_s16(step2[23], step2[20]);
1613 step1[24] = vqaddq_s16(step2[24], step2[27]);
1614 step1[25] = vqaddq_s16(step2[25], step2[26]);
1615 step1[26] = vqsubq_s16(step2[25], step2[26]);
1616 step1[27] = vqsubq_s16(step2[24], step2[27]);
1617 step1[28] = vqsubq_s16(step2[31], step2[28]);
1618 step1[29] = vqsubq_s16(step2[30], step2[29]);
1619 step1[30] = vqaddq_s16(step2[30], step2[29]);
1620 step1[31] = vqaddq_s16(step2[31], step2[28]);
1621
1622 // stage 6
1623
1624 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
1625 btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
1626 btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
1627 btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
1628 btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
1629
1630 step2[0] = step1[0];
1631 step2[1] = step1[0];
1632 step2[2] = step1[0];
1633 step2[3] = step1[0];
1634 step2[4] = step1[4];
1635 step2[7] = step1[7];
1636 step2[8] = vqaddq_s16(step1[8], step1[11]);
1637 step2[9] = vqaddq_s16(step1[9], step1[10]);
1638 step2[10] = vqsubq_s16(step1[9], step1[10]);
1639 step2[11] = vqsubq_s16(step1[8], step1[11]);
1640 step2[12] = vqsubq_s16(step1[15], step1[12]);
1641 step2[13] = vqsubq_s16(step1[14], step1[13]);
1642 step2[14] = vqaddq_s16(step1[14], step1[13]);
1643 step2[15] = vqaddq_s16(step1[15], step1[12]);
1644 step2[16] = step1[16];
1645 step2[17] = step1[17];
1646 step2[22] = step1[22];
1647 step2[23] = step1[23];
1648 step2[24] = step1[24];
1649 step2[25] = step1[25];
1650 step2[30] = step1[30];
1651 step2[31] = step1[31];
1652
1653 // stage 7
1654
1655 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
1656 btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
1657
1658 step1[0] = vqaddq_s16(step2[0], step2[7]);
1659 step1[1] = vqaddq_s16(step2[1], step2[6]);
1660 step1[2] = vqaddq_s16(step2[2], step2[5]);
1661 step1[3] = vqaddq_s16(step2[3], step2[4]);
1662 step1[4] = vqsubq_s16(step2[3], step2[4]);
1663 step1[5] = vqsubq_s16(step2[2], step2[5]);
1664 step1[6] = vqsubq_s16(step2[1], step2[6]);
1665 step1[7] = vqsubq_s16(step2[0], step2[7]);
1666 step1[8] = step2[8];
1667 step1[9] = step2[9];
1668 step1[14] = step2[14];
1669 step1[15] = step2[15];
1670 step1[16] = vqaddq_s16(step2[16], step2[23]);
1671 step1[17] = vqaddq_s16(step2[17], step2[22]);
1672 step1[18] = vqaddq_s16(step2[18], step2[21]);
1673 step1[19] = vqaddq_s16(step2[19], step2[20]);
1674 step1[20] = vqsubq_s16(step2[19], step2[20]);
1675 step1[21] = vqsubq_s16(step2[18], step2[21]);
1676 step1[22] = vqsubq_s16(step2[17], step2[22]);
1677 step1[23] = vqsubq_s16(step2[16], step2[23]);
1678 step1[24] = vqsubq_s16(step2[31], step2[24]);
1679 step1[25] = vqsubq_s16(step2[30], step2[25]);
1680 step1[26] = vqsubq_s16(step2[29], step2[26]);
1681 step1[27] = vqsubq_s16(step2[28], step2[27]);
1682 step1[28] = vqaddq_s16(step2[27], step2[28]);
1683 step1[29] = vqaddq_s16(step2[26], step2[29]);
1684 step1[30] = vqaddq_s16(step2[25], step2[30]);
1685 step1[31] = vqaddq_s16(step2[24], step2[31]);
1686
1687 // stage 8
1688
1689 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
1690 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
1691 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
1692 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
1693
1694 step2[0] = vqaddq_s16(step1[0], step1[15]);
1695 step2[1] = vqaddq_s16(step1[1], step1[14]);
1696 step2[2] = vqaddq_s16(step1[2], step1[13]);
1697 step2[3] = vqaddq_s16(step1[3], step1[12]);
1698 step2[4] = vqaddq_s16(step1[4], step1[11]);
1699 step2[5] = vqaddq_s16(step1[5], step1[10]);
1700 step2[6] = vqaddq_s16(step1[6], step1[9]);
1701 step2[7] = vqaddq_s16(step1[7], step1[8]);
1702 step2[8] = vqsubq_s16(step1[7], step1[8]);
1703 step2[9] = vqsubq_s16(step1[6], step1[9]);
1704 step2[10] = vqsubq_s16(step1[5], step1[10]);
1705 step2[11] = vqsubq_s16(step1[4], step1[11]);
1706 step2[12] = vqsubq_s16(step1[3], step1[12]);
1707 step2[13] = vqsubq_s16(step1[2], step1[13]);
1708 step2[14] = vqsubq_s16(step1[1], step1[14]);
1709 step2[15] = vqsubq_s16(step1[0], step1[15]);
1710 step2[16] = step1[16];
1711 step2[17] = step1[17];
1712 step2[18] = step1[18];
1713 step2[19] = step1[19];
1714 step2[28] = step1[28];
1715 step2[29] = step1[29];
1716 step2[30] = step1[30];
1717 step2[31] = step1[31];
1718
1719 // stage 9
1720
1721 out[0] = vqaddq_s16(step2[0], step2[31]);
1722 out[1] = vqaddq_s16(step2[1], step2[30]);
1723 out[2] = vqaddq_s16(step2[2], step2[29]);
1724 out[3] = vqaddq_s16(step2[3], step2[28]);
1725 out[4] = vqaddq_s16(step2[4], step2[27]);
1726 out[5] = vqaddq_s16(step2[5], step2[26]);
1727 out[6] = vqaddq_s16(step2[6], step2[25]);
1728 out[7] = vqaddq_s16(step2[7], step2[24]);
1729 out[8] = vqaddq_s16(step2[8], step2[23]);
1730 out[9] = vqaddq_s16(step2[9], step2[22]);
1731 out[10] = vqaddq_s16(step2[10], step2[21]);
1732 out[11] = vqaddq_s16(step2[11], step2[20]);
1733 out[12] = vqaddq_s16(step2[12], step2[19]);
1734 out[13] = vqaddq_s16(step2[13], step2[18]);
1735 out[14] = vqaddq_s16(step2[14], step2[17]);
1736 out[15] = vqaddq_s16(step2[15], step2[16]);
1737 out[16] = vqsubq_s16(step2[15], step2[16]);
1738 out[17] = vqsubq_s16(step2[14], step2[17]);
1739 out[18] = vqsubq_s16(step2[13], step2[18]);
1740 out[19] = vqsubq_s16(step2[12], step2[19]);
1741 out[20] = vqsubq_s16(step2[11], step2[20]);
1742 out[21] = vqsubq_s16(step2[10], step2[21]);
1743 out[22] = vqsubq_s16(step2[9], step2[22]);
1744 out[23] = vqsubq_s16(step2[8], step2[23]);
1745 out[24] = vqsubq_s16(step2[7], step2[24]);
1746 out[25] = vqsubq_s16(step2[6], step2[25]);
1747 out[26] = vqsubq_s16(step2[5], step2[26]);
1748 out[27] = vqsubq_s16(step2[4], step2[27]);
1749 out[28] = vqsubq_s16(step2[3], step2[28]);
1750 out[29] = vqsubq_s16(step2[2], step2[29]);
1751 out[30] = vqsubq_s16(step2[1], step2[30]);
1752 out[31] = vqsubq_s16(step2[0], step2[31]);
1753}
1754
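// In the low8 variant above only in[0..7] can be nonzero, so any butterfly
// whose second operand is a zero row degenerates into two rounded scalings of
// the surviving row; that is the shortcut the single-input btf_16_neon()
// calls express. A scalar sketch of that assumed behavior (hypothetical
// helper, shown only to document the simplification):
static INLINE void btf_16_zero_input_scalar_sketch(int16_t x, int32_t w0,
                                                   int32_t w1, int16_t *o0,
                                                   int16_t *o1) {
  const int32_t r = 1 << (INV_COS_BIT - 1);
  *o0 = (int16_t)(((int32_t)x * w0 + r) >> INV_COS_BIT);
  *o1 = (int16_t)(((int32_t)x * w1 + r) >> INV_COS_BIT);
}
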
1755 static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
1756 int8_t cos_bit) {
1757 const int32_t *cospi = cospi_arr(cos_bit);
1758 int16x8_t step1[32], step2[32];
1759 int32x4_t t32[16];
1760 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
1761 (int16_t)cospi[40], (int16_t)cospi[24]);
1762 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
1763 (int16_t)cospi[16], (int16_t)cospi[48]);
1764 const int16x4_t c2 =
1765 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
1766 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
1767 const int16x4_t c3 =
1768 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
1769 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
1770
1771 // stage 1
1772 // stage 2
1773
1774 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
1775 btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
1776 btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
1777 btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
1778 btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
1779 btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
1780 btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
1781 btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
1782
1783 step2[0] = in[0];
1784 step2[2] = in[8];
1785 step2[4] = in[4];
1786 step2[6] = in[12];
1787 step2[8] = in[2];
1788 step2[10] = in[10];
1789 step2[12] = in[6];
1790 step2[14] = in[14];
1791
1792 // stage 3
1793
1794 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
1795 btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
1796 btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
1797 btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
1798
1799 step1[0] = step2[0];
1800 step1[2] = step2[2];
1801 step1[4] = step2[4];
1802 step1[6] = step2[6];
1803 step1[16] = vqaddq_s16(step2[16], step2[17]);
1804 step1[17] = vqsubq_s16(step2[16], step2[17]);
1805 step1[18] = vqsubq_s16(step2[19], step2[18]);
1806 step1[19] = vqaddq_s16(step2[19], step2[18]);
1807 step1[20] = vqaddq_s16(step2[20], step2[21]);
1808 step1[21] = vqsubq_s16(step2[20], step2[21]);
1809 step1[22] = vqsubq_s16(step2[23], step2[22]);
1810 step1[23] = vqaddq_s16(step2[23], step2[22]);
1811 step1[24] = vqaddq_s16(step2[24], step2[25]);
1812 step1[25] = vqsubq_s16(step2[24], step2[25]);
1813 step1[26] = vqsubq_s16(step2[27], step2[26]);
1814 step1[27] = vqaddq_s16(step2[27], step2[26]);
1815 step1[28] = vqaddq_s16(step2[28], step2[29]);
1816 step1[29] = vqsubq_s16(step2[28], step2[29]);
1817 step1[30] = vqsubq_s16(step2[31], step2[30]);
1818 step1[31] = vqaddq_s16(step2[31], step2[30]);
1819
1820 // stage 4
1821
1822 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
1823 btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
1824 btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
1825 btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
1826 btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
1827 btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
1828
1829 step2[0] = step1[0];
1830 step2[2] = step1[2];
1831 step2[8] = vqaddq_s16(step1[8], step1[9]);
1832 step2[9] = vqsubq_s16(step1[8], step1[9]);
1833 step2[10] = vqsubq_s16(step1[11], step1[10]);
1834 step2[11] = vqaddq_s16(step1[11], step1[10]);
1835 step2[12] = vqaddq_s16(step1[12], step1[13]);
1836 step2[13] = vqsubq_s16(step1[12], step1[13]);
1837 step2[14] = vqsubq_s16(step1[15], step1[14]);
1838 step2[15] = vqaddq_s16(step1[15], step1[14]);
1839 step2[16] = step1[16];
1840 step2[19] = step1[19];
1841 step2[20] = step1[20];
1842 step2[23] = step1[23];
1843 step2[24] = step1[24];
1844 step2[27] = step1[27];
1845 step2[28] = step1[28];
1846 step2[31] = step1[31];
1847
1848 // stage 5
1849
1850 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
1851 t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
1852
1853 step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
1854 vrshrn_n_s32(t32[1], INV_COS_BIT));
1855
1856 btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
1857 btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
1858 btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
1859
1860 step1[4] = vqaddq_s16(step2[4], step2[5]);
1861 step1[5] = vqsubq_s16(step2[4], step2[5]);
1862 step1[6] = vqsubq_s16(step2[7], step2[6]);
1863 step1[7] = vqaddq_s16(step2[7], step2[6]);
1864 step1[8] = step2[8];
1865 step1[11] = step2[11];
1866 step1[12] = step2[12];
1867 step1[15] = step2[15];
1868 step1[16] = vqaddq_s16(step2[16], step2[19]);
1869 step1[17] = vqaddq_s16(step2[17], step2[18]);
1870 step1[18] = vqsubq_s16(step2[17], step2[18]);
1871 step1[19] = vqsubq_s16(step2[16], step2[19]);
1872 step1[20] = vqsubq_s16(step2[23], step2[20]);
1873 step1[21] = vqsubq_s16(step2[22], step2[21]);
1874 step1[22] = vqaddq_s16(step2[22], step2[21]);
1875 step1[23] = vqaddq_s16(step2[23], step2[20]);
1876 step1[24] = vqaddq_s16(step2[24], step2[27]);
1877 step1[25] = vqaddq_s16(step2[25], step2[26]);
1878 step1[26] = vqsubq_s16(step2[25], step2[26]);
1879 step1[27] = vqsubq_s16(step2[24], step2[27]);
1880 step1[28] = vqsubq_s16(step2[31], step2[28]);
1881 step1[29] = vqsubq_s16(step2[30], step2[29]);
1882 step1[30] = vqaddq_s16(step2[30], step2[29]);
1883 step1[31] = vqaddq_s16(step2[31], step2[28]);
1884
1885 // stage 6
1886
1887 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
1888 btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
1889 btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
1890 btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
1891 btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
1892
1893 step2[0] = vqaddq_s16(step1[0], step1[3]);
1894 step2[1] = vqaddq_s16(step1[0], step1[2]);
1895 step2[2] = vqsubq_s16(step1[0], step1[2]);
1896 step2[3] = vqsubq_s16(step1[0], step1[3]);
1897 step2[4] = step1[4];
1898 step2[7] = step1[7];
1899 step2[8] = vqaddq_s16(step1[8], step1[11]);
1900 step2[9] = vqaddq_s16(step1[9], step1[10]);
1901 step2[10] = vqsubq_s16(step1[9], step1[10]);
1902 step2[11] = vqsubq_s16(step1[8], step1[11]);
1903 step2[12] = vqsubq_s16(step1[15], step1[12]);
1904 step2[13] = vqsubq_s16(step1[14], step1[13]);
1905 step2[14] = vqaddq_s16(step1[14], step1[13]);
1906 step2[15] = vqaddq_s16(step1[15], step1[12]);
1907 step2[16] = step1[16];
1908 step2[17] = step1[17];
1909 step2[22] = step1[22];
1910 step2[23] = step1[23];
1911 step2[24] = step1[24];
1912 step2[25] = step1[25];
1913 step2[30] = step1[30];
1914 step2[31] = step1[31];
1915
1916 // stage 7
1917
1918 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
1919 btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
1920
1921 step1[0] = vqaddq_s16(step2[0], step2[7]);
1922 step1[1] = vqaddq_s16(step2[1], step2[6]);
1923 step1[2] = vqaddq_s16(step2[2], step2[5]);
1924 step1[3] = vqaddq_s16(step2[3], step2[4]);
1925 step1[4] = vqsubq_s16(step2[3], step2[4]);
1926 step1[5] = vqsubq_s16(step2[2], step2[5]);
1927 step1[6] = vqsubq_s16(step2[1], step2[6]);
1928 step1[7] = vqsubq_s16(step2[0], step2[7]);
1929 step1[8] = step2[8];
1930 step1[9] = step2[9];
1931 step1[14] = step2[14];
1932 step1[15] = step2[15];
1933 step1[16] = vqaddq_s16(step2[16], step2[23]);
1934 step1[17] = vqaddq_s16(step2[17], step2[22]);
1935 step1[18] = vqaddq_s16(step2[18], step2[21]);
1936 step1[19] = vqaddq_s16(step2[19], step2[20]);
1937 step1[20] = vqsubq_s16(step2[19], step2[20]);
1938 step1[21] = vqsubq_s16(step2[18], step2[21]);
1939 step1[22] = vqsubq_s16(step2[17], step2[22]);
1940 step1[23] = vqsubq_s16(step2[16], step2[23]);
1941 step1[24] = vqsubq_s16(step2[31], step2[24]);
1942 step1[25] = vqsubq_s16(step2[30], step2[25]);
1943 step1[26] = vqsubq_s16(step2[29], step2[26]);
1944 step1[27] = vqsubq_s16(step2[28], step2[27]);
1945 step1[28] = vqaddq_s16(step2[27], step2[28]);
1946 step1[29] = vqaddq_s16(step2[26], step2[29]);
1947 step1[30] = vqaddq_s16(step2[25], step2[30]);
1948 step1[31] = vqaddq_s16(step2[24], step2[31]);
1949
1950 // stage 8
1951
1952 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
1953 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
1954 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
1955 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
1956
1957 step2[0] = vqaddq_s16(step1[0], step1[15]);
1958 step2[1] = vqaddq_s16(step1[1], step1[14]);
1959 step2[2] = vqaddq_s16(step1[2], step1[13]);
1960 step2[3] = vqaddq_s16(step1[3], step1[12]);
1961 step2[4] = vqaddq_s16(step1[4], step1[11]);
1962 step2[5] = vqaddq_s16(step1[5], step1[10]);
1963 step2[6] = vqaddq_s16(step1[6], step1[9]);
1964 step2[7] = vqaddq_s16(step1[7], step1[8]);
1965 step2[8] = vqsubq_s16(step1[7], step1[8]);
1966 step2[9] = vqsubq_s16(step1[6], step1[9]);
1967 step2[10] = vqsubq_s16(step1[5], step1[10]);
1968 step2[11] = vqsubq_s16(step1[4], step1[11]);
1969 step2[12] = vqsubq_s16(step1[3], step1[12]);
1970 step2[13] = vqsubq_s16(step1[2], step1[13]);
1971 step2[14] = vqsubq_s16(step1[1], step1[14]);
1972 step2[15] = vqsubq_s16(step1[0], step1[15]);
1973 step2[16] = step1[16];
1974 step2[17] = step1[17];
1975 step2[18] = step1[18];
1976 step2[19] = step1[19];
1977 step2[28] = step1[28];
1978 step2[29] = step1[29];
1979 step2[30] = step1[30];
1980 step2[31] = step1[31];
1981
1982 // stage 9
1983
1984 out[0] = vqaddq_s16(step2[0], step2[31]);
1985 out[1] = vqaddq_s16(step2[1], step2[30]);
1986 out[2] = vqaddq_s16(step2[2], step2[29]);
1987 out[3] = vqaddq_s16(step2[3], step2[28]);
1988 out[4] = vqaddq_s16(step2[4], step2[27]);
1989 out[5] = vqaddq_s16(step2[5], step2[26]);
1990 out[6] = vqaddq_s16(step2[6], step2[25]);
1991 out[7] = vqaddq_s16(step2[7], step2[24]);
1992 out[8] = vqaddq_s16(step2[8], step2[23]);
1993 out[9] = vqaddq_s16(step2[9], step2[22]);
1994 out[10] = vqaddq_s16(step2[10], step2[21]);
1995 out[11] = vqaddq_s16(step2[11], step2[20]);
1996 out[12] = vqaddq_s16(step2[12], step2[19]);
1997 out[13] = vqaddq_s16(step2[13], step2[18]);
1998 out[14] = vqaddq_s16(step2[14], step2[17]);
1999 out[15] = vqaddq_s16(step2[15], step2[16]);
2000 out[16] = vqsubq_s16(step2[15], step2[16]);
2001 out[17] = vqsubq_s16(step2[14], step2[17]);
2002 out[18] = vqsubq_s16(step2[13], step2[18]);
2003 out[19] = vqsubq_s16(step2[12], step2[19]);
2004 out[20] = vqsubq_s16(step2[11], step2[20]);
2005 out[21] = vqsubq_s16(step2[10], step2[21]);
2006 out[22] = vqsubq_s16(step2[9], step2[22]);
2007 out[23] = vqsubq_s16(step2[8], step2[23]);
2008 out[24] = vqsubq_s16(step2[7], step2[24]);
2009 out[25] = vqsubq_s16(step2[6], step2[25]);
2010 out[26] = vqsubq_s16(step2[5], step2[26]);
2011 out[27] = vqsubq_s16(step2[4], step2[27]);
2012 out[28] = vqsubq_s16(step2[3], step2[28]);
2013 out[29] = vqsubq_s16(step2[2], step2[29]);
2014 out[30] = vqsubq_s16(step2[1], step2[30]);
2015 out[31] = vqsubq_s16(step2[0], step2[31]);
2016}
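// The btf_16_lane_*_neon() calls above all perform the same two-input
// rotation; only the pair of lanes read from the coefficient vector changes.
// Assuming the helpers follow the usual widen, multiply-accumulate, then
// round-shift-narrow pattern, the per-lane arithmetic with extracted lane
// values (ca, cb) is modeled by this sketch (hypothetical helper, for
// illustration only):
static INLINE void btf_16_rotation_scalar_sketch(int16_t a, int16_t b,
                                                 int32_t ca, int32_t cb,
                                                 int16_t *o0, int16_t *o1) {
  const int32_t r = 1 << (INV_COS_BIT - 1);
  *o0 = (int16_t)(((int32_t)a * ca + (int32_t)b * cb + r) >> INV_COS_BIT);
  *o1 = (int16_t)(((int32_t)a * cb - (int32_t)b * ca + r) >> INV_COS_BIT);
}
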
2017 static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
2018 int8_t cos_bit) {
2019 const int32_t *cospi = cospi_arr(cos_bit);
2020 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
2021 (int16_t)cospi[16], (int16_t)cospi[48]);
2022
2023 btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
2024 btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
2025 btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]);
2026 btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]);
2027
2028 step1[0] = vqaddq_s16(step2[0], step2[15]);
2029 step1[1] = vqaddq_s16(step2[1], step2[14]);
2030 step1[2] = vqaddq_s16(step2[2], step2[13]);
2031 step1[3] = vqaddq_s16(step2[3], step2[12]);
2032 step1[4] = vqaddq_s16(step2[4], step2[11]);
2033 step1[5] = vqaddq_s16(step2[5], step2[10]);
2034 step1[6] = vqaddq_s16(step2[6], step2[9]);
2035 step1[7] = vqaddq_s16(step2[7], step2[8]);
2036 step1[8] = vqsubq_s16(step2[7], step2[8]);
2037 step1[9] = vqsubq_s16(step2[6], step2[9]);
2038 step1[10] = vqsubq_s16(step2[5], step2[10]);
2039 step1[11] = vqsubq_s16(step2[4], step2[11]);
2040 step1[12] = vqsubq_s16(step2[3], step2[12]);
2041 step1[13] = vqsubq_s16(step2[2], step2[13]);
2042 step1[14] = vqsubq_s16(step2[1], step2[14]);
2043 step1[15] = vqsubq_s16(step2[0], step2[15]);
2044 step1[16] = step2[16];
2045 step1[17] = step2[17];
2046 step1[18] = step2[18];
2047 step1[19] = step2[19];
2048 step1[28] = step2[28];
2049 step1[29] = step2[29];
2050 step1[30] = step2[30];
2051 step1[31] = step2[31];
2052 step1[32] = vqaddq_s16(step2[32], step2[47]);
2053 step1[33] = vqaddq_s16(step2[33], step2[46]);
2054 step1[34] = vqaddq_s16(step2[34], step2[45]);
2055 step1[35] = vqaddq_s16(step2[35], step2[44]);
2056 step1[36] = vqaddq_s16(step2[36], step2[43]);
2057 step1[37] = vqaddq_s16(step2[37], step2[42]);
2058 step1[38] = vqaddq_s16(step2[38], step2[41]);
2059 step1[39] = vqaddq_s16(step2[39], step2[40]);
2060 step1[40] = vqsubq_s16(step2[39], step2[40]);
2061 step1[41] = vqsubq_s16(step2[38], step2[41]);
2062 step1[42] = vqsubq_s16(step2[37], step2[42]);
2063 step1[43] = vqsubq_s16(step2[36], step2[43]);
2064 step1[44] = vqsubq_s16(step2[35], step2[44]);
2065 step1[45] = vqsubq_s16(step2[34], step2[45]);
2066 step1[46] = vqsubq_s16(step2[33], step2[46]);
2067 step1[47] = vqsubq_s16(step2[32], step2[47]);
2068 step1[48] = vqsubq_s16(step2[63], step2[48]);
2069 step1[49] = vqsubq_s16(step2[62], step2[49]);
2070 step1[50] = vqsubq_s16(step2[61], step2[50]);
2071 step1[51] = vqsubq_s16(step2[60], step2[51]);
2072 step1[52] = vqsubq_s16(step2[59], step2[52]);
2073 step1[53] = vqsubq_s16(step2[58], step2[53]);
2074 step1[54] = vqsubq_s16(step2[57], step2[54]);
2075 step1[55] = vqsubq_s16(step2[56], step2[55]);
2076 step1[56] = vqaddq_s16(step2[56], step2[55]);
2077 step1[57] = vqaddq_s16(step2[57], step2[54]);
2078 step1[58] = vqaddq_s16(step2[58], step2[53]);
2079 step1[59] = vqaddq_s16(step2[59], step2[52]);
2080 step1[60] = vqaddq_s16(step2[60], step2[51]);
2081 step1[61] = vqaddq_s16(step2[61], step2[50]);
2082 step1[62] = vqaddq_s16(step2[62], step2[49]);
2083 step1[63] = vqaddq_s16(step2[63], step2[48]);
2084}
2085
2086static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
2087 int8_t cos_bit) {
2088 const int32_t *cospi = cospi_arr(cos_bit);
2089 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
2090 (int16_t)cospi[16], (int16_t)cospi[48]);
2091
2092 btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
2093 btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
2094 btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]);
2095 btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]);
2096 btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]);
2097 btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]);
2098 btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]);
2099 btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]);
2100
2101 step2[0] = vqaddq_s16(step1[0], step1[31]);
2102 step2[1] = vqaddq_s16(step1[1], step1[30]);
2103 step2[2] = vqaddq_s16(step1[2], step1[29]);
2104 step2[3] = vqaddq_s16(step1[3], step1[28]);
2105 step2[4] = vqaddq_s16(step1[4], step1[27]);
2106 step2[5] = vqaddq_s16(step1[5], step1[26]);
2107 step2[6] = vqaddq_s16(step1[6], step1[25]);
2108 step2[7] = vqaddq_s16(step1[7], step1[24]);
2109 step2[8] = vqaddq_s16(step1[8], step1[23]);
2110 step2[9] = vqaddq_s16(step1[9], step1[22]);
2111 step2[10] = vqaddq_s16(step1[10], step1[21]);
2112 step2[11] = vqaddq_s16(step1[11], step1[20]);
2113 step2[12] = vqaddq_s16(step1[12], step1[19]);
2114 step2[13] = vqaddq_s16(step1[13], step1[18]);
2115 step2[14] = vqaddq_s16(step1[14], step1[17]);
2116 step2[15] = vqaddq_s16(step1[15], step1[16]);
2117 step2[16] = vqsubq_s16(step1[15], step1[16]);
2118 step2[17] = vqsubq_s16(step1[14], step1[17]);
2119 step2[18] = vqsubq_s16(step1[13], step1[18]);
2120 step2[19] = vqsubq_s16(step1[12], step1[19]);
2121 step2[20] = vqsubq_s16(step1[11], step1[20]);
2122 step2[21] = vqsubq_s16(step1[10], step1[21]);
2123 step2[22] = vqsubq_s16(step1[9], step1[22]);
2124 step2[23] = vqsubq_s16(step1[8], step1[23]);
2125 step2[24] = vqsubq_s16(step1[7], step1[24]);
2126 step2[25] = vqsubq_s16(step1[6], step1[25]);
2127 step2[26] = vqsubq_s16(step1[5], step1[26]);
2128 step2[27] = vqsubq_s16(step1[4], step1[27]);
2129 step2[28] = vqsubq_s16(step1[3], step1[28]);
2130 step2[29] = vqsubq_s16(step1[2], step1[29]);
2131 step2[30] = vqsubq_s16(step1[1], step1[30]);
2132 step2[31] = vqsubq_s16(step1[0], step1[31]);
2133 step2[32] = step1[32];
2134 step2[33] = step1[33];
2135 step2[34] = step1[34];
2136 step2[35] = step1[35];
2137 step2[36] = step1[36];
2138 step2[37] = step1[37];
2139 step2[38] = step1[38];
2140 step2[39] = step1[39];
2141 step2[56] = step1[56];
2142 step2[57] = step1[57];
2143 step2[58] = step1[58];
2144 step2[59] = step1[59];
2145 step2[60] = step1[60];
2146 step2[61] = step1[61];
2147 step2[62] = step1[62];
2148 step2[63] = step1[63];
2149}
2150
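// Usage sketch: idct64_stage9_neon() and idct64_stage10_neon() factor out the
// rounds shared by every idct64 variant, so each variant can finish with
//   idct64_stage9_neon(step2, step1, cos_bit);
//   idct64_stage10_neon(step1, step2, cos_bit);
// followed by the stage-11 output butterflies, as idct64_low32_neon() below
// does.
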
2151 static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
2152 int8_t cos_bit) {
2153 const int32_t *cospi = cospi_arr(cos_bit);
2154 int16x8_t step2[64], step1[64];
2155 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
2156 (int16_t)cospi[36], (int16_t)cospi[28]);
2157 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
2158 (int16_t)cospi[52], (int16_t)cospi[12]);
2159 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
2160 (int16_t)cospi[40], (int16_t)cospi[24]);
2161 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
2162 (int16_t)cospi[16], (int16_t)cospi[48]);
2163 const int16x4_t c4 =
2164 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
2165 (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
2166 const int16x4_t c5 =
2167 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
2168 (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
2169 const int16x4_t c6 =
2170 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
2171 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
2172 const int16x4_t c7 =
2173 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
2174 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
2175
2176 // stage 1
2177 // stage 2
2178
2179 step2[0] = in[0];
2180 step2[2] = in[16];
2181 step2[4] = in[8];
2182 step2[6] = in[24];
2183 step2[8] = in[4];
2184 step2[10] = in[20];
2185 step2[12] = in[12];
2186 step2[14] = in[28];
2187 step2[16] = in[2];
2188 step2[18] = in[18];
2189 step2[20] = in[10];
2190 step2[22] = in[26];
2191 step2[24] = in[6];
2192 step2[26] = in[22];
2193 step2[28] = in[14];
2194 step2[30] = in[30];
2195
2196 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
2197 btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
2198 btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
2199 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
2200 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
2201 btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
2202 btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
2203 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
2204 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
2205 btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
2206 btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
2207 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
2208 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
2209 btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
2210 btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
2211 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
2212
2213 // stage 3
2214
2215 step1[0] = step2[0];
2216 step1[2] = step2[2];
2217 step1[4] = step2[4];
2218 step1[6] = step2[6];
2219 step1[8] = step2[8];
2220 step1[10] = step2[10];
2221 step1[12] = step2[12];
2222 step1[14] = step2[14];
2223
2224 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
2225 btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
2226 btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
2227 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
2228 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
2229 btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
2230 btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
2231 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
2232
2233 step1[32] = vqaddq_s16(step2[32], step2[33]);
2234 step1[33] = vqsubq_s16(step2[32], step2[33]);
2235 step1[34] = vqsubq_s16(step2[35], step2[34]);
2236 step1[35] = vqaddq_s16(step2[35], step2[34]);
2237 step1[36] = vqaddq_s16(step2[36], step2[37]);
2238 step1[37] = vqsubq_s16(step2[36], step2[37]);
2239 step1[38] = vqsubq_s16(step2[39], step2[38]);
2240 step1[39] = vqaddq_s16(step2[39], step2[38]);
2241 step1[40] = vqaddq_s16(step2[40], step2[41]);
2242 step1[41] = vqsubq_s16(step2[40], step2[41]);
2243 step1[42] = vqsubq_s16(step2[43], step2[42]);
2244 step1[43] = vqaddq_s16(step2[43], step2[42]);
2245 step1[44] = vqaddq_s16(step2[44], step2[45]);
2246 step1[45] = vqsubq_s16(step2[44], step2[45]);
2247 step1[46] = vqsubq_s16(step2[47], step2[46]);
2248 step1[47] = vqaddq_s16(step2[47], step2[46]);
2249 step1[48] = vqaddq_s16(step2[48], step2[49]);
2250 step1[49] = vqsubq_s16(step2[48], step2[49]);
2251 step1[50] = vqsubq_s16(step2[51], step2[50]);
2252 step1[51] = vqaddq_s16(step2[51], step2[50]);
2253 step1[52] = vqaddq_s16(step2[52], step2[53]);
2254 step1[53] = vqsubq_s16(step2[52], step2[53]);
2255 step1[54] = vqsubq_s16(step2[55], step2[54]);
2256 step1[55] = vqaddq_s16(step2[55], step2[54]);
2257 step1[56] = vqaddq_s16(step2[56], step2[57]);
2258 step1[57] = vqsubq_s16(step2[56], step2[57]);
2259 step1[58] = vqsubq_s16(step2[59], step2[58]);
2260 step1[59] = vqaddq_s16(step2[59], step2[58]);
2261 step1[60] = vqaddq_s16(step2[60], step2[61]);
2262 step1[61] = vqsubq_s16(step2[60], step2[61]);
2263 step1[62] = vqsubq_s16(step2[63], step2[62]);
2264 step1[63] = vqaddq_s16(step2[63], step2[62]);
2265
2266 // stage 4
2267
2268 step2[0] = step1[0];
2269 step2[2] = step1[2];
2270 step2[4] = step1[4];
2271 step2[6] = step1[6];
2272
2273 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
2274 btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
2275 btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
2276 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
2277 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
2278 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
2279 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
2280 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
2281 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
2282 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
2283 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
2284 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
2285
2286 step2[16] = vqaddq_s16(step1[16], step1[17]);
2287 step2[17] = vqsubq_s16(step1[16], step1[17]);
2288 step2[18] = vqsubq_s16(step1[19], step1[18]);
2289 step2[19] = vqaddq_s16(step1[19], step1[18]);
2290 step2[20] = vqaddq_s16(step1[20], step1[21]);
2291 step2[21] = vqsubq_s16(step1[20], step1[21]);
2292 step2[22] = vqsubq_s16(step1[23], step1[22]);
2293 step2[23] = vqaddq_s16(step1[23], step1[22]);
2294 step2[24] = vqaddq_s16(step1[24], step1[25]);
2295 step2[25] = vqsubq_s16(step1[24], step1[25]);
2296 step2[26] = vqsubq_s16(step1[27], step1[26]);
2297 step2[27] = vqaddq_s16(step1[27], step1[26]);
2298 step2[28] = vqaddq_s16(step1[28], step1[29]);
2299 step2[29] = vqsubq_s16(step1[28], step1[29]);
2300 step2[30] = vqsubq_s16(step1[31], step1[30]);
2301 step2[31] = vqaddq_s16(step1[31], step1[30]);
2302 step2[32] = step1[32];
2303 step2[35] = step1[35];
2304 step2[36] = step1[36];
2305 step2[39] = step1[39];
2306 step2[40] = step1[40];
2307 step2[43] = step1[43];
2308 step2[44] = step1[44];
2309 step2[47] = step1[47];
2310 step2[48] = step1[48];
2311 step2[51] = step1[51];
2312 step2[52] = step1[52];
2313 step2[55] = step1[55];
2314 step2[56] = step1[56];
2315 step2[59] = step1[59];
2316 step2[60] = step1[60];
2317 step2[63] = step1[63];
2318
2319 // stage 5
2320
2321 step1[0] = step2[0];
2322 step1[2] = step2[2];
2323
2324 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
2325 btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
2326 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
2327 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
2328 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
2329 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
2330
2331 step1[8] = vqaddq_s16(step2[8], step2[9]);
2332 step1[9] = vqsubq_s16(step2[8], step2[9]);
2333 step1[10] = vqsubq_s16(step2[11], step2[10]);
2334 step1[11] = vqaddq_s16(step2[11], step2[10]);
2335 step1[12] = vqaddq_s16(step2[12], step2[13]);
2336 step1[13] = vqsubq_s16(step2[12], step2[13]);
2337 step1[14] = vqsubq_s16(step2[15], step2[14]);
2338 step1[15] = vqaddq_s16(step2[15], step2[14]);
2339 step1[16] = step2[16];
2340 step1[19] = step2[19];
2341 step1[20] = step2[20];
2342 step1[23] = step2[23];
2343 step1[24] = step2[24];
2344 step1[27] = step2[27];
2345 step1[28] = step2[28];
2346 step1[31] = step2[31];
2347 step1[32] = vqaddq_s16(step2[32], step2[35]);
2348 step1[33] = vqaddq_s16(step2[33], step2[34]);
2349 step1[34] = vqsubq_s16(step2[33], step2[34]);
2350 step1[35] = vqsubq_s16(step2[32], step2[35]);
2351 step1[36] = vqsubq_s16(step2[39], step2[36]);
2352 step1[37] = vqsubq_s16(step2[38], step2[37]);
2353 step1[38] = vqaddq_s16(step2[38], step2[37]);
2354 step1[39] = vqaddq_s16(step2[39], step2[36]);
2355 step1[40] = vqaddq_s16(step2[40], step2[43]);
2356 step1[41] = vqaddq_s16(step2[41], step2[42]);
2357 step1[42] = vqsubq_s16(step2[41], step2[42]);
2358 step1[43] = vqsubq_s16(step2[40], step2[43]);
2359 step1[44] = vqsubq_s16(step2[47], step2[44]);
2360 step1[45] = vqsubq_s16(step2[46], step2[45]);
2361 step1[46] = vqaddq_s16(step2[46], step2[45]);
2362 step1[47] = vqaddq_s16(step2[47], step2[44]);
2363 step1[48] = vqaddq_s16(step2[48], step2[51]);
2364 step1[49] = vqaddq_s16(step2[49], step2[50]);
2365 step1[50] = vqsubq_s16(step2[49], step2[50]);
2366 step1[51] = vqsubq_s16(step2[48], step2[51]);
2367 step1[52] = vqsubq_s16(step2[55], step2[52]);
2368 step1[53] = vqsubq_s16(step2[54], step2[53]);
2369 step1[54] = vqaddq_s16(step2[54], step2[53]);
2370 step1[55] = vqaddq_s16(step2[55], step2[52]);
2371 step1[56] = vqaddq_s16(step2[56], step2[59]);
2372 step1[57] = vqaddq_s16(step2[57], step2[58]);
2373 step1[58] = vqsubq_s16(step2[57], step2[58]);
2374 step1[59] = vqsubq_s16(step2[56], step2[59]);
2375 step1[60] = vqsubq_s16(step2[63], step2[60]);
2376 step1[61] = vqsubq_s16(step2[62], step2[61]);
2377 step1[62] = vqaddq_s16(step2[62], step2[61]);
2378 step1[63] = vqaddq_s16(step2[63], step2[60]);
2379
2380 // stage 6
2381
2382 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
2383 btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
2384 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
2385 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
2386 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
2387 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
2388 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
2389 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
2390 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
2391 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
2392 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
2393 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
2394
2395 step2[4] = vqaddq_s16(step1[4], step1[5]);
2396 step2[5] = vqsubq_s16(step1[4], step1[5]);
2397 step2[6] = vqsubq_s16(step1[7], step1[6]);
2398 step2[7] = vqaddq_s16(step1[7], step1[6]);
2399 step2[8] = step1[8];
2400 step2[11] = step1[11];
2401 step2[12] = step1[12];
2402 step2[15] = step1[15];
2403 step2[16] = vqaddq_s16(step1[16], step1[19]);
2404 step2[17] = vqaddq_s16(step1[17], step1[18]);
2405 step2[18] = vqsubq_s16(step1[17], step1[18]);
2406 step2[19] = vqsubq_s16(step1[16], step1[19]);
2407 step2[20] = vqsubq_s16(step1[23], step1[20]);
2408 step2[21] = vqsubq_s16(step1[22], step1[21]);
2409 step2[22] = vqaddq_s16(step1[22], step1[21]);
2410 step2[23] = vqaddq_s16(step1[23], step1[20]);
2411 step2[24] = vqaddq_s16(step1[24], step1[27]);
2412 step2[25] = vqaddq_s16(step1[25], step1[26]);
2413 step2[26] = vqsubq_s16(step1[25], step1[26]);
2414 step2[27] = vqsubq_s16(step1[24], step1[27]);
2415 step2[28] = vqsubq_s16(step1[31], step1[28]);
2416 step2[29] = vqsubq_s16(step1[30], step1[29]);
2417 step2[30] = vqaddq_s16(step1[30], step1[29]);
2418 step2[31] = vqaddq_s16(step1[31], step1[28]);
2419 step2[32] = step1[32];
2420 step2[33] = step1[33];
2421 step2[38] = step1[38];
2422 step2[39] = step1[39];
2423 step2[40] = step1[40];
2424 step2[41] = step1[41];
2425 step2[46] = step1[46];
2426 step2[47] = step1[47];
2427 step2[48] = step1[48];
2428 step2[49] = step1[49];
2429 step2[54] = step1[54];
2430 step2[55] = step1[55];
2431 step2[56] = step1[56];
2432 step2[57] = step1[57];
2433 step2[62] = step1[62];
2434 step2[63] = step1[63];
2435
2436 // stage 7
2437
2438 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
2439 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
2440 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
2441 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
2442 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
2443
2444 step1[0] = vqaddq_s16(step2[0], step2[3]);
2445 step1[1] = vqaddq_s16(step2[1], step2[2]);
2446 step1[2] = vqsubq_s16(step2[1], step2[2]);
2447 step1[3] = vqsubq_s16(step2[0], step2[3]);
2448 step1[4] = step2[4];
2449 step1[7] = step2[7];
2450 step1[8] = vqaddq_s16(step2[8], step2[11]);
2451 step1[9] = vqaddq_s16(step2[9], step2[10]);
2452 step1[10] = vqsubq_s16(step2[9], step2[10]);
2453 step1[11] = vqsubq_s16(step2[8], step2[11]);
2454 step1[12] = vqsubq_s16(step2[15], step2[12]);
2455 step1[13] = vqsubq_s16(step2[14], step2[13]);
2456 step1[14] = vqaddq_s16(step2[14], step2[13]);
2457 step1[15] = vqaddq_s16(step2[15], step2[12]);
2458 step1[16] = step2[16];
2459 step1[17] = step2[17];
2460 step1[22] = step2[22];
2461 step1[23] = step2[23];
2462 step1[24] = step2[24];
2463 step1[25] = step2[25];
2464 step1[30] = step2[30];
2465 step1[31] = step2[31];
2466 step1[32] = vqaddq_s16(step2[32], step2[39]);
2467 step1[33] = vqaddq_s16(step2[33], step2[38]);
2468 step1[34] = vqaddq_s16(step2[34], step2[37]);
2469 step1[35] = vqaddq_s16(step2[35], step2[36]);
2470 step1[36] = vqsubq_s16(step2[35], step2[36]);
2471 step1[37] = vqsubq_s16(step2[34], step2[37]);
2472 step1[38] = vqsubq_s16(step2[33], step2[38]);
2473 step1[39] = vqsubq_s16(step2[32], step2[39]);
2474 step1[40] = vqsubq_s16(step2[47], step2[40]);
2475 step1[41] = vqsubq_s16(step2[46], step2[41]);
2476 step1[42] = vqsubq_s16(step2[45], step2[42]);
2477 step1[43] = vqsubq_s16(step2[44], step2[43]);
2478 step1[44] = vqaddq_s16(step2[43], step2[44]);
2479 step1[45] = vqaddq_s16(step2[42], step2[45]);
2480 step1[46] = vqaddq_s16(step2[41], step2[46]);
2481 step1[47] = vqaddq_s16(step2[40], step2[47]);
2482 step1[48] = vqaddq_s16(step2[48], step2[55]);
2483 step1[49] = vqaddq_s16(step2[49], step2[54]);
2484 step1[50] = vqaddq_s16(step2[50], step2[53]);
2485 step1[51] = vqaddq_s16(step2[51], step2[52]);
2486 step1[52] = vqsubq_s16(step2[51], step2[52]);
2487 step1[53] = vqsubq_s16(step2[50], step2[53]);
2488 step1[54] = vqsubq_s16(step2[49], step2[54]);
2489 step1[55] = vqsubq_s16(step2[48], step2[55]);
2490 step1[56] = vqsubq_s16(step2[63], step2[56]);
2491 step1[57] = vqsubq_s16(step2[62], step2[57]);
2492 step1[58] = vqsubq_s16(step2[61], step2[58]);
2493 step1[59] = vqsubq_s16(step2[60], step2[59]);
2494 step1[60] = vqaddq_s16(step2[59], step2[60]);
2495 step1[61] = vqaddq_s16(step2[58], step2[61]);
2496 step1[62] = vqaddq_s16(step2[57], step2[62]);
2497 step1[63] = vqaddq_s16(step2[56], step2[63]);
2498
2499 // stage 8
2500
2501 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
2502 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
2503 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
2504 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
2505 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
2506 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
2507 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
2508 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
2509 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
2510 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
2511
2512 step2[0] = vqaddq_s16(step1[0], step1[7]);
2513 step2[1] = vqaddq_s16(step1[1], step1[6]);
2514 step2[2] = vqaddq_s16(step1[2], step1[5]);
2515 step2[3] = vqaddq_s16(step1[3], step1[4]);
2516 step2[4] = vqsubq_s16(step1[3], step1[4]);
2517 step2[5] = vqsubq_s16(step1[2], step1[5]);
2518 step2[6] = vqsubq_s16(step1[1], step1[6]);
2519 step2[7] = vqsubq_s16(step1[0], step1[7]);
2520 step2[8] = step1[8];
2521 step2[9] = step1[9];
2522 step2[14] = step1[14];
2523 step2[15] = step1[15];
2524 step2[16] = vqaddq_s16(step1[16], step1[23]);
2525 step2[17] = vqaddq_s16(step1[17], step1[22]);
2526 step2[18] = vqaddq_s16(step1[18], step1[21]);
2527 step2[19] = vqaddq_s16(step1[19], step1[20]);
2528 step2[20] = vqsubq_s16(step1[19], step1[20]);
2529 step2[21] = vqsubq_s16(step1[18], step1[21]);
2530 step2[22] = vqsubq_s16(step1[17], step1[22]);
2531 step2[23] = vqsubq_s16(step1[16], step1[23]);
2532 step2[24] = vqsubq_s16(step1[31], step1[24]);
2533 step2[25] = vqsubq_s16(step1[30], step1[25]);
2534 step2[26] = vqsubq_s16(step1[29], step1[26]);
2535 step2[27] = vqsubq_s16(step1[28], step1[27]);
2536 step2[28] = vqaddq_s16(step1[28], step1[27]);
2537 step2[29] = vqaddq_s16(step1[29], step1[26]);
2538 step2[30] = vqaddq_s16(step1[30], step1[25]);
2539 step2[31] = vqaddq_s16(step1[31], step1[24]);
2540 step2[32] = step1[32];
2541 step2[33] = step1[33];
2542 step2[34] = step1[34];
2543 step2[35] = step1[35];
2544 step2[44] = step1[44];
2545 step2[45] = step1[45];
2546 step2[46] = step1[46];
2547 step2[47] = step1[47];
2548 step2[48] = step1[48];
2549 step2[49] = step1[49];
2550 step2[50] = step1[50];
2551 step2[51] = step1[51];
2552 step2[60] = step1[60];
2553 step2[61] = step1[61];
2554 step2[62] = step1[62];
2555 step2[63] = step1[63];
2556
2557 // stage 9
2558 idct64_stage9_neon(step2, step1, cos_bit);
2559
2560 // stage 10
2561 idct64_stage10_neon(step1, step2, cos_bit);
2562
2563 // stage 11
2564
2565 out[0] = vqaddq_s16(step2[0], step2[63]);
2566 out[1] = vqaddq_s16(step2[1], step2[62]);
2567 out[2] = vqaddq_s16(step2[2], step2[61]);
2568 out[3] = vqaddq_s16(step2[3], step2[60]);
2569 out[4] = vqaddq_s16(step2[4], step2[59]);
2570 out[5] = vqaddq_s16(step2[5], step2[58]);
2571 out[6] = vqaddq_s16(step2[6], step2[57]);
2572 out[7] = vqaddq_s16(step2[7], step2[56]);
2573 out[8] = vqaddq_s16(step2[8], step2[55]);
2574 out[9] = vqaddq_s16(step2[9], step2[54]);
2575 out[10] = vqaddq_s16(step2[10], step2[53]);
2576 out[11] = vqaddq_s16(step2[11], step2[52]);
2577 out[12] = vqaddq_s16(step2[12], step2[51]);
2578 out[13] = vqaddq_s16(step2[13], step2[50]);
2579 out[14] = vqaddq_s16(step2[14], step2[49]);
2580 out[15] = vqaddq_s16(step2[15], step2[48]);
2581 out[16] = vqaddq_s16(step2[16], step2[47]);
2582 out[17] = vqaddq_s16(step2[17], step2[46]);
2583 out[18] = vqaddq_s16(step2[18], step2[45]);
2584 out[19] = vqaddq_s16(step2[19], step2[44]);
2585 out[20] = vqaddq_s16(step2[20], step2[43]);
2586 out[21] = vqaddq_s16(step2[21], step2[42]);
2587 out[22] = vqaddq_s16(step2[22], step2[41]);
2588 out[23] = vqaddq_s16(step2[23], step2[40]);
2589 out[24] = vqaddq_s16(step2[24], step2[39]);
2590 out[25] = vqaddq_s16(step2[25], step2[38]);
2591 out[26] = vqaddq_s16(step2[26], step2[37]);
2592 out[27] = vqaddq_s16(step2[27], step2[36]);
2593 out[28] = vqaddq_s16(step2[28], step2[35]);
2594 out[29] = vqaddq_s16(step2[29], step2[34]);
2595 out[30] = vqaddq_s16(step2[30], step2[33]);
2596 out[31] = vqaddq_s16(step2[31], step2[32]);
2597 out[32] = vqsubq_s16(step2[31], step2[32]);
2598 out[33] = vqsubq_s16(step2[30], step2[33]);
2599 out[34] = vqsubq_s16(step2[29], step2[34]);
2600 out[35] = vqsubq_s16(step2[28], step2[35]);
2601 out[36] = vqsubq_s16(step2[27], step2[36]);
2602 out[37] = vqsubq_s16(step2[26], step2[37]);
2603 out[38] = vqsubq_s16(step2[25], step2[38]);
2604 out[39] = vqsubq_s16(step2[24], step2[39]);
2605 out[40] = vqsubq_s16(step2[23], step2[40]);
2606 out[41] = vqsubq_s16(step2[22], step2[41]);
2607 out[42] = vqsubq_s16(step2[21], step2[42]);
2608 out[43] = vqsubq_s16(step2[20], step2[43]);
2609 out[44] = vqsubq_s16(step2[19], step2[44]);
2610 out[45] = vqsubq_s16(step2[18], step2[45]);
2611 out[46] = vqsubq_s16(step2[17], step2[46]);
2612 out[47] = vqsubq_s16(step2[16], step2[47]);
2613 out[48] = vqsubq_s16(step2[15], step2[48]);
2614 out[49] = vqsubq_s16(step2[14], step2[49]);
2615 out[50] = vqsubq_s16(step2[13], step2[50]);
2616 out[51] = vqsubq_s16(step2[12], step2[51]);
2617 out[52] = vqsubq_s16(step2[11], step2[52]);
2618 out[53] = vqsubq_s16(step2[10], step2[53]);
2619 out[54] = vqsubq_s16(step2[9], step2[54]);
2620 out[55] = vqsubq_s16(step2[8], step2[55]);
2621 out[56] = vqsubq_s16(step2[7], step2[56]);
2622 out[57] = vqsubq_s16(step2[6], step2[57]);
2623 out[58] = vqsubq_s16(step2[5], step2[58]);
2624 out[59] = vqsubq_s16(step2[4], step2[59]);
2625 out[60] = vqsubq_s16(step2[3], step2[60]);
2626 out[61] = vqsubq_s16(step2[2], step2[61]);
2627 out[62] = vqsubq_s16(step2[1], step2[62]);
2628 out[63] = vqsubq_s16(step2[0], step2[63]);
2629}
2630
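// Note on the coefficients: cospi_arr(cos_bit) is assumed here to follow the
// AV1 convention cospi[i] == (int32_t)round(cos(i * PI / 128) * (1 << cos_bit)).
// Under that assumption cospi[32] ~= 0.7071 * (1 << cos_bit), so a butterfly
// fed the constant pair (cospi[32], cospi[32]) followed by the INV_COS_BIT
// rounding shift computes (a + b)/sqrt(2) and (a - b)/sqrt(2).
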
2631 static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
2632 int8_t cos_bit) {
2633 const int32_t *cospi = cospi_arr(cos_bit);
2634 int16x8_t step1;
2635 int32x4_t t32[2];
2636
2637 // stage 1
2638 // stage 2
2639 // stage 3
2640 // stage 4
2641 // stage 5
2642 // stage 6
2643
2644 t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
2645 t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
2646
2647 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
2648 vrshrn_n_s32(t32[1], INV_COS_BIT));
2649 // stage 7
2650 // stage 8
2651 // stage 9
2652 // stage 10
2653 // stage 11
2654 out[0] = step1;
2655 out[1] = step1;
2656 out[2] = step1;
2657 out[3] = step1;
2658 out[4] = step1;
2659 out[5] = step1;
2660 out[6] = step1;
2661 out[7] = step1;
2662 out[8] = step1;
2663 out[9] = step1;
2664 out[10] = step1;
2665 out[11] = step1;
2666 out[12] = step1;
2667 out[13] = step1;
2668 out[14] = step1;
2669 out[15] = step1;
2670 out[16] = step1;
2671 out[17] = step1;
2672 out[18] = step1;
2673 out[19] = step1;
2674 out[20] = step1;
2675 out[21] = step1;
2676 out[22] = step1;
2677 out[23] = step1;
2678 out[24] = step1;
2679 out[25] = step1;
2680 out[26] = step1;
2681 out[27] = step1;
2682 out[28] = step1;
2683 out[29] = step1;
2684 out[30] = step1;
2685 out[31] = step1;
2686 out[32] = step1;
2687 out[33] = step1;
2688 out[34] = step1;
2689 out[35] = step1;
2690 out[36] = step1;
2691 out[37] = step1;
2692 out[38] = step1;
2693 out[39] = step1;
2694 out[40] = step1;
2695 out[41] = step1;
2696 out[42] = step1;
2697 out[43] = step1;
2698 out[44] = step1;
2699 out[45] = step1;
2700 out[46] = step1;
2701 out[47] = step1;
2702 out[48] = step1;
2703 out[49] = step1;
2704 out[50] = step1;
2705 out[51] = step1;
2706 out[52] = step1;
2707 out[53] = step1;
2708 out[54] = step1;
2709 out[55] = step1;
2710 out[56] = step1;
2711 out[57] = step1;
2712 out[58] = step1;
2713 out[59] = step1;
2714 out[60] = step1;
2715 out[61] = step1;
2716 out[62] = step1;
2717 out[63] = step1;
2718}
2719
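// idct64_low1_neon() above is the eob == 1 (DC-only) path: with every AC
// coefficient zero, all 64 outputs collapse to dc * cos(pi/4). A minimal
// scalar sketch, assuming INV_COS_BIT == 12 so that cospi[32] == 2896;
// idct64_dc_scalar() is a hypothetical helper for illustration only.
static INLINE int16_t idct64_dc_scalar(int16_t dc) {
  // One multiply by cospi[32], then a rounding shift by INV_COS_BIT, which
  // is what the vmull_n_s16/vrshrn_n_s32 pair above does per lane.
  const int32_t t = (int32_t)dc * 2896;
  return (int16_t)((t + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT);
}
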
static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step2[64], step1[64];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
  const int16x4_t c5 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c6 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[8] = in[4];
  step2[16] = in[2];
  step2[24] = in[6];

  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);

  // stage 3

  step1[0] = step2[0];
  step1[8] = step2[8];

  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);

  step1[32] = step2[32];
  step1[33] = step2[32];
  step1[38] = step2[39];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[40];
  step1[46] = step2[47];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[48];
  step1[54] = step2[55];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[56];
  step1[62] = step2[63];
  step1[63] = step2[63];

  // stage 4

  step2[0] = step1[0];

  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
  btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
  btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);

  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[22] = step1[23];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[30] = step1[31];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[63] = step1[63];

  // stage 5

  step1[0] = step2[0];

  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
  btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);

  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];

  step1[16] = step2[16];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[31] = step2[31];
  step1[32] = step2[32];
  step1[33] = step2[33];
  step1[34] = step2[33];
  step1[35] = step2[32];
  step1[36] = step2[39];
  step1[37] = step2[38];
  step1[38] = step2[38];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[41];
  step1[42] = step2[41];
  step1[43] = step2[40];
  step1[44] = step2[47];
  step1[45] = step2[46];
  step1[46] = step2[46];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[49];
  step1[50] = step2[49];
  step1[51] = step2[48];
  step1[52] = step2[55];
  step1[53] = step2[54];
  step1[54] = step2[54];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[57];
  step1[58] = step2[57];
  step1[59] = step2[56];
  step1[60] = step2[63];
  step1[61] = step2[62];
  step1[62] = step2[62];
  step1[63] = step2[63];

  // stage 6

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
  btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
  btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
  btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
  btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[17];
  step2[19] = step1[16];
  step2[20] = step1[23];
  step2[21] = step1[22];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[26] = step1[25];
  step2[27] = step1[24];
  step2[28] = step1[31];
  step2[29] = step1[30];
  step2[30] = step1[30];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[41] = step1[41];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[54] = step1[54];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 7

  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
  btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
  btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[10] = step2[9];
  step1[11] = step2[8];
  step1[12] = step2[15];
  step1[13] = step2[14];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[39]);
  step1[33] = vqaddq_s16(step2[33], step2[38]);
  step1[34] = vqaddq_s16(step2[34], step2[37]);
  step1[35] = vqaddq_s16(step2[35], step2[36]);
  step1[36] = vqsubq_s16(step2[35], step2[36]);
  step1[37] = vqsubq_s16(step2[34], step2[37]);
  step1[38] = vqsubq_s16(step2[33], step2[38]);
  step1[39] = vqsubq_s16(step2[32], step2[39]);
  step1[40] = vqsubq_s16(step2[47], step2[40]);
  step1[41] = vqsubq_s16(step2[46], step2[41]);
  step1[42] = vqsubq_s16(step2[45], step2[42]);
  step1[43] = vqsubq_s16(step2[44], step2[43]);
  step1[44] = vqaddq_s16(step2[43], step2[44]);
  step1[45] = vqaddq_s16(step2[42], step2[45]);
  step1[46] = vqaddq_s16(step2[41], step2[46]);
  step1[47] = vqaddq_s16(step2[40], step2[47]);
  step1[48] = vqaddq_s16(step2[48], step2[55]);
  step1[49] = vqaddq_s16(step2[49], step2[54]);
  step1[50] = vqaddq_s16(step2[50], step2[53]);
  step1[51] = vqaddq_s16(step2[51], step2[52]);
  step1[52] = vqsubq_s16(step2[51], step2[52]);
  step1[53] = vqsubq_s16(step2[50], step2[53]);
  step1[54] = vqsubq_s16(step2[49], step2[54]);
  step1[55] = vqsubq_s16(step2[48], step2[55]);
  step1[56] = vqsubq_s16(step2[63], step2[56]);
  step1[57] = vqsubq_s16(step2[62], step2[57]);
  step1[58] = vqsubq_s16(step2[61], step2[58]);
  step1[59] = vqsubq_s16(step2[60], step2[59]);
  step1[60] = vqaddq_s16(step2[59], step2[60]);
  step1[61] = vqaddq_s16(step2[58], step2[61]);
  step1[62] = vqaddq_s16(step2[57], step2[62]);
  step1[63] = vqaddq_s16(step2[56], step2[63]);

  // stage 8

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
  btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
  btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
  btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
  btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[3];
  step2[5] = step1[2];
  step2[6] = step1[1];
  step2[7] = step1[0];
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[23]);
  step2[17] = vqaddq_s16(step1[17], step1[22]);
  step2[18] = vqaddq_s16(step1[18], step1[21]);
  step2[19] = vqaddq_s16(step1[19], step1[20]);
  step2[20] = vqsubq_s16(step1[19], step1[20]);
  step2[21] = vqsubq_s16(step1[18], step1[21]);
  step2[22] = vqsubq_s16(step1[17], step1[22]);
  step2[23] = vqsubq_s16(step1[16], step1[23]);
  step2[24] = vqsubq_s16(step1[31], step1[24]);
  step2[25] = vqsubq_s16(step1[30], step1[25]);
  step2[26] = vqsubq_s16(step1[29], step1[26]);
  step2[27] = vqsubq_s16(step1[28], step1[27]);
  step2[28] = vqaddq_s16(step1[28], step1[27]);
  step2[29] = vqaddq_s16(step1[29], step1[26]);
  step2[30] = vqaddq_s16(step1[30], step1[25]);
  step2[31] = vqaddq_s16(step1[31], step1[24]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[44] = step1[44];
  step2[45] = step1[45];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[50] = step1[50];
  step2[51] = step1[51];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 9
  idct64_stage9_neon(step2, step1, cos_bit);

  // stage 10
  idct64_stage10_neon(step1, step2, cos_bit);

  // stage 11

  out[0] = vqaddq_s16(step2[0], step2[63]);
  out[1] = vqaddq_s16(step2[1], step2[62]);
  out[2] = vqaddq_s16(step2[2], step2[61]);
  out[3] = vqaddq_s16(step2[3], step2[60]);
  out[4] = vqaddq_s16(step2[4], step2[59]);
  out[5] = vqaddq_s16(step2[5], step2[58]);
  out[6] = vqaddq_s16(step2[6], step2[57]);
  out[7] = vqaddq_s16(step2[7], step2[56]);
  out[8] = vqaddq_s16(step2[8], step2[55]);
  out[9] = vqaddq_s16(step2[9], step2[54]);
  out[10] = vqaddq_s16(step2[10], step2[53]);
  out[11] = vqaddq_s16(step2[11], step2[52]);
  out[12] = vqaddq_s16(step2[12], step2[51]);
  out[13] = vqaddq_s16(step2[13], step2[50]);
  out[14] = vqaddq_s16(step2[14], step2[49]);
  out[15] = vqaddq_s16(step2[15], step2[48]);
  out[16] = vqaddq_s16(step2[16], step2[47]);
  out[17] = vqaddq_s16(step2[17], step2[46]);
  out[18] = vqaddq_s16(step2[18], step2[45]);
  out[19] = vqaddq_s16(step2[19], step2[44]);
  out[20] = vqaddq_s16(step2[20], step2[43]);
  out[21] = vqaddq_s16(step2[21], step2[42]);
  out[22] = vqaddq_s16(step2[22], step2[41]);
  out[23] = vqaddq_s16(step2[23], step2[40]);
  out[24] = vqaddq_s16(step2[24], step2[39]);
  out[25] = vqaddq_s16(step2[25], step2[38]);
  out[26] = vqaddq_s16(step2[26], step2[37]);
  out[27] = vqaddq_s16(step2[27], step2[36]);
  out[28] = vqaddq_s16(step2[28], step2[35]);
  out[29] = vqaddq_s16(step2[29], step2[34]);
  out[30] = vqaddq_s16(step2[30], step2[33]);
  out[31] = vqaddq_s16(step2[31], step2[32]);
  out[32] = vqsubq_s16(step2[31], step2[32]);
  out[33] = vqsubq_s16(step2[30], step2[33]);
  out[34] = vqsubq_s16(step2[29], step2[34]);
  out[35] = vqsubq_s16(step2[28], step2[35]);
  out[36] = vqsubq_s16(step2[27], step2[36]);
  out[37] = vqsubq_s16(step2[26], step2[37]);
  out[38] = vqsubq_s16(step2[25], step2[38]);
  out[39] = vqsubq_s16(step2[24], step2[39]);
  out[40] = vqsubq_s16(step2[23], step2[40]);
  out[41] = vqsubq_s16(step2[22], step2[41]);
  out[42] = vqsubq_s16(step2[21], step2[42]);
  out[43] = vqsubq_s16(step2[20], step2[43]);
  out[44] = vqsubq_s16(step2[19], step2[44]);
  out[45] = vqsubq_s16(step2[18], step2[45]);
  out[46] = vqsubq_s16(step2[17], step2[46]);
  out[47] = vqsubq_s16(step2[16], step2[47]);
  out[48] = vqsubq_s16(step2[15], step2[48]);
  out[49] = vqsubq_s16(step2[14], step2[49]);
  out[50] = vqsubq_s16(step2[13], step2[50]);
  out[51] = vqsubq_s16(step2[12], step2[51]);
  out[52] = vqsubq_s16(step2[11], step2[52]);
  out[53] = vqsubq_s16(step2[10], step2[53]);
  out[54] = vqsubq_s16(step2[9], step2[54]);
  out[55] = vqsubq_s16(step2[8], step2[55]);
  out[56] = vqsubq_s16(step2[7], step2[56]);
  out[57] = vqsubq_s16(step2[6], step2[57]);
  out[58] = vqsubq_s16(step2[5], step2[58]);
  out[59] = vqsubq_s16(step2[4], step2[59]);
  out[60] = vqsubq_s16(step2[3], step2[60]);
  out[61] = vqsubq_s16(step2[2], step2[61]);
  out[62] = vqsubq_s16(step2[1], step2[62]);
  out[63] = vqsubq_s16(step2[0], step2[63]);
}

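// A scalar model of the btf_16_lane_*_neon() rotations that dominate the
// function above. This is a sketch under the assumption that each helper
// applies the standard AV1 half-butterfly with the two coefficients taken
// from the named lanes of `c`; btf_16_lane_scalar() is illustrative only.
static INLINE void btf_16_lane_scalar(int16_t in0, int16_t in1, int16_t c0,
                                      int16_t c1, int16_t *out0,
                                      int16_t *out1) {
  // out0 = round(in0 * c0 + in1 * c1), out1 = round(in0 * c1 - in1 * c0),
  // each rounded back down by INV_COS_BIT bits.
  const int32_t s0 = (int32_t)in0 * c0 + (int32_t)in1 * c1;
  const int32_t s1 = (int32_t)in0 * c1 - (int32_t)in1 * c0;
  *out0 = (int16_t)((s0 + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT);
  *out1 = (int16_t)((s1 + (1 << (INV_COS_BIT - 1))) >> INV_COS_BIT);
}
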
static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step2[64], step1[64];

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
                     (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
  const int16x4_t c5 =
      set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
  const int16x4_t c6 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c7 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[4] = in[8];
  step2[8] = in[4];
  step2[12] = in[12];
  step2[16] = in[2];
  step2[20] = in[10];
  step2[24] = in[6];
  step2[28] = in[14];

  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
  btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
  btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
  btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
  btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);

  // stage 3

  step1[0] = step2[0];
  step1[4] = step2[4];
  step1[8] = step2[8];
  step1[12] = step2[12];

  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
  btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
  btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);

  step1[32] = step2[32];
  step1[33] = step2[32];
  step1[34] = step2[35];
  step1[35] = step2[35];
  step1[36] = step2[36];
  step1[37] = step2[36];
  step1[38] = step2[39];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[40];
  step1[42] = step2[43];
  step1[43] = step2[43];
  step1[44] = step2[44];
  step1[45] = step2[44];
  step1[46] = step2[47];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[48];
  step1[50] = step2[51];
  step1[51] = step2[51];
  step1[52] = step2[52];
  step1[53] = step2[52];
  step1[54] = step2[55];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[56];
  step1[58] = step2[59];
  step1[59] = step2[59];
  step1[60] = step2[60];
  step1[61] = step2[60];
  step1[62] = step2[63];
  step1[63] = step2[63];

  // stage 4

  step2[0] = step1[0];
  step2[4] = step1[4];

  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
  btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
  btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
  btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
  btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
  btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);

  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[18] = step1[19];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[21] = step1[20];
  step2[22] = step1[23];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[26] = step1[27];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[29] = step1[28];
  step2[30] = step1[31];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[35] = step1[35];
  step2[36] = step1[36];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[43] = step1[43];
  step2[44] = step1[44];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[51] = step1[51];
  step2[52] = step1[52];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[59] = step1[59];
  step2[60] = step1[60];
  step2[63] = step1[63];

  // stage 5

  step1[0] = step2[0];

  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
  btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
  btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
  btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);

  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];
  step1[14] = step2[15];
  step1[15] = step2[15];
  step1[16] = step2[16];
  step1[19] = step2[19];
  step1[20] = step2[20];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[35]);
  step1[33] = vqaddq_s16(step2[33], step2[34]);
  step1[34] = vqsubq_s16(step2[33], step2[34]);
  step1[35] = vqsubq_s16(step2[32], step2[35]);
  step1[36] = vqsubq_s16(step2[39], step2[36]);
  step1[37] = vqsubq_s16(step2[38], step2[37]);
  step1[38] = vqaddq_s16(step2[38], step2[37]);
  step1[39] = vqaddq_s16(step2[39], step2[36]);
  step1[40] = vqaddq_s16(step2[40], step2[43]);
  step1[41] = vqaddq_s16(step2[41], step2[42]);
  step1[42] = vqsubq_s16(step2[41], step2[42]);
  step1[43] = vqsubq_s16(step2[40], step2[43]);
  step1[44] = vqsubq_s16(step2[47], step2[44]);
  step1[45] = vqsubq_s16(step2[46], step2[45]);
  step1[46] = vqaddq_s16(step2[46], step2[45]);
  step1[47] = vqaddq_s16(step2[47], step2[44]);
  step1[48] = vqaddq_s16(step2[48], step2[51]);
  step1[49] = vqaddq_s16(step2[49], step2[50]);
  step1[50] = vqsubq_s16(step2[49], step2[50]);
  step1[51] = vqsubq_s16(step2[48], step2[51]);
  step1[52] = vqsubq_s16(step2[55], step2[52]);
  step1[53] = vqsubq_s16(step2[54], step2[53]);
  step1[54] = vqaddq_s16(step2[54], step2[53]);
  step1[55] = vqaddq_s16(step2[55], step2[52]);
  step1[56] = vqaddq_s16(step2[56], step2[59]);
  step1[57] = vqaddq_s16(step2[57], step2[58]);
  step1[58] = vqsubq_s16(step2[57], step2[58]);
  step1[59] = vqsubq_s16(step2[56], step2[59]);
  step1[60] = vqsubq_s16(step2[63], step2[60]);
  step1[61] = vqsubq_s16(step2[62], step2[61]);
  step1[62] = vqaddq_s16(step2[62], step2[61]);
  step1[63] = vqaddq_s16(step2[63], step2[60]);

  // stage 6

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
  btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
  btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
  btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
  btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);

  step2[4] = step1[4];
  step2[5] = step1[4];
  step2[6] = step1[7];
  step2[7] = step1[7];
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[19]);
  step2[17] = vqaddq_s16(step1[17], step1[18]);
  step2[18] = vqsubq_s16(step1[17], step1[18]);
  step2[19] = vqsubq_s16(step1[16], step1[19]);
  step2[20] = vqsubq_s16(step1[23], step1[20]);
  step2[21] = vqsubq_s16(step1[22], step1[21]);
  step2[22] = vqaddq_s16(step1[22], step1[21]);
  step2[23] = vqaddq_s16(step1[23], step1[20]);
  step2[24] = vqaddq_s16(step1[24], step1[27]);
  step2[25] = vqaddq_s16(step1[25], step1[26]);
  step2[26] = vqsubq_s16(step1[25], step1[26]);
  step2[27] = vqsubq_s16(step1[24], step1[27]);
  step2[28] = vqsubq_s16(step1[31], step1[28]);
  step2[29] = vqsubq_s16(step1[30], step1[29]);
  step2[30] = vqaddq_s16(step1[30], step1[29]);
  step2[31] = vqaddq_s16(step1[31], step1[28]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[41] = step1[41];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[54] = step1[54];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 7

  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
  btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
  btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];
  step1[32] = vqaddq_s16(step2[32], step2[39]);
  step1[33] = vqaddq_s16(step2[33], step2[38]);
  step1[34] = vqaddq_s16(step2[34], step2[37]);
  step1[35] = vqaddq_s16(step2[35], step2[36]);
  step1[36] = vqsubq_s16(step2[35], step2[36]);
  step1[37] = vqsubq_s16(step2[34], step2[37]);
  step1[38] = vqsubq_s16(step2[33], step2[38]);
  step1[39] = vqsubq_s16(step2[32], step2[39]);
  step1[40] = vqsubq_s16(step2[47], step2[40]);
  step1[41] = vqsubq_s16(step2[46], step2[41]);
  step1[42] = vqsubq_s16(step2[45], step2[42]);
  step1[43] = vqsubq_s16(step2[44], step2[43]);
  step1[44] = vqaddq_s16(step2[43], step2[44]);
  step1[45] = vqaddq_s16(step2[42], step2[45]);
  step1[46] = vqaddq_s16(step2[41], step2[46]);
  step1[47] = vqaddq_s16(step2[40], step2[47]);
  step1[48] = vqaddq_s16(step2[48], step2[55]);
  step1[49] = vqaddq_s16(step2[49], step2[54]);
  step1[50] = vqaddq_s16(step2[50], step2[53]);
  step1[51] = vqaddq_s16(step2[51], step2[52]);
  step1[52] = vqsubq_s16(step2[51], step2[52]);
  step1[53] = vqsubq_s16(step2[50], step2[53]);
  step1[54] = vqsubq_s16(step2[49], step2[54]);
  step1[55] = vqsubq_s16(step2[48], step2[55]);
  step1[56] = vqsubq_s16(step2[63], step2[56]);
  step1[57] = vqsubq_s16(step2[62], step2[57]);
  step1[58] = vqsubq_s16(step2[61], step2[58]);
  step1[59] = vqsubq_s16(step2[60], step2[59]);
  step1[60] = vqaddq_s16(step2[59], step2[60]);
  step1[61] = vqaddq_s16(step2[58], step2[61]);
  step1[62] = vqaddq_s16(step2[57], step2[62]);
  step1[63] = vqaddq_s16(step2[56], step2[63]);

  // stage 8

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
  btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
  btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
  btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
  btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[23]);
  step2[17] = vqaddq_s16(step1[17], step1[22]);
  step2[18] = vqaddq_s16(step1[18], step1[21]);
  step2[19] = vqaddq_s16(step1[19], step1[20]);
  step2[20] = vqsubq_s16(step1[19], step1[20]);
  step2[21] = vqsubq_s16(step1[18], step1[21]);
  step2[22] = vqsubq_s16(step1[17], step1[22]);
  step2[23] = vqsubq_s16(step1[16], step1[23]);
  step2[24] = vqsubq_s16(step1[31], step1[24]);
  step2[25] = vqsubq_s16(step1[30], step1[25]);
  step2[26] = vqsubq_s16(step1[29], step1[26]);
  step2[27] = vqsubq_s16(step1[28], step1[27]);
  step2[28] = vqaddq_s16(step1[28], step1[27]);
  step2[29] = vqaddq_s16(step1[29], step1[26]);
  step2[30] = vqaddq_s16(step1[30], step1[25]);
  step2[31] = vqaddq_s16(step1[31], step1[24]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[44] = step1[44];
  step2[45] = step1[45];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[50] = step1[50];
  step2[51] = step1[51];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 9
  idct64_stage9_neon(step2, step1, cos_bit);

  // stage 10
  idct64_stage10_neon(step1, step2, cos_bit);

  // stage 11

  out[0] = vqaddq_s16(step2[0], step2[63]);
  out[1] = vqaddq_s16(step2[1], step2[62]);
  out[2] = vqaddq_s16(step2[2], step2[61]);
  out[3] = vqaddq_s16(step2[3], step2[60]);
  out[4] = vqaddq_s16(step2[4], step2[59]);
  out[5] = vqaddq_s16(step2[5], step2[58]);
  out[6] = vqaddq_s16(step2[6], step2[57]);
  out[7] = vqaddq_s16(step2[7], step2[56]);
  out[8] = vqaddq_s16(step2[8], step2[55]);
  out[9] = vqaddq_s16(step2[9], step2[54]);
  out[10] = vqaddq_s16(step2[10], step2[53]);
  out[11] = vqaddq_s16(step2[11], step2[52]);
  out[12] = vqaddq_s16(step2[12], step2[51]);
  out[13] = vqaddq_s16(step2[13], step2[50]);
  out[14] = vqaddq_s16(step2[14], step2[49]);
  out[15] = vqaddq_s16(step2[15], step2[48]);
  out[16] = vqaddq_s16(step2[16], step2[47]);
  out[17] = vqaddq_s16(step2[17], step2[46]);
  out[18] = vqaddq_s16(step2[18], step2[45]);
  out[19] = vqaddq_s16(step2[19], step2[44]);
  out[20] = vqaddq_s16(step2[20], step2[43]);
  out[21] = vqaddq_s16(step2[21], step2[42]);
  out[22] = vqaddq_s16(step2[22], step2[41]);
  out[23] = vqaddq_s16(step2[23], step2[40]);
  out[24] = vqaddq_s16(step2[24], step2[39]);
  out[25] = vqaddq_s16(step2[25], step2[38]);
  out[26] = vqaddq_s16(step2[26], step2[37]);
  out[27] = vqaddq_s16(step2[27], step2[36]);
  out[28] = vqaddq_s16(step2[28], step2[35]);
  out[29] = vqaddq_s16(step2[29], step2[34]);
  out[30] = vqaddq_s16(step2[30], step2[33]);
  out[31] = vqaddq_s16(step2[31], step2[32]);
  out[32] = vqsubq_s16(step2[31], step2[32]);
  out[33] = vqsubq_s16(step2[30], step2[33]);
  out[34] = vqsubq_s16(step2[29], step2[34]);
  out[35] = vqsubq_s16(step2[28], step2[35]);
  out[36] = vqsubq_s16(step2[27], step2[36]);
  out[37] = vqsubq_s16(step2[26], step2[37]);
  out[38] = vqsubq_s16(step2[25], step2[38]);
  out[39] = vqsubq_s16(step2[24], step2[39]);
  out[40] = vqsubq_s16(step2[23], step2[40]);
  out[41] = vqsubq_s16(step2[22], step2[41]);
  out[42] = vqsubq_s16(step2[21], step2[42]);
  out[43] = vqsubq_s16(step2[20], step2[43]);
  out[44] = vqsubq_s16(step2[19], step2[44]);
  out[45] = vqsubq_s16(step2[18], step2[45]);
  out[46] = vqsubq_s16(step2[17], step2[46]);
  out[47] = vqsubq_s16(step2[16], step2[47]);
  out[48] = vqsubq_s16(step2[15], step2[48]);
  out[49] = vqsubq_s16(step2[14], step2[49]);
  out[50] = vqsubq_s16(step2[13], step2[50]);
  out[51] = vqsubq_s16(step2[12], step2[51]);
  out[52] = vqsubq_s16(step2[11], step2[52]);
  out[53] = vqsubq_s16(step2[10], step2[53]);
  out[54] = vqsubq_s16(step2[9], step2[54]);
  out[55] = vqsubq_s16(step2[8], step2[55]);
  out[56] = vqsubq_s16(step2[7], step2[56]);
  out[57] = vqsubq_s16(step2[6], step2[57]);
  out[58] = vqsubq_s16(step2[5], step2[58]);
  out[59] = vqsubq_s16(step2[4], step2[59]);
  out[60] = vqsubq_s16(step2[3], step2[60]);
  out[61] = vqsubq_s16(step2[2], step2[61]);
  out[62] = vqsubq_s16(step2[1], step2[62]);
  out[63] = vqsubq_s16(step2[0], step2[63]);
}

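// In the low8/low16 paths above, stage 2 and stage 3 pair each retained
// coefficient with a partner that is known to be zero, so the two-input
// butterfly degenerates into two plain scalings of one input. A scalar
// sketch of what btf_16_neon(in, c0, c1, &out0, &out1) is assumed to
// compute in that case; btf_16_zero_partner_scalar() is illustrative only.
static INLINE void btf_16_zero_partner_scalar(int16_t in, int16_t c0,
                                              int16_t c1, int16_t *out0,
                                              int16_t *out1) {
  *out0 = (int16_t)(((int32_t)in * c0 + (1 << (INV_COS_BIT - 1))) >>
                    INV_COS_BIT);
  *out1 = (int16_t)(((int32_t)in * c1 + (1 << (INV_COS_BIT - 1))) >>
                    INV_COS_BIT);
}
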
// Functions for blocks with eob at DC and within the top-left 8x8, 16x16,
// 32x32 corner.
static const transform_neon
    lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
      {
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct8_low1_neon, idct8_neon, NULL, NULL },
        { iadst8_low1_neon, iadst8_neon, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      {
          { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
          { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
          { NULL, NULL, NULL, NULL },
      },
      { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } },
      { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
          idct64_low32_neon },
        { NULL, NULL, NULL, NULL },
        { NULL, NULL, NULL, NULL } }
    };

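// Sketch of how the table above is consulted: the first index is the 1D
// transform-size index, the second the 1D transform family, and the third
// picks the cheapest low-coefficient variant that still covers the last
// nonzero coefficient. get_col_zeros_variant() is a hypothetical wrapper
// for illustration; the real lookups are inlined in the functions below.
static INLINE transform_neon get_col_zeros_variant(TX_SIZE tx_size,
                                                   TX_TYPE tx_type, int eoby) {
  const int fun_idx = lowbd_txfm_all_1d_zeros_idx[eoby];
  return lowbd_txfm_all_1d_zeros_w_arr[get_txh_idx(tx_size)]
                                      [vitx_1d_tab[tx_type]][fun_idx];
}
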
static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
                                                  uint8_t *output, int stride,
                                                  TX_TYPE tx_type,
                                                  TX_SIZE tx_size, int eob) {
  (void)tx_type;
  int16x8_t a[32 * 4];
  int16x8_t b[32 * 4];
  int eobx, eoby;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
                               0);
  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
                               0);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;
  int temp_b = 0;

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
                             -shift[0]);
    for (int j = 0; j < buf_size_w_div8; ++j) {
      transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
    }
    temp_b += 8;
  }
  for (int j = 0; j < buf_size_w_div8; ++j) {
    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
                             txh_idx, txfm_size_row, -shift[1]);
  }
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(
          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
  }
}

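// Scalar model of identity_txfm_round_neon() as used above: the AV1
// identity "transform" only rescales (by sqrt(2), 2, 2*sqrt(2) or 4 as the
// 1D length grows from 4 to 32) and then applies the per-stage rounding
// shift. A sketch for the 4-point case, assuming the NewSqrt2/NewSqrt2Bits
// fixed-point constants from av1_txfm.h; identity4_round_scalar() is
// illustrative only.
static INLINE int16_t identity4_round_scalar(int16_t in, int round_bits) {
  // Scale by sqrt(2) in Q12 fixed point.
  const int32_t scaled =
      (int32_t)round_shift((int64_t)in * NewSqrt2, NewSqrt2Bits);
  // Then the stage rounding shift; round_bits may be 0 for some sizes.
  const int32_t rnd = (round_bits > 0) ? (1 << (round_bits - 1)) : 0;
  return (int16_t)((scaled + rnd) >> round_bits);
}
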
static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t a[16 * 2];
  int16x8_t b[16 * 2];
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
                               0);
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  int temp_b = 0;
  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];

  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    row_txfm(cur_a, cur_a, INV_COS_BIT);
    av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
    if (lr_flip == 1) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        flip_buf_ud_neon(&cur_a[j * 8], 8);
        transpose_arrays_s16_8x8(
            &cur_a[j * 8],
            &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
      }
      temp_b += 8;
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
      }
      temp_b += 8;
    }
  }
  for (int j = 0; j < buf_size_w_div8; ++j) {
    identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
                             txh_idx, txfm_size_row, -shift[1]);
  }
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(
          &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
  }
}

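// The lr_flip branch above realizes FLIPADST's horizontal mirror while the
// data is still in row form: each 8x8 tile is reversed with
// flip_buf_ud_neon() and then transposed into the mirrored tile slot of the
// column-major buffer b. A scalar picture of the intended net mapping,
// assuming a holds one row-major strip and b is column-major;
// transpose_lr_flip_scalar() is illustrative only.
static INLINE void transpose_lr_flip_scalar(const int16_t *a, int16_t *b,
                                            int w, int h) {
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      // Column w-1-x of the output receives column x of the input.
      b[(w - 1 - x) * h + y] = a[y * w + x];
    }
  }
}
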
static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t a[16 * 2];
  int16x8_t b[16 * 2];
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
                               0);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = txfm_size_row;
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  int temp_b = 0;
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    identity_txfm_round_neon(cur_a, cur_a, txw_idx, buf_size_nonzero_w,
                             -shift[0]);
    for (int j = 0; j < buf_size_w_div8; ++j) {
      transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
    }
    temp_b += 8;
  }
  for (int j = 0; j < buf_size_w_div8; ++j) {
    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
                                  -shift[1]);
  }
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
  }
}

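// Rectangular blocks with |log2(width) - log2(height)| == 1 carry an extra
// sqrt(2) gain from the forward transform, which the abs(rect_type) == 1
// branches above undo before the first 1D pass. Scalar model, assuming
// round_shift_for_rect() multiplies by NewInvSqrt2
// (== (1/sqrt(2)) << NewSqrt2Bits); rect_round_scalar() is illustrative
// only.
static INLINE int16_t rect_round_scalar(int16_t in) {
  return (int16_t)round_shift((int64_t)in * NewInvSqrt2, NewSqrt2Bits);
}
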
static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
                                                 uint8_t *output, int stride,
                                                 TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X4;
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
  int r;
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; ++c)
      temp_in[c] = input[c * txfm_size_row];
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);

    input++;
    buf_ptr += txfm_size_col;
  }

  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}

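// The tail of the scalar 4-point paths is plain reconstruction: predicted
// pixel plus inverse-transform residual, clamped to the 8-bit range, which
// is what clip_pixel() does for lowbd data. An equivalent sketch;
// recon_pixel_scalar() is illustrative only.
static INLINE uint8_t recon_pixel_scalar(uint8_t pred, int32_t residual) {
  const int32_t v = (int32_t)pred + residual;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}
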
void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X8;
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
                                                   16, 16, 16, 16 };
  int r;
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; c++)
      temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2,
                               NewSqrt2Bits);

    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    input++;
    buf_ptr += txfm_size_col;
  }

  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}

3882void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00003883 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303884 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003885 TX_SIZE tx_size = TX_8X4;
Venkat000f2f62018-07-05 12:03:05 +05303886 DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
3887 int32_t *temp_in = txfm_buf;
3888
Yaowu Xua19e7622019-04-29 14:12:44 -07003889 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303890 const int txw_idx = get_txw_idx(tx_size);
3891 const int txh_idx = get_txh_idx(tx_size);
Venkat000f2f62018-07-05 12:03:05 +05303892 const int txfm_size_col = tx_size_wide[tx_size];
3893 const int txfm_size_row = tx_size_high[tx_size];
3894 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3895 int32_t *temp_out = temp_in + buf_offset;
3896 int32_t *buf = temp_out + buf_offset;
3897 int32_t *buf_ptr = buf;
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303898 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
3899 16, 16, 16, 16 };
James Zern2fe51352023-07-18 18:05:31 -07003900 int r;
Venkat000f2f62018-07-05 12:03:05 +05303901 const transform_1d_neon row_txfm =
3902 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3903 const transform_1d_neon col_txfm =
3904 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3905
3906 int ud_flip, lr_flip;
3907 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3908
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; c++)
      temp_in[c] = round_shift((int64_t)input[c * txfm_size_row] * NewInvSqrt2,
                               NewSqrt2Bits);

    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    input++;
    buf_ptr += txfm_size_col;
  }

  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}

void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_4X16;
  DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                   16, 16, 16, 16, 16 };
  int r;
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

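  // 4x16 has a 4:1 aspect ratio, for which no sqrt(2) correction is applied;
  // the row pass is followed by the regular shift[0] rounding instead.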
  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; c++)
      temp_in[c] = input[c * txfm_size_row];
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
    input++;
    buf_ptr += txfm_size_col;
  }

  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}

void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
                                    int stride, TX_TYPE tx_type, int eob) {
  (void)eob;
  TX_SIZE tx_size = TX_16X4;
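  // txfm_buf is partitioned in place: temp_in and temp_out each get
  // AOMMAX(width, height) scratch entries, followed by the width * height
  // buffer holding the row-pass output.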
  DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
  int32_t *temp_in = txfm_buf;

  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
  int32_t *temp_out = temp_in + buf_offset;
  int32_t *buf = temp_out + buf_offset;
  int32_t *buf_ptr = buf;
  const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
                                                   16, 16, 16, 16, 16 };
  int r;
  const transform_1d_neon row_txfm =
      lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
  const transform_1d_neon col_txfm =
      lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];

  int ud_flip, lr_flip;
  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  for (int i = 0; i < txfm_size_row; i++) {
    for (int c = 0; c < txfm_size_col; c++)
      temp_in[c] = input[c * txfm_size_row];
    row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
    av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
    input++;
    buf_ptr += txfm_size_col;
  }

  for (int c = 0; c < txfm_size_col; ++c) {
    if (lr_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + c];
    } else {
      // flip left right
      for (r = 0; r < txfm_size_row; ++r)
        temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
    }
    clamp_buf(temp_in, txfm_size_row, 16);
    col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
    av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);

    if (ud_flip == 0) {
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] =
            clip_pixel(output[r * stride + c] + temp_out[r]);
      }
    } else {
      // flip upside down
      for (r = 0; r < txfm_size_row; ++r) {
        output[r * stride + c] = clip_pixel(output[r * stride + c] +
                                            temp_out[txfm_size_row - r - 1]);
      }
    }
  }
}

static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  int16x8_t a[64 * 8];
  int16x8_t b[64 * 8];
  int eobx, eoby, ud_flip, lr_flip;
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
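  // eobx/eoby bound the non-zero coefficient region: only
  // ceil((eoby + 1) / 8) row groups of 8, and (eobx + 1) columns rounded up
  // to a multiple of 8, need to be transformed.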
  const int buf_size_w_div8 = txfm_size_col >> 3;
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w = (eobx + 8) >> 3 << 3;
  const int input_stride = AOMMIN(32, txfm_size_row);
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  int temp_b = 0;

  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

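  // Row pass, 8 rows per iteration: narrow the 32-bit coefficients to 16
  // bits, apply the 1/sqrt(2) correction for 2:1 rectangular sizes, run the
  // row transform, then transpose the result into b[] so that each column of
  // the block becomes a contiguous run for the column pass.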
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    int16x8_t *cur_a = &a[i * txfm_size_col];
    load_buffer_32bit_to_16bit_neon(input, input_stride, cur_a,
                                    buf_size_nonzero_w);
    input += 8;
    if (abs(rect_type) == 1) {
      round_shift_for_rect(cur_a, cur_a, buf_size_nonzero_w);
    }
    row_txfm(cur_a, cur_a, INV_COS_BIT);
    av1_round_shift_array_16_neon(cur_a, txfm_size_col, -shift[0]);
    if (lr_flip == 1) {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        flip_buf_ud_neon(&cur_a[j * 8], 8);
        transpose_arrays_s16_8x8(
            &cur_a[j * 8],
            &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
      }
      temp_b += 8;
    } else {
      for (int j = 0; j < buf_size_w_div8; ++j) {
        transpose_arrays_s16_8x8(&cur_a[j * 8], &b[temp_b + txfm_size_row * j]);
      }
      temp_b += 8;
    }
  }
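  // Column pass, one 8-wide strip at a time, in place on b[].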
  for (int j = 0; j < buf_size_w_div8; ++j) {
    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
                                  -shift[1]);
  }

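  // Add the residual to the prediction, flipped vertically when ud_flip is
  // set. Widths of 4 never reach this function; those sizes take the scalar
  // fallbacks above.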
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
  }
}
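// Dispatch on tx_type: pure identity, identity-in-one-direction (H_* / V_*),
// and general two-pass transforms each have a dedicated implementation.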
static INLINE void lowbd_inv_txfm2d_add_universe_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  switch (tx_type) {
    case IDTX:
      lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
                                     eob);
      break;

    case H_DCT:
    case H_ADST:
    case H_FLIPADST:
      lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
                                           tx_size, eob);
      break;

    case V_DCT:
    case V_ADST:
    case V_FLIPADST:
      lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
                                           tx_size, eob);
      break;

    default:
      lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
                                            tx_size, eob);
      break;
  }
}

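// Sizes with a dimension of 4 do not map cleanly onto 8-lane int16x8_t
// vectors, so they use the scalar 1-D fallbacks above; all other sizes take
// the vectorized universe path. A minimal usage sketch (hypothetical
// variable values; coefficients and eob come from the decoder):
//   av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, dst_stride, DCT_DCT, TX_8X8,
//                                 eob);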
void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
                                   int stride, TX_TYPE tx_type, TX_SIZE tx_size,
                                   int eob) {
  switch (tx_size) {
    case TX_4X4:
      lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
      break;

    case TX_4X8:
      lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
      break;

    case TX_8X4:
      lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
      break;

    case TX_4X16:
      lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
      break;

    case TX_16X4:
      lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
      break;

    default:
      lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
                                         tx_size, eob);
      break;
  }
}
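
// Lossless blocks use the 4x4 Walsh-Hadamard transform, which has no Neon
// implementation here, so they fall back to the C path.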
void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                           const TxfmParam *txfm_param) {
  const TX_TYPE tx_type = txfm_param->tx_type;
  if (!txfm_param->lossless) {
    av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
                                  txfm_param->tx_size, txfm_param->eob);
  } else {
    av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
  }
}