blob: 1628cbf23e779bb52e9a7254c779f95669224477 [file] [log] [blame]
Venkat000f2f62018-07-05 12:03:05 +05301/*
2 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
3 *
4 * This source code is subject to the terms of the BSD 2 Clause License and
5 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6 * was not distributed with this source code in the LICENSE file, you can
7 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
8 * Media Patent License 1.0 was not distributed with this source code in the
9 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10 */
11
Sachin Kumar Garg11e09372018-07-17 18:02:10 +053012#include <arm_neon.h>
13
Venkat000f2f62018-07-05 12:03:05 +053014#include "config/aom_config.h"
15#include "config/aom_dsp_rtcd.h"
16#include "config/av1_rtcd.h"
17
Yaowu Xu01d4a322021-07-15 07:46:13 -070018#include "aom_dsp/arm/transpose_neon.h"
Venkat000f2f62018-07-05 12:03:05 +053019#include "av1/common/av1_inv_txfm1d.h"
20#include "av1/common/av1_inv_txfm1d_cfg.h"
21#include "av1/common/av1_txfm.h"
22#include "av1/common/enums.h"
23#include "av1/common/idct.h"
24#include "av1/common/arm/av1_inv_txfm_neon.h"
25
// 1D itx types
// Identifies which 1D inverse transform kernel to run for one pass (row or
// column) of a 2D transform. IFLIPADST_1D aliases IADST_1D because the same
// kernel is used; the flip itself is presumably applied when writing the
// output (see the *_flip_buffer_* helpers) — confirm against callers.
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,  // number of distinct 1D kernel kinds (alias not counted)
} ITX_TYPE_1D;
34
// 1D transform used for the vertical (column) pass, indexed by the 2D
// TX_TYPE.
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};
41
// 1D transform used for the horizontal (row) pass, indexed by the 2D
// TX_TYPE.
static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
};
48
// 1D functions
// Scalar (C) 1D kernels indexed by [transform length 4..64][ITX_TYPE_1D].
// Rows with NULL entries have no ADST/identity kernel at that length in this
// table (only idct32/idct64 are provided here).
static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
  { av1_idct4, av1_iadst4, av1_iidentity4_c },
  { av1_idct8, av1_iadst8, av1_iidentity8_c },
  { av1_idct16, av1_iadst16, av1_iidentity16_c },
  { av1_idct32, NULL, NULL },
  { av1_idct64, NULL, NULL },
};
57
// Add 8-wide residual rows from 'in' to the 8-wide prediction rows at
// 'output' and store the reconstruction back with unsigned saturation.
// When 'flipud' is set the residual rows are consumed bottom-up.
static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
                                                  uint8_t *output, int stride,
                                                  int flipud,
                                                  const int height) {
  const int step = flipud ? -1 : 1;
  int row = flipud ? (height - 1) : 0;
  for (int i = 0; i < height; ++i, row += step) {
    // Widen the prediction bytes to 16 bits, add the residual, narrow back.
    const int16x8_t pred = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
    const int16x8_t sum = vaddq_s16(pred, in[row]);
    vst1_u8(output, vqmovun_s16(sum));
    output += stride;
  }
}
72
// Reconstruct 16 pixels: widen the prediction bytes, add the two 8-lane
// residual halves (res0 = left half, res1 = right half) and narrow back to
// 8 bits with unsigned saturation.
static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
                                                    int16x8_t res0,
                                                    int16x8_t res1) {
  const int16x8_t lo =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred))), res0);
  const int16x8_t hi =
      vaddq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred))), res1);
  return vcombine_u8(vqmovun_s16(lo), vqmovun_s16(hi));
}
86
// 16-wide variant of lowbd_add_flip_buffer_8xn_neon. Output row i combines
// residual vectors in[j] (left 8 lanes) and in[j + height] (right 8 lanes),
// with j walking bottom-up when 'flipud' is set.
static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
                                                   uint8_t *output, int stride,
                                                   int flipud, int height) {
  const int step = flipud ? -1 : 1;
  int j = flipud ? (height - 1) : 0;
  for (int i = 0; i < height; ++i, j += step) {
    uint8_t *const dst = output + i * stride;
    const uint8x16_t recon =
        lowbd_get_recon_16x16_neon(vld1q_u8(dst), in[j], in[j + height]);
    vst1q_u8(dst, recon);
  }
}
100
// Set 'size' consecutive vectors in 'a' to 'value' broadcast across all 8
// lanes. The broadcast is loop-invariant, so it is built once up front
// instead of re-splatting inside the loop.
static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
                                                int value) {
  const int16x8_t v = vdupq_n_s16((int16_t)value);
  for (int i = 0; i < size; i++) {
    a[i] = v;
  }
}
Sachin Kumar Garg11e09372018-07-17 18:02:10 +0530107
// Butterfly with rounding shift by INV_COS_BIT:
//   *t0 = round(in0 * c[0] + in1 * c[1])
//   *t1 = round(in0 * c[1] - in1 * c[0])
static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  int32x4_t acc0_lo = vmull_lane_s16(in0_lo, c, 0);
  int32x4_t acc0_hi = vmull_lane_s16(in0_hi, c, 0);
  int32x4_t acc1_lo = vmull_lane_s16(in0_lo, c, 1);
  int32x4_t acc1_hi = vmull_lane_s16(in0_hi, c, 1);

  acc0_lo = vmlal_lane_s16(acc0_lo, in1_lo, c, 1);
  acc0_hi = vmlal_lane_s16(acc0_hi, in1_hi, c, 1);
  acc1_lo = vmlsl_lane_s16(acc1_lo, in1_lo, c, 0);
  acc1_hi = vmlsl_lane_s16(acc1_hi, in1_hi, c, 0);

  *t0 = vcombine_s16(vrshrn_n_s32(acc0_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc0_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(acc1_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc1_hi, INV_COS_BIT));
}
132
// Butterfly with rounding shift by INV_COS_BIT (lane roles swapped relative
// to btf_16_lane_0_1_neon):
//   *t0 = round(in0 * c[1] + in1 * c[0])
//   *t1 = round(in0 * c[0] - in1 * c[1])
static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  int32x4_t acc0_lo = vmull_lane_s16(in0_lo, c, 1);
  int32x4_t acc0_hi = vmull_lane_s16(in0_hi, c, 1);
  int32x4_t acc1_lo = vmull_lane_s16(in0_lo, c, 0);
  int32x4_t acc1_hi = vmull_lane_s16(in0_hi, c, 0);

  acc0_lo = vmlal_lane_s16(acc0_lo, in1_lo, c, 0);
  acc0_hi = vmlal_lane_s16(acc0_hi, in1_hi, c, 0);
  acc1_lo = vmlsl_lane_s16(acc1_lo, in1_lo, c, 1);
  acc1_hi = vmlsl_lane_s16(acc1_hi, in1_hi, c, 1);

  *t0 = vcombine_s16(vrshrn_n_s32(acc0_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc0_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(acc1_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc1_hi, INV_COS_BIT));
}
157
// Butterfly with rounding shift by INV_COS_BIT, using lanes 2/3 of 'c':
//   *t0 = round(in0 * c[2] + in1 * c[3])
//   *t1 = round(in0 * c[3] - in1 * c[2])
static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  int32x4_t acc0_lo = vmull_lane_s16(in0_lo, c, 2);
  int32x4_t acc0_hi = vmull_lane_s16(in0_hi, c, 2);
  int32x4_t acc1_lo = vmull_lane_s16(in0_lo, c, 3);
  int32x4_t acc1_hi = vmull_lane_s16(in0_hi, c, 3);

  acc0_lo = vmlal_lane_s16(acc0_lo, in1_lo, c, 3);
  acc0_hi = vmlal_lane_s16(acc0_hi, in1_hi, c, 3);
  acc1_lo = vmlsl_lane_s16(acc1_lo, in1_lo, c, 2);
  acc1_hi = vmlsl_lane_s16(acc1_hi, in1_hi, c, 2);

  *t0 = vcombine_s16(vrshrn_n_s32(acc0_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc0_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(acc1_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc1_hi, INV_COS_BIT));
}
182
// Single-input butterfly with scalar coefficients and rounding shift:
//   *t0 = round(in0 * coef1), *t1 = round(in0 * coef2).
static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t lo = vget_low_s16(in0);
  const int16x4_t hi = vget_high_s16(in0);

  const int32x4_t p0_lo = vmull_n_s16(lo, coef1);
  const int32x4_t p0_hi = vmull_n_s16(hi, coef1);
  const int32x4_t p1_lo = vmull_n_s16(lo, coef2);
  const int32x4_t p1_hi = vmull_n_s16(hi, coef2);

  *t0 = vcombine_s16(vrshrn_n_s32(p0_lo, INV_COS_BIT),
                     vrshrn_n_s32(p0_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(p1_lo, INV_COS_BIT),
                     vrshrn_n_s32(p1_hi, INV_COS_BIT));
}
201
// Butterfly with rounding shift by INV_COS_BIT (lane roles swapped relative
// to btf_16_lane_2_3_neon):
//   *t0 = round(in0 * c[3] + in1 * c[2])
//   *t1 = round(in0 * c[2] - in1 * c[3])
static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  const int16x4_t in0_lo = vget_low_s16(in0);
  const int16x4_t in0_hi = vget_high_s16(in0);
  const int16x4_t in1_lo = vget_low_s16(in1);
  const int16x4_t in1_hi = vget_high_s16(in1);

  int32x4_t acc0_lo = vmull_lane_s16(in0_lo, c, 3);
  int32x4_t acc0_hi = vmull_lane_s16(in0_hi, c, 3);
  int32x4_t acc1_lo = vmull_lane_s16(in0_lo, c, 2);
  int32x4_t acc1_hi = vmull_lane_s16(in0_hi, c, 2);

  acc0_lo = vmlal_lane_s16(acc0_lo, in1_lo, c, 2);
  acc0_hi = vmlal_lane_s16(acc0_hi, in1_hi, c, 2);
  acc1_lo = vmlsl_lane_s16(acc1_lo, in1_lo, c, 3);
  acc1_hi = vmlsl_lane_s16(acc1_hi, in1_hi, c, 3);

  *t0 = vcombine_s16(vrshrn_n_s32(acc0_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc0_hi, INV_COS_BIT));
  *t1 = vcombine_s16(vrshrn_n_s32(acc1_lo, INV_COS_BIT),
                     vrshrn_n_s32(acc1_hi, INV_COS_BIT));
}
226
// In-place butterfly on the pair {x[0], x[1]} with a single coefficient:
//   x[0] = round((x[0] + x[1]) * c[0]), x[1] = round((x[0] - x[1]) * c[0]).
// Multiply each input before the add/sub: summing first in 16 bits would
// overflow in iadst8.
static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
  const int32x4_t a_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
  const int32x4_t a_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
  const int32x4_t b_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
  const int32x4_t b_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);

  const int32x4_t sum_lo = vaddq_s32(a_lo, b_lo);
  const int32x4_t sum_hi = vaddq_s32(a_hi, b_hi);
  const int32x4_t dif_lo = vsubq_s32(a_lo, b_lo);
  const int32x4_t dif_hi = vsubq_s32(a_hi, b_hi);

  x[0] = vcombine_s16(vrshrn_n_s32(sum_lo, INV_COS_BIT),
                      vrshrn_n_s32(sum_hi, INV_COS_BIT));
  x[1] = vcombine_s16(vrshrn_n_s32(dif_lo, INV_COS_BIT),
                      vrshrn_n_s32(dif_hi, INV_COS_BIT));
}
250
// Pack four int16 scalars into an int16x4_t: {c0, c1, c2, c3}.
static INLINE int16x4_t set_s16x4_neon(const int16_t c0, const int16_t c1,
                                       const int16_t c2, const int16_t c3) {
  const int16_t v[4] = { c0, c1, c2, c3 };
  return vld1_s16(v);
}
259
// 8-point inverse ADST on eight parallel columns (one column per vector
// lane). 'in' holds 8 input vectors, 'out' receives 8 output vectors;
// 'cos_bit' selects the cosine table via cospi_arr(). All adds/subs use
// saturating vqaddq/vqsubq so intermediates clamp rather than wrap.
static INLINE void iadst8_neon(int16x8_t *const in, int16x8_t *out,
                               int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // Coefficient quadruples consumed pairwise by the btf_16_lane_* helpers.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[20], (int16_t)cospi[44]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[36], (int16_t)cospi[28],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Stage 1: input permutation for the ADST flow graph.
  x[0] = in[7];
  x[1] = in[0];
  x[2] = in[5];
  x[3] = in[2];
  x[4] = in[3];
  x[5] = in[4];
  x[6] = in[1];
  x[7] = in[6];

  // Stage 2
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);

  // Stage 3
  x[0] = vqaddq_s16(s0, s4);
  x[1] = vqaddq_s16(s1, s5);
  x[2] = vqaddq_s16(s2, s6);
  x[3] = vqaddq_s16(s3, s7);
  x[4] = vqsubq_s16(s0, s4);
  x[5] = vqsubq_s16(s1, s5);
  x[6] = vqsubq_s16(s2, s6);
  x[7] = vqsubq_s16(s3, s7);

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  s2 = x[2];
  s3 = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);

  // Stage 5
  x[0] = vqaddq_s16(s0, s2);
  x[1] = vqaddq_s16(s1, s3);
  x[2] = vqsubq_s16(s0, s2);
  x[3] = vqsubq_s16(s1, s3);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);

  // stage 6: in-place sum/difference butterflies scaled by cospi[32].
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7: output permutation with alternating saturating negation.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
332
// Reduced 8-point inverse ADST for the case where only in[0] is nonzero
// (single nonzero coefficient). Propagates the one input through the same
// stage structure as iadst8_neon with all zero terms elided.
static INLINE void iadst8_low1_neon(int16x8_t *const in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[8];
  int16x8_t s0, s1, s4, s5;

  // Stage 1: only x[1] (fed from in[0]) is live.
  x[1] = in[0];

  // Stage 2
  // Single-input butterfly; note the negated cospi[4] coefficient.
  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);

  // Stage 3: with the other inputs zero, sums and differences coincide.
  x[0] = s0;
  x[1] = s1;
  x[4] = s0;
  x[5] = s1;

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);

  // Stage 5
  x[0] = s0;
  x[1] = s1;
  x[2] = s0;
  x[3] = s1;
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;

  // stage 6
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7: same output permutation/negation pattern as iadst8_neon.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vqnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vqnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vqnegq_s16(x[1]);
}
384
// Full 8-point inverse DCT on eight parallel columns (one column per vector
// lane). 'in' holds 8 input vectors, 'out' receives 8 output vectors;
// 'cos_bit' selects the cosine table. Adds/subs are saturating.
static INLINE void idct8_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[8], step2[8];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  // stage 2: odd-index inputs feed the step1[4..7] butterflies.
  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);

  // stage 3: even-index inputs; plus sum/diff of the stage-2 results.
  btf_16_lane_0_1_neon(in[0], in[4], c1, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(in[2], in[6], c1, &step2[3], &step2[2]);
  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);

  // stage 4
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  btf_16_lane_0_1_neon(step2[6], step2[5], c1, &step1[6], &step1[5]);

  // stage 5: final mirrored sum/difference stage.
  out[0] = vqaddq_s16(step1[0], step2[7]);
  out[1] = vqaddq_s16(step1[1], step1[6]);
  out[2] = vqaddq_s16(step1[2], step1[5]);
  out[3] = vqaddq_s16(step1[3], step2[4]);
  out[4] = vqsubq_s16(step1[3], step2[4]);
  out[5] = vqsubq_s16(step1[2], step1[5]);
  out[6] = vqsubq_s16(step1[1], step1[6]);
  out[7] = vqsubq_s16(step1[0], step2[7]);
}
422
// DC-only 8-point inverse DCT: with only in[0] nonzero, every output equals
// round(in[0] * cospi[32]) >> INV_COS_BIT.
static INLINE void idct8_low1_neon(int16x8_t *in, int16x8_t *out,
                                   int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // stage 1-3 collapse to a single scale of the DC term.
  const int32x4_t dc_lo = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  const int32x4_t dc_hi = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(dc_lo, INV_COS_BIT),
                                    vrshrn_n_s32(dc_hi, INV_COS_BIT));

  // stage 4-5: broadcast the scaled DC to all eight outputs.
  for (int i = 0; i < 8; ++i) {
    out[i] = dc;
  }
}
449
// Rounding right-shift of every vector in 'arr' by 'bit'; no-op when bit is
// zero. vrshlq_s16 with a negative shift amount performs a rounding right
// shift.
void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
  assert(!(size % 4));
  if (!bit) return;
  const int16x8_t neg_shift = vdupq_n_s16((int16_t)(-bit));
  int remaining = size;
  while (remaining--) {
    *arr = vrshlq_s16(*arr, neg_shift);
    ++arr;
  }
}
458
// Reverse the order of 'size' vectors in 'input' in place (vertical flip of
// the residual rows). The two-index swap works for any 'size' — the previous
// version copied through a fixed int16x8_t temp[8], which both limited the
// function to size <= 8 (buffer overrun beyond that) and did a full
// copy-out/copy-back.
static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
  for (int i = 0, j = size - 1; i < j; ++i, --j) {
    const int16x8_t tmp = input[i];
    input[i] = input[j];
    input[j] = tmp;
  }
}
468
// Load eight rows of eight 32-bit coefficients from 'input' and narrow them
// (truncating via vmovn) into the eight int16x8_t vectors of 'a'.
// NOTE(review): despite its name, 'out_size' acts as the row stride of
// 'input' — the loop always reads exactly 8 rows. Confirm against callers.
static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
                                                   int16x8_t *const a,
                                                   int out_size) {
  for (int i = 0; i < 8; ++i) {
    a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
                        vmovn_s32(vld1q_s32(input + 4)));
    input += out_size;
  }
}
478
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +0530479static int16_t sqrt_2_list[TX_SIZES] = { 5793, 2 * 4096, 2 * 5793, 4 * 4096,
480 4 * 5793 };
Sachin Kumar Garg11e09372018-07-17 18:02:10 +0530481
// Identity transform with rounding: scale each coefficient by
// sqrt_2_list[txw_idx] (Q12, rounded off with vrshrq by 12), then apply a
// rounding right shift by 'bit' and narrow back with signed saturation.
static INLINE void identity_txfm_round_neon(int16x8_t *input, int16x8_t *output,
                                            int txw_idx, int8_t size, int bit) {
  const int32x4_t neg_bit = vdupq_n_s32((int32_t)(-bit));
  const int16x4_t scale = vdup_n_s16(sqrt_2_list[txw_idx]);
  for (int i = 0; i < size; i++) {
    const int32x4_t mul_lo = vmull_s16(vget_low_s16(input[i]), scale);
    const int32x4_t mul_hi = vmull_s16(vget_high_s16(input[i]), scale);
    const int32x4_t shifted_lo = vrshlq_s32(vrshrq_n_s32(mul_lo, 12), neg_bit);
    const int32x4_t shifted_hi = vrshlq_s32(vrshrq_n_s32(mul_hi, 12), neg_bit);
    output[i] = vcombine_s16(vqmovn_s32(shifted_lo), vqmovn_s32(shifted_hi));
  }
}
498
// Rectangular-transform scaling: multiply each vector by 1/sqrt(2)
// (NewInvSqrt2 in Q(NewSqrt2Bits)) and narrow back with rounding and
// signed saturation.
static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
                                        int size) {
  for (int z = 0; z < size; ++z) {
    const int32x4_t prod_lo =
        vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
    const int32x4_t prod_hi =
        vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);
    output[z] = vcombine_s16(vqrshrn_n_s32(prod_lo, (int32_t)NewSqrt2Bits),
                             vqrshrn_n_s32(prod_hi, (int32_t)NewSqrt2Bits));
  }
}
514
// DC-only 16-point inverse DCT: with only in[0] nonzero, every one of the
// 16 outputs is round(in[0] * cospi[32]) >> INV_COS_BIT.
static INLINE void idct16_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // stage 4 collapses to a single scale of the DC term.
  const int32x4_t dc_lo = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  const int32x4_t dc_hi = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(dc_lo, INV_COS_BIT),
                                    vrshrn_n_s32(dc_hi, INV_COS_BIT));

  // stage 6-7: broadcast the scaled DC to all sixteen outputs.
  for (int i = 0; i < 16; ++i) {
    out[i] = dc;
  }
}
547
// Full 16-point inverse DCT on eight parallel columns (one column per
// vector lane). 'in' holds 16 input vectors in natural order, 'out' receives
// 16 output vectors; 'cos_bit' selects the cosine table via cospi_arr().
// All adds/subs use saturating vqaddq/vqsubq.
static INLINE void idct16_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];

  // Coefficient quadruples for the lane-indexed butterfly helpers; c4 holds
  // the negated copies of c3 used by the stage-4 btf_16_lane_3_2 butterfly.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
  // stage 2: odd-index inputs feed the step2[8..15] butterflies; even-index
  // inputs are reordered (bit-reversal style) into step2[0..7].

  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);

  step2[0] = in[0];
  step2[1] = in[8];
  step2[2] = in[4];
  step2[3] = in[12];
  step2[4] = in[2];
  step2[5] = in[10];
  step2[6] = in[6];
  step2[7] = in[14];

  // stage 3

  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c4, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);

  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final mirrored sum/difference stage.
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}
668
// Reduced 16-point inverse DCT for the case where only in[0..7] may be
// nonzero (upper half of the coefficients is zero). Stages 1-3 use the
// single-input btf_16_neon form; from stage 4 onward the flow matches
// idct16_neon.
static INLINE void idct16_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  // c1 holds the negated copies of c0, used by the stage-4 lane_3_2
  // butterfly.
  const int16x4_t c1 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[2] = in[4];
  step2[4] = in[2];
  step2[6] = in[6];

  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);

  // stage 3

  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(step1[10], step1[13], c1, &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6
  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7: final mirrored sum/difference stage.

  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}
778
// Full 16-point inverse ADST over 8 independent columns (each int16x8_t
// holds one value from 8 parallel columns). Implements the 9-stage AV1
// iadst16 flow graph using saturating 16-bit adds/subs (vqaddq/vqsubq) and
// the lane-indexed butterfly helpers. 'in' holds the 16 input rows and
// 'out' the 16 output rows; cos_bit selects the cosine table (cospi_arr).
static INLINE void iadst16_neon(int16x8_t *const in, int16x8_t *out,
                                int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // Cosine constants packed 4 per vector so the btf_16_lane_*_neon helpers
  // can select pairs by lane index.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[10], (int16_t)cospi[54]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[26], (int16_t)cospi[38]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[34], (int16_t)cospi[30],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[50], (int16_t)cospi[14],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];   // working vector per flow-graph node
  int16x8_t t[14];   // temporaries preserving pass-through values (indices
                     // 0..11 are used) across the in-place butterfly stages
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1: input permutation required by the iadst16 flow graph.
  x[0] = in[15];
  x[1] = in[0];
  x[2] = in[13];
  x[3] = in[2];
  x[4] = in[11];
  x[5] = in[4];
  x[6] = in[9];
  x[7] = in[6];
  x[8] = in[7];
  x[9] = in[8];
  x[10] = in[5];
  x[11] = in[10];
  x[12] = in[3];
  x[13] = in[12];
  x[14] = in[1];
  x[15] = in[14];

  // Stage 2: eight butterfly rotations with the odd-index cosine pairs.
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);

  // Stage 3: saturating add/sub pairs between the two halves.
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4: x[0..7] pass through; x[8..15] get rotations. The swapped
  // argument order in the _1_0/_3_2 variants realizes the negated-angle
  // rotations of the flow graph.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);

  // Stage 5: add/sub within each half.
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // stage 6: pass-through for x[0..3]/x[8..11], rotations for the rest.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c5, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c5, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c5, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c5, &s15, &s14);

  // Stage 7: final add/sub pairs within groups of four.
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8: in-place half-butterflies (cospi[32] scaling) on four pairs.
  btf_16_half_neon(x + 2, c5);
  btf_16_half_neon(x + 6, c5);
  btf_16_half_neon(x + 10, c5);
  btf_16_half_neon(x + 14, c5);

  // Stage 9: output permutation with alternating saturating negation.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
935
// 16-point inverse ADST specialized for the case where only the first
// coefficient (in[0]) is non-zero. The full 9-stage flow graph collapses:
// each butterfly whose second input would be zero becomes a copy, so only
// a handful of rotations remain. Output layout/negation matches
// iadst16_neon exactly.
static INLINE void iadst16_low1_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[10];  // only t[0], t[1], t[8], t[9] are used
  int16x8_t s0, s1, s4, s5;
  int16x8_t s8, s9, s12, s13;

  // Stage 1: in[0] maps to flow-graph node x[1] (see iadst16_neon).
  x[1] = in[0];

  // Stage 2: single rotation by (cospi[62], -cospi[2]).
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);

  // Stage 3: the add/sub stage degenerates to fan-out (other inputs are 0).
  x[0] = s0;
  x[1] = s1;
  x[8] = s0;
  x[9] = s1;

  // Stage 4: pass-through for x[0..1]; one rotation for x[8..9].
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);

  // Stage 5: fan-out again (sub-halves are zero).
  x[0] = t[0];
  x[1] = t[1];
  x[4] = t[0];
  x[5] = t[1];
  x[8] = s8;
  x[9] = s9;
  x[12] = s8;
  x[13] = s9;

  // stage 6: two rotations, two pass-throughs.
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  t[8] = x[8];
  t[9] = x[9];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);

  // Stage 7: final fan-out into all 16 nodes.
  x[0] = t[0];
  x[1] = t[1];
  x[2] = t[0];
  x[3] = t[1];
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;
  x[8] = t[8];
  x[9] = t[9];
  x[10] = t[8];
  x[11] = t[9];
  x[12] = s12;
  x[13] = s13;
  x[14] = s12;
  x[15] = s13;

  // Stage 8: in-place half-butterflies (cospi[32] scaling) on four pairs.
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9: same output permutation/negation as the full iadst16.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
1026
// 16-point inverse ADST specialized for the case where only the first 8
// coefficients (in[0..7]) are non-zero. Stage 2's full-width butterflies
// reduce to single-input rotations (btf_16_neon) because one input of each
// pair is zero; from stage 3 onward the flow matches iadst16_neon.
static INLINE void iadst16_low8_neon(int16x8_t *const in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  int16x8_t x[16];
  int16x8_t t[14];  // indices 0..11 are used
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1: place the 8 live inputs at their flow-graph positions; the
  // remaining nodes are implicitly zero and never read.
  x[1] = in[0];
  x[3] = in[2];
  x[5] = in[4];
  x[7] = in[6];
  x[8] = in[7];
  x[10] = in[5];
  x[12] = in[3];
  x[14] = in[1];

  // Stage 2: single-input rotations (partner input is zero).
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);

  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);

  // Stage 3: saturating add/sub pairs between the two halves.
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4: x[0..7] pass through; rotations on x[8..15] (argument swaps
  // in the _1_0/_3_2 variants give the negated-angle rotations).
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c0, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c0, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c0, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c0, &s15, &s14);

  // Stage 5: add/sub within each half.
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // stage 6: pass-through for x[0..3]/x[8..11], rotations for the rest.
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c1, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c1, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c1, &s15, &s14);

  // Stage 7: final add/sub pairs within groups of four.
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8: in-place half-butterflies (cospi[32] scaling) on four pairs.
  btf_16_half_neon(x + 2, c1);
  btf_16_half_neon(x + 6, c1);
  btf_16_half_neon(x + 10, c1);
  btf_16_half_neon(x + 14, c1);

  // Stage 9: same output permutation/negation as the full iadst16.
  out[0] = x[0];
  out[1] = vqnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vqnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vqnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vqnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vqnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vqnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vqnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vqnegq_s16(x[1]);
}
1168
// Full 32-point inverse DCT over 8 independent columns (each int16x8_t
// carries one row of 8 parallel columns). Implements the 9-stage AV1
// idct32 flow graph: stage-2 bit-reversal-style input permutation,
// butterfly rotations via the lane-indexed helpers, and saturating 16-bit
// add/sub (vqaddq/vqsubq) combine stages. 'in' holds 32 input rows, 'out'
// receives 32 output rows; cos_bit selects the cosine table (cospi_arr).
static INLINE void idct32_neon(int16x8_t *in, int16x8_t *out, int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];  // ping-pong buffers between stages

  // Cosine constants packed 4 per vector for lane-indexed butterflies.
  // c8/c9 are the negated counterparts of c6/c7, used where the flow
  // graph requires rotations with negated angles.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[2], (int16_t)cospi[62],
                                      (int16_t)cospi[34], (int16_t)cospi[30]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[18], (int16_t)cospi[46],
                                      (int16_t)cospi[50], (int16_t)cospi[14]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[10], (int16_t)cospi[54],
                                      (int16_t)cospi[42], (int16_t)cospi[22]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[26], (int16_t)cospi[38],
                                      (int16_t)cospi[58], (int16_t)cospi[6]);
  const int16x4_t c4 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c5 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c6 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c7 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c8 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c9 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 2: rotations for the odd-index inputs (nodes 16..31)...

  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);

  // ...and the permuted pass-through of the even-index inputs (nodes 0..15).
  step2[0] = in[0];
  step2[1] = in[16];
  step2[2] = in[8];
  step2[3] = in[24];
  step2[4] = in[4];
  step2[5] = in[20];
  step2[6] = in[12];
  step2[7] = in[28];
  step2[8] = in[2];
  step2[9] = in[18];
  step2[10] = in[10];
  step2[11] = in[26];
  step2[12] = in[6];
  step2[13] = in[22];
  step2[14] = in[14];
  step2[15] = in[30];

  // stage 3: rotations for nodes 8..15, add/sub for nodes 16..31.

  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[4] = step2[4];
  step1[5] = step2[5];
  step1[6] = step2[6];
  step1[7] = step2[7];

  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4: rotations for nodes 4..7 and the 16..31 interior nodes
  // (c8 supplies the negated constants), add/sub for nodes 8..15.

  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c8, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c8, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5: rotations for nodes 0..3 and 9/10/13/14, add/sub elsewhere.

  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c9, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c9, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c9, &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[1], step1[2]);
  step2[2] = vqsubq_s16(step1[1], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9: final mirror-image add/sub producing the 32 outputs.

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
1451
// 32-point inverse DCT specialized for a lone DC coefficient: with only
// in[0] non-zero, every butterfly stage degenerates and the transform
// reduces to a single rounded scale by cospi[32], replicated to all 32
// output rows. Matches idct32_neon for this input pattern.
static INLINE void idct32_low1_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);

  // Stages 1-4 and 6-9 are identity for the DC term; only the stage-5
  // cospi[32] rotation survives. Widen to 32 bits for the multiply, then
  // round-shift back down to 16 bits.
  const int32x4_t prod_lo = vmull_n_s16(vget_low_s16(in[0]), cospi[32]);
  const int32x4_t prod_hi = vmull_n_s16(vget_high_s16(in[0]), cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(prod_lo, INV_COS_BIT),
                                    vrshrn_n_s32(prod_hi, INV_COS_BIT));

  // Every output row carries the same DC value.
  for (int i = 0; i < 32; i++) {
    out[i] = dc;
  }
}
1507
Yaowu Xueb5e4e22020-04-06 14:17:55 -07001508static INLINE void idct32_low8_neon(int16x8_t *in, int16x8_t *out,
Scott LaVarnwayed25b612022-02-17 13:28:23 -05001509 int8_t cos_bit) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05301510 const int32_t *cospi = cospi_arr(cos_bit);
1511 int16x8_t step1[32], step2[32];
1512 int32x4_t t32[16];
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05301513 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
1514 (int16_t)cospi[40], (int16_t)cospi[24]);
1515 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
1516 (int16_t)cospi[16], cospi[48]);
1517 const int16x4_t c2 =
1518 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
1519 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
1520 const int16x4_t c3 =
1521 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
1522 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05301523 // stage 1
1524 // stage 2
1525
1526 step2[0] = in[0];
1527 step2[4] = in[4];
1528 step2[8] = in[2];
1529 step2[12] = in[6];
1530
1531 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
1532 btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
1533 btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
1534 btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
1535
1536 // stage 3
1537 step1[0] = step2[0];
1538 step1[4] = step2[4];
1539
1540 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
1541 btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
1542
1543 step1[16] = step2[16];
1544 step1[17] = step2[16];
1545 step1[18] = step2[19];
1546 step1[19] = step2[19];
1547 step1[20] = step2[20];
1548 step1[21] = step2[20];
1549 step1[22] = step2[23];
1550 step1[23] = step2[23];
1551 step1[24] = step2[24];
1552 step1[25] = step2[24];
1553 step1[26] = step2[27];
1554 step1[27] = step2[27];
1555 step1[28] = step2[28];
1556 step1[29] = step2[28];
1557 step1[30] = step2[31];
1558 step1[31] = step2[31];
1559
1560 // stage 4
1561
1562 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
1563 btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05301564 btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05301565 btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05301566 btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05301567
1568 step2[0] = step1[0];
1569 step2[8] = step1[8];
1570 step2[9] = step1[8];
1571 step2[10] = step1[11];
1572 step2[11] = step1[11];
1573 step2[12] = step1[12];
1574 step2[13] = step1[12];
1575 step2[14] = step1[15];
1576 step2[15] = step1[15];
1577 step2[16] = step1[16];
1578 step2[19] = step1[19];
1579 step2[20] = step1[20];
1580 step2[23] = step1[23];
1581 step2[24] = step1[24];
1582 step2[27] = step1[27];
1583 step2[28] = step1[28];
1584 step2[31] = step1[31];
1585
1586 // stage 5
1587
1588 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
1589 t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
1590 step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
1591 vrshrn_n_s32(t32[1], INV_COS_BIT));
1592
1593 btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05301594 btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05301595
1596 step1[4] = step2[4];
1597 step1[5] = step2[4];
1598 step1[6] = step2[7];
1599 step1[7] = step2[7];
1600 step1[8] = step2[8];
1601 step1[11] = step2[11];
1602 step1[12] = step2[12];
1603 step1[15] = step2[15];
1604 step1[16] = vqaddq_s16(step2[16], step2[19]);
1605 step1[17] = vqaddq_s16(step2[17], step2[18]);
1606 step1[18] = vqsubq_s16(step2[17], step2[18]);
1607 step1[19] = vqsubq_s16(step2[16], step2[19]);
1608 step1[20] = vqsubq_s16(step2[23], step2[20]);
1609 step1[21] = vqsubq_s16(step2[22], step2[21]);
1610 step1[22] = vqaddq_s16(step2[22], step2[21]);
1611 step1[23] = vqaddq_s16(step2[23], step2[20]);
1612 step1[24] = vqaddq_s16(step2[24], step2[27]);
1613 step1[25] = vqaddq_s16(step2[25], step2[26]);
1614 step1[26] = vqsubq_s16(step2[25], step2[26]);
1615 step1[27] = vqsubq_s16(step2[24], step2[27]);
1616 step1[28] = vqsubq_s16(step2[31], step2[28]);
1617 step1[29] = vqsubq_s16(step2[30], step2[29]);
1618 step1[30] = vqaddq_s16(step2[30], step2[29]);
1619 step1[31] = vqaddq_s16(step2[31], step2[28]);
1620
1621 // stage 6
1622
1623 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
1624 btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
1625 btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05301626 btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
1627 btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05301628
1629 step2[0] = step1[0];
1630 step2[1] = step1[0];
1631 step2[2] = step1[0];
1632 step2[3] = step1[0];
1633 step2[4] = step1[4];
1634 step2[7] = step1[7];
1635 step2[8] = vqaddq_s16(step1[8], step1[11]);
1636 step2[9] = vqaddq_s16(step1[9], step1[10]);
1637 step2[10] = vqsubq_s16(step1[9], step1[10]);
1638 step2[11] = vqsubq_s16(step1[8], step1[11]);
1639 step2[12] = vqsubq_s16(step1[15], step1[12]);
1640 step2[13] = vqsubq_s16(step1[14], step1[13]);
1641 step2[14] = vqaddq_s16(step1[14], step1[13]);
1642 step2[15] = vqaddq_s16(step1[15], step1[12]);
1643 step2[16] = step1[16];
1644 step2[17] = step1[17];
1645 step2[22] = step1[22];
1646 step2[23] = step1[23];
1647 step2[24] = step1[24];
1648 step2[25] = step1[25];
1649 step2[30] = step1[30];
1650 step2[31] = step1[31];
1651
1652 // stage 7
1653
1654 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
1655 btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
1656
1657 step1[0] = vqaddq_s16(step2[0], step2[7]);
1658 step1[1] = vqaddq_s16(step2[1], step2[6]);
1659 step1[2] = vqaddq_s16(step2[2], step2[5]);
1660 step1[3] = vqaddq_s16(step2[3], step2[4]);
1661 step1[4] = vqsubq_s16(step2[3], step2[4]);
1662 step1[5] = vqsubq_s16(step2[2], step2[5]);
1663 step1[6] = vqsubq_s16(step2[1], step2[6]);
1664 step1[7] = vqsubq_s16(step2[0], step2[7]);
1665 step1[8] = step2[8];
1666 step1[9] = step2[9];
1667 step1[14] = step2[14];
1668 step1[15] = step2[15];
1669 step1[16] = vqaddq_s16(step2[16], step2[23]);
1670 step1[17] = vqaddq_s16(step2[17], step2[22]);
1671 step1[18] = vqaddq_s16(step2[18], step2[21]);
1672 step1[19] = vqaddq_s16(step2[19], step2[20]);
1673 step1[20] = vqsubq_s16(step2[19], step2[20]);
1674 step1[21] = vqsubq_s16(step2[18], step2[21]);
1675 step1[22] = vqsubq_s16(step2[17], step2[22]);
1676 step1[23] = vqsubq_s16(step2[16], step2[23]);
1677 step1[24] = vqsubq_s16(step2[31], step2[24]);
1678 step1[25] = vqsubq_s16(step2[30], step2[25]);
1679 step1[26] = vqsubq_s16(step2[29], step2[26]);
1680 step1[27] = vqsubq_s16(step2[28], step2[27]);
1681 step1[28] = vqaddq_s16(step2[27], step2[28]);
1682 step1[29] = vqaddq_s16(step2[26], step2[29]);
1683 step1[30] = vqaddq_s16(step2[25], step2[30]);
1684 step1[31] = vqaddq_s16(step2[24], step2[31]);
1685
1686 // stage 8
1687
1688 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
1689 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
1690 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
1691 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
1692
1693 step2[0] = vqaddq_s16(step1[0], step1[15]);
1694 step2[1] = vqaddq_s16(step1[1], step1[14]);
1695 step2[2] = vqaddq_s16(step1[2], step1[13]);
1696 step2[3] = vqaddq_s16(step1[3], step1[12]);
1697 step2[4] = vqaddq_s16(step1[4], step1[11]);
1698 step2[5] = vqaddq_s16(step1[5], step1[10]);
1699 step2[6] = vqaddq_s16(step1[6], step1[9]);
1700 step2[7] = vqaddq_s16(step1[7], step1[8]);
1701 step2[8] = vqsubq_s16(step1[7], step1[8]);
1702 step2[9] = vqsubq_s16(step1[6], step1[9]);
1703 step2[10] = vqsubq_s16(step1[5], step1[10]);
1704 step2[11] = vqsubq_s16(step1[4], step1[11]);
1705 step2[12] = vqsubq_s16(step1[3], step1[12]);
1706 step2[13] = vqsubq_s16(step1[2], step1[13]);
1707 step2[14] = vqsubq_s16(step1[1], step1[14]);
1708 step2[15] = vqsubq_s16(step1[0], step1[15]);
1709 step2[16] = step1[16];
1710 step2[17] = step1[17];
1711 step2[18] = step1[18];
1712 step2[19] = step1[19];
1713 step2[28] = step1[28];
1714 step2[29] = step1[29];
1715 step2[30] = step1[30];
1716 step2[31] = step1[31];
1717
1718 // stage 9
1719
1720 out[0] = vqaddq_s16(step2[0], step2[31]);
1721 out[1] = vqaddq_s16(step2[1], step2[30]);
1722 out[2] = vqaddq_s16(step2[2], step2[29]);
1723 out[3] = vqaddq_s16(step2[3], step2[28]);
1724 out[4] = vqaddq_s16(step2[4], step2[27]);
1725 out[5] = vqaddq_s16(step2[5], step2[26]);
1726 out[6] = vqaddq_s16(step2[6], step2[25]);
1727 out[7] = vqaddq_s16(step2[7], step2[24]);
1728 out[8] = vqaddq_s16(step2[8], step2[23]);
1729 out[9] = vqaddq_s16(step2[9], step2[22]);
1730 out[10] = vqaddq_s16(step2[10], step2[21]);
1731 out[11] = vqaddq_s16(step2[11], step2[20]);
1732 out[12] = vqaddq_s16(step2[12], step2[19]);
1733 out[13] = vqaddq_s16(step2[13], step2[18]);
1734 out[14] = vqaddq_s16(step2[14], step2[17]);
1735 out[15] = vqaddq_s16(step2[15], step2[16]);
1736 out[16] = vqsubq_s16(step2[15], step2[16]);
1737 out[17] = vqsubq_s16(step2[14], step2[17]);
1738 out[18] = vqsubq_s16(step2[13], step2[18]);
1739 out[19] = vqsubq_s16(step2[12], step2[19]);
1740 out[20] = vqsubq_s16(step2[11], step2[20]);
1741 out[21] = vqsubq_s16(step2[10], step2[21]);
1742 out[22] = vqsubq_s16(step2[9], step2[22]);
1743 out[23] = vqsubq_s16(step2[8], step2[23]);
1744 out[24] = vqsubq_s16(step2[7], step2[24]);
1745 out[25] = vqsubq_s16(step2[6], step2[25]);
1746 out[26] = vqsubq_s16(step2[5], step2[26]);
1747 out[27] = vqsubq_s16(step2[4], step2[27]);
1748 out[28] = vqsubq_s16(step2[3], step2[28]);
1749 out[29] = vqsubq_s16(step2[2], step2[29]);
1750 out[30] = vqsubq_s16(step2[1], step2[30]);
1751 out[31] = vqsubq_s16(step2[0], step2[31]);
1752}
1753
// 32-point inverse DCT over 8 lanes (columns) at a time, specialized for the
// case where only the first 16 input coefficients can be non-zero: the body
// reads in[0]..in[15] only, so butterflies fed exclusively by the implied
// zero coefficients are folded into direct copies/half-butterflies.
// in/out: arrays of int16x8_t rows; cos_bit selects the cosine table.
static INLINE void idct32_low16_neon(int16x8_t *in, int16x8_t *out,
                                     int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];
  int32x4_t t32[16];
  // Constant vectors for the lane-indexed butterfly helpers
  // (btf_16_lane_*): c0/c1 hold positive cospi values, c2/c3 the negated
  // counterparts used by the mirrored butterflies.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c2 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c3 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2
  // Odd input coefficients seed the step2[16..31] half of the network; the
  // paired zero coefficient makes each a single-input (btf_16_neon)
  // butterfly.

  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
  btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
  btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
  btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
  btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);

  // Even coefficients drop straight into the 16-point sub-transform slots.
  step2[0] = in[0];
  step2[2] = in[8];
  step2[4] = in[4];
  step2[6] = in[12];
  step2[8] = in[2];
  step2[10] = in[10];
  step2[12] = in[6];
  step2[14] = in[14];

  // stage 3

  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
  btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
  btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[4] = step2[4];
  step1[6] = step2[6];
  // Saturating adds/subs (vqaddq/vqsubq) guard against int16 overflow
  // through the butterfly stages.
  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4

  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
  btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(step1[18], step1[29], c2, &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(step1[22], step1[25], c2, &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[2] = step1[2];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  // DC path: multiply by cospi[32] in 32-bit, then round-shift back to
  // int16 with INV_COS_BIT precision.
  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);

  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                          vrshrn_n_s32(t32[1], INV_COS_BIT));

  btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(step2[10], step2[13], c3, &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(step1[20], step1[27], c3, &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(step1[21], step1[26], c3, &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[0], step1[2]);
  step2[2] = vqsubq_s16(step1[0], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9
  // Final mirror: out[i] / out[31-i] are sum / difference of the two halves.

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}
// Stage 9 of the 64-point inverse DCT: reads the stage-8 results from step2
// and writes stage-9 results to step1 (both arrays of 64 int16x8_t rows).
// Applies the cospi[32] rotation to rows 20..27 against their 24..27
// mirrors, merges the 0..15 half with its reflection, passes 16..19 and
// 28..31 through, and does the 32..63 mirror add/sub.
static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
                                      int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Only lanes 0/1 (cospi[32], cospi[32]) of c3 are used by the
  // btf_16_lane_0_1_neon calls below.
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
  btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
  btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]);
  btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]);

  // Saturating butterfly over rows 0..15 and their mirror 15..0.
  step1[0] = vqaddq_s16(step2[0], step2[15]);
  step1[1] = vqaddq_s16(step2[1], step2[14]);
  step1[2] = vqaddq_s16(step2[2], step2[13]);
  step1[3] = vqaddq_s16(step2[3], step2[12]);
  step1[4] = vqaddq_s16(step2[4], step2[11]);
  step1[5] = vqaddq_s16(step2[5], step2[10]);
  step1[6] = vqaddq_s16(step2[6], step2[9]);
  step1[7] = vqaddq_s16(step2[7], step2[8]);
  step1[8] = vqsubq_s16(step2[7], step2[8]);
  step1[9] = vqsubq_s16(step2[6], step2[9]);
  step1[10] = vqsubq_s16(step2[5], step2[10]);
  step1[11] = vqsubq_s16(step2[4], step2[11]);
  step1[12] = vqsubq_s16(step2[3], step2[12]);
  step1[13] = vqsubq_s16(step2[2], step2[13]);
  step1[14] = vqsubq_s16(step2[1], step2[14]);
  step1[15] = vqsubq_s16(step2[0], step2[15]);
  // Rows untouched at this stage are copied through.
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];
  // Mirror butterfly across the 32..63 half (paired with 47..32 / 63..48).
  step1[32] = vqaddq_s16(step2[32], step2[47]);
  step1[33] = vqaddq_s16(step2[33], step2[46]);
  step1[34] = vqaddq_s16(step2[34], step2[45]);
  step1[35] = vqaddq_s16(step2[35], step2[44]);
  step1[36] = vqaddq_s16(step2[36], step2[43]);
  step1[37] = vqaddq_s16(step2[37], step2[42]);
  step1[38] = vqaddq_s16(step2[38], step2[41]);
  step1[39] = vqaddq_s16(step2[39], step2[40]);
  step1[40] = vqsubq_s16(step2[39], step2[40]);
  step1[41] = vqsubq_s16(step2[38], step2[41]);
  step1[42] = vqsubq_s16(step2[37], step2[42]);
  step1[43] = vqsubq_s16(step2[36], step2[43]);
  step1[44] = vqsubq_s16(step2[35], step2[44]);
  step1[45] = vqsubq_s16(step2[34], step2[45]);
  step1[46] = vqsubq_s16(step2[33], step2[46]);
  step1[47] = vqsubq_s16(step2[32], step2[47]);
  step1[48] = vqsubq_s16(step2[63], step2[48]);
  step1[49] = vqsubq_s16(step2[62], step2[49]);
  step1[50] = vqsubq_s16(step2[61], step2[50]);
  step1[51] = vqsubq_s16(step2[60], step2[51]);
  step1[52] = vqsubq_s16(step2[59], step2[52]);
  step1[53] = vqsubq_s16(step2[58], step2[53]);
  step1[54] = vqsubq_s16(step2[57], step2[54]);
  step1[55] = vqsubq_s16(step2[56], step2[55]);
  step1[56] = vqaddq_s16(step2[56], step2[55]);
  step1[57] = vqaddq_s16(step2[57], step2[54]);
  step1[58] = vqaddq_s16(step2[58], step2[53]);
  step1[59] = vqaddq_s16(step2[59], step2[52]);
  step1[60] = vqaddq_s16(step2[60], step2[51]);
  step1[61] = vqaddq_s16(step2[61], step2[50]);
  step1[62] = vqaddq_s16(step2[62], step2[49]);
  step1[63] = vqaddq_s16(step2[63], step2[48]);
}
2084
// Stage 10 of the 64-point inverse DCT: reads stage-9 results from step1 and
// writes stage-10 results to step2. Applies the cospi[32] rotation to rows
// 40..47 against their 48..55 mirrors, merges the 0..31 half with its
// reflection, and passes rows 32..39 and 56..63 through unchanged.
static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
                                       int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  // Only lanes 0/1 (cospi[32], cospi[32]) of c3 are used by the
  // btf_16_lane_0_1_neon calls below.
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);

  btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
  btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]);
  btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]);
  btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]);
  btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]);
  btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]);
  btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]);

  // Saturating butterfly over rows 0..31 and their mirror 31..0.
  step2[0] = vqaddq_s16(step1[0], step1[31]);
  step2[1] = vqaddq_s16(step1[1], step1[30]);
  step2[2] = vqaddq_s16(step1[2], step1[29]);
  step2[3] = vqaddq_s16(step1[3], step1[28]);
  step2[4] = vqaddq_s16(step1[4], step1[27]);
  step2[5] = vqaddq_s16(step1[5], step1[26]);
  step2[6] = vqaddq_s16(step1[6], step1[25]);
  step2[7] = vqaddq_s16(step1[7], step1[24]);
  step2[8] = vqaddq_s16(step1[8], step1[23]);
  step2[9] = vqaddq_s16(step1[9], step1[22]);
  step2[10] = vqaddq_s16(step1[10], step1[21]);
  step2[11] = vqaddq_s16(step1[11], step1[20]);
  step2[12] = vqaddq_s16(step1[12], step1[19]);
  step2[13] = vqaddq_s16(step1[13], step1[18]);
  step2[14] = vqaddq_s16(step1[14], step1[17]);
  step2[15] = vqaddq_s16(step1[15], step1[16]);
  step2[16] = vqsubq_s16(step1[15], step1[16]);
  step2[17] = vqsubq_s16(step1[14], step1[17]);
  step2[18] = vqsubq_s16(step1[13], step1[18]);
  step2[19] = vqsubq_s16(step1[12], step1[19]);
  step2[20] = vqsubq_s16(step1[11], step1[20]);
  step2[21] = vqsubq_s16(step1[10], step1[21]);
  step2[22] = vqsubq_s16(step1[9], step1[22]);
  step2[23] = vqsubq_s16(step1[8], step1[23]);
  step2[24] = vqsubq_s16(step1[7], step1[24]);
  step2[25] = vqsubq_s16(step1[6], step1[25]);
  step2[26] = vqsubq_s16(step1[5], step1[26]);
  step2[27] = vqsubq_s16(step1[4], step1[27]);
  step2[28] = vqsubq_s16(step1[3], step1[28]);
  step2[29] = vqsubq_s16(step1[2], step1[29]);
  step2[30] = vqsubq_s16(step1[1], step1[30]);
  step2[31] = vqsubq_s16(step1[0], step1[31]);
  // Pass-through rows not rotated or merged at this stage.
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[36] = step1[36];
  step2[37] = step1[37];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[58] = step1[58];
  step2[59] = step1[59];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];
}
2149
Yaowu Xueb5e4e22020-04-06 14:17:55 -07002150static INLINE void idct64_low32_neon(int16x8_t *in, int16x8_t *out,
Scott LaVarnwayed25b612022-02-17 13:28:23 -05002151 int8_t cos_bit) {
sachin garg56f10202018-09-24 14:05:25 +00002152 const int32_t *cospi = cospi_arr(cos_bit);
2153 int16x8_t step2[64], step1[64];
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302154 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
2155 (int16_t)cospi[36], (int16_t)cospi[28]);
2156 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
2157 (int16_t)cospi[52], (int16_t)cospi[12]);
2158 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
2159 (int16_t)cospi[40], (int16_t)cospi[24]);
2160 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
2161 (int16_t)cospi[16], (int16_t)cospi[48]);
2162 const int16x4_t c4 =
2163 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
2164 (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
2165 const int16x4_t c5 =
2166 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
2167 (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
2168 const int16x4_t c6 =
2169 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
2170 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
2171 const int16x4_t c7 =
2172 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
2173 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
sachin garg56f10202018-09-24 14:05:25 +00002174
2175 // stage 1
2176 // stage 2
2177
2178 step2[0] = in[0];
2179 step2[2] = in[16];
2180 step2[4] = in[8];
2181 step2[6] = in[24];
2182 step2[8] = in[4];
2183 step2[10] = in[20];
2184 step2[12] = in[12];
2185 step2[14] = in[28];
2186 step2[16] = in[2];
2187 step2[18] = in[18];
2188 step2[20] = in[10];
2189 step2[22] = in[26];
2190 step2[24] = in[6];
2191 step2[26] = in[22];
2192 step2[28] = in[14];
2193 step2[30] = in[30];
2194
2195 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
2196 btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
2197 btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
2198 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
2199 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
2200 btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
2201 btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
2202 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
2203 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
2204 btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
2205 btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
2206 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
2207 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
2208 btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
2209 btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
2210 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
2211
2212 // stage 3
2213
2214 step1[0] = step2[0];
2215 step1[2] = step2[2];
2216 step1[4] = step2[4];
2217 step1[6] = step2[6];
2218 step1[8] = step2[8];
2219 step1[10] = step2[10];
2220 step1[12] = step2[12];
2221 step1[14] = step2[14];
2222
2223 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
2224 btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
2225 btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
2226 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
2227 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
2228 btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
2229 btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
2230 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
2231
2232 step1[32] = vqaddq_s16(step2[32], step2[33]);
2233 step1[33] = vqsubq_s16(step2[32], step2[33]);
2234 step1[34] = vqsubq_s16(step2[35], step2[34]);
2235 step1[35] = vqaddq_s16(step2[35], step2[34]);
2236 step1[36] = vqaddq_s16(step2[36], step2[37]);
2237 step1[37] = vqsubq_s16(step2[36], step2[37]);
2238 step1[38] = vqsubq_s16(step2[39], step2[38]);
2239 step1[39] = vqaddq_s16(step2[39], step2[38]);
2240 step1[40] = vqaddq_s16(step2[40], step2[41]);
2241 step1[41] = vqsubq_s16(step2[40], step2[41]);
2242 step1[42] = vqsubq_s16(step2[43], step2[42]);
2243 step1[43] = vqaddq_s16(step2[43], step2[42]);
2244 step1[44] = vqaddq_s16(step2[44], step2[45]);
2245 step1[45] = vqsubq_s16(step2[44], step2[45]);
2246 step1[46] = vqsubq_s16(step2[47], step2[46]);
2247 step1[47] = vqaddq_s16(step2[47], step2[46]);
2248 step1[48] = vqaddq_s16(step2[48], step2[49]);
2249 step1[49] = vqsubq_s16(step2[48], step2[49]);
2250 step1[50] = vqsubq_s16(step2[51], step2[50]);
2251 step1[51] = vqaddq_s16(step2[51], step2[50]);
2252 step1[52] = vqaddq_s16(step2[52], step2[53]);
2253 step1[53] = vqsubq_s16(step2[52], step2[53]);
2254 step1[54] = vqsubq_s16(step2[55], step2[54]);
2255 step1[55] = vqaddq_s16(step2[55], step2[54]);
2256 step1[56] = vqaddq_s16(step2[56], step2[57]);
2257 step1[57] = vqsubq_s16(step2[56], step2[57]);
2258 step1[58] = vqsubq_s16(step2[59], step2[58]);
2259 step1[59] = vqaddq_s16(step2[59], step2[58]);
2260 step1[60] = vqaddq_s16(step2[60], step2[61]);
2261 step1[61] = vqsubq_s16(step2[60], step2[61]);
2262 step1[62] = vqsubq_s16(step2[63], step2[62]);
2263 step1[63] = vqaddq_s16(step2[63], step2[62]);
2264
2265 // stage 4
2266
2267 step2[0] = step1[0];
2268 step2[2] = step1[2];
2269 step2[4] = step1[4];
2270 step2[6] = step1[6];
2271
2272 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
2273 btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
2274 btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
2275 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
2276 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302277 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
sachin garg56f10202018-09-24 14:05:25 +00002278 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302279 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
sachin garg56f10202018-09-24 14:05:25 +00002280 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302281 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
sachin garg56f10202018-09-24 14:05:25 +00002282 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302283 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
sachin garg56f10202018-09-24 14:05:25 +00002284
2285 step2[16] = vqaddq_s16(step1[16], step1[17]);
2286 step2[17] = vqsubq_s16(step1[16], step1[17]);
2287 step2[18] = vqsubq_s16(step1[19], step1[18]);
2288 step2[19] = vqaddq_s16(step1[19], step1[18]);
2289 step2[20] = vqaddq_s16(step1[20], step1[21]);
2290 step2[21] = vqsubq_s16(step1[20], step1[21]);
2291 step2[22] = vqsubq_s16(step1[23], step1[22]);
2292 step2[23] = vqaddq_s16(step1[23], step1[22]);
2293 step2[24] = vqaddq_s16(step1[24], step1[25]);
2294 step2[25] = vqsubq_s16(step1[24], step1[25]);
2295 step2[26] = vqsubq_s16(step1[27], step1[26]);
2296 step2[27] = vqaddq_s16(step1[27], step1[26]);
2297 step2[28] = vqaddq_s16(step1[28], step1[29]);
2298 step2[29] = vqsubq_s16(step1[28], step1[29]);
2299 step2[30] = vqsubq_s16(step1[31], step1[30]);
2300 step2[31] = vqaddq_s16(step1[31], step1[30]);
2301 step2[32] = step1[32];
2302 step2[35] = step1[35];
2303 step2[36] = step1[36];
2304 step2[39] = step1[39];
2305 step2[40] = step1[40];
2306 step2[43] = step1[43];
2307 step2[44] = step1[44];
2308 step2[47] = step1[47];
2309 step2[48] = step1[48];
2310 step2[51] = step1[51];
2311 step2[52] = step1[52];
2312 step2[55] = step1[55];
2313 step2[56] = step1[56];
2314 step2[59] = step1[59];
2315 step2[60] = step1[60];
2316 step2[63] = step1[63];
2317
2318 // stage 5
2319
2320 step1[0] = step2[0];
2321 step1[2] = step2[2];
2322
2323 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
2324 btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
2325 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302326 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
sachin garg56f10202018-09-24 14:05:25 +00002327 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302328 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
sachin garg56f10202018-09-24 14:05:25 +00002329
2330 step1[8] = vqaddq_s16(step2[8], step2[9]);
2331 step1[9] = vqsubq_s16(step2[8], step2[9]);
2332 step1[10] = vqsubq_s16(step2[11], step2[10]);
2333 step1[11] = vqaddq_s16(step2[11], step2[10]);
2334 step1[12] = vqaddq_s16(step2[12], step2[13]);
2335 step1[13] = vqsubq_s16(step2[12], step2[13]);
2336 step1[14] = vqsubq_s16(step2[15], step2[14]);
2337 step1[15] = vqaddq_s16(step2[15], step2[14]);
2338 step1[16] = step2[16];
2339 step1[19] = step2[19];
2340 step1[20] = step2[20];
2341 step1[23] = step2[23];
2342 step1[24] = step2[24];
2343 step1[27] = step2[27];
2344 step1[28] = step2[28];
2345 step1[31] = step2[31];
2346 step1[32] = vqaddq_s16(step2[32], step2[35]);
2347 step1[33] = vqaddq_s16(step2[33], step2[34]);
2348 step1[34] = vqsubq_s16(step2[33], step2[34]);
2349 step1[35] = vqsubq_s16(step2[32], step2[35]);
2350 step1[36] = vqsubq_s16(step2[39], step2[36]);
2351 step1[37] = vqsubq_s16(step2[38], step2[37]);
2352 step1[38] = vqaddq_s16(step2[38], step2[37]);
2353 step1[39] = vqaddq_s16(step2[39], step2[36]);
2354 step1[40] = vqaddq_s16(step2[40], step2[43]);
2355 step1[41] = vqaddq_s16(step2[41], step2[42]);
2356 step1[42] = vqsubq_s16(step2[41], step2[42]);
2357 step1[43] = vqsubq_s16(step2[40], step2[43]);
2358 step1[44] = vqsubq_s16(step2[47], step2[44]);
2359 step1[45] = vqsubq_s16(step2[46], step2[45]);
2360 step1[46] = vqaddq_s16(step2[46], step2[45]);
2361 step1[47] = vqaddq_s16(step2[47], step2[44]);
2362 step1[48] = vqaddq_s16(step2[48], step2[51]);
2363 step1[49] = vqaddq_s16(step2[49], step2[50]);
2364 step1[50] = vqsubq_s16(step2[49], step2[50]);
2365 step1[51] = vqsubq_s16(step2[48], step2[51]);
2366 step1[52] = vqsubq_s16(step2[55], step2[52]);
2367 step1[53] = vqsubq_s16(step2[54], step2[53]);
2368 step1[54] = vqaddq_s16(step2[54], step2[53]);
2369 step1[55] = vqaddq_s16(step2[55], step2[52]);
2370 step1[56] = vqaddq_s16(step2[56], step2[59]);
2371 step1[57] = vqaddq_s16(step2[57], step2[58]);
2372 step1[58] = vqsubq_s16(step2[57], step2[58]);
2373 step1[59] = vqsubq_s16(step2[56], step2[59]);
2374 step1[60] = vqsubq_s16(step2[63], step2[60]);
2375 step1[61] = vqsubq_s16(step2[62], step2[61]);
2376 step1[62] = vqaddq_s16(step2[62], step2[61]);
2377 step1[63] = vqaddq_s16(step2[63], step2[60]);
2378
2379 // stage 6
2380
2381 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
2382 btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
2383 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302384 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
sachin garg56f10202018-09-24 14:05:25 +00002385 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
2386 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302387 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
2388 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
sachin garg56f10202018-09-24 14:05:25 +00002389 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
2390 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302391 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
2392 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
sachin garg56f10202018-09-24 14:05:25 +00002393
2394 step2[4] = vqaddq_s16(step1[4], step1[5]);
2395 step2[5] = vqsubq_s16(step1[4], step1[5]);
2396 step2[6] = vqsubq_s16(step1[7], step1[6]);
2397 step2[7] = vqaddq_s16(step1[7], step1[6]);
2398 step2[8] = step1[8];
2399 step2[11] = step1[11];
2400 step2[12] = step1[12];
2401 step2[15] = step1[15];
2402 step2[16] = vqaddq_s16(step1[16], step1[19]);
2403 step2[17] = vqaddq_s16(step1[17], step1[18]);
2404 step2[18] = vqsubq_s16(step1[17], step1[18]);
2405 step2[19] = vqsubq_s16(step1[16], step1[19]);
2406 step2[20] = vqsubq_s16(step1[23], step1[20]);
2407 step2[21] = vqsubq_s16(step1[22], step1[21]);
2408 step2[22] = vqaddq_s16(step1[22], step1[21]);
2409 step2[23] = vqaddq_s16(step1[23], step1[20]);
2410 step2[24] = vqaddq_s16(step1[24], step1[27]);
2411 step2[25] = vqaddq_s16(step1[25], step1[26]);
2412 step2[26] = vqsubq_s16(step1[25], step1[26]);
2413 step2[27] = vqsubq_s16(step1[24], step1[27]);
2414 step2[28] = vqsubq_s16(step1[31], step1[28]);
2415 step2[29] = vqsubq_s16(step1[30], step1[29]);
2416 step2[30] = vqaddq_s16(step1[30], step1[29]);
2417 step2[31] = vqaddq_s16(step1[31], step1[28]);
2418 step2[32] = step1[32];
2419 step2[33] = step1[33];
2420 step2[38] = step1[38];
2421 step2[39] = step1[39];
2422 step2[40] = step1[40];
2423 step2[41] = step1[41];
2424 step2[46] = step1[46];
2425 step2[47] = step1[47];
2426 step2[48] = step1[48];
2427 step2[49] = step1[49];
2428 step2[54] = step1[54];
2429 step2[55] = step1[55];
2430 step2[56] = step1[56];
2431 step2[57] = step1[57];
2432 step2[62] = step1[62];
2433 step2[63] = step1[63];
2434
2435 // stage 7
2436
2437 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
2438 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
2439 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302440 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
2441 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
sachin garg56f10202018-09-24 14:05:25 +00002442
2443 step1[0] = vqaddq_s16(step2[0], step2[3]);
2444 step1[1] = vqaddq_s16(step2[1], step2[2]);
2445 step1[2] = vqsubq_s16(step2[1], step2[2]);
2446 step1[3] = vqsubq_s16(step2[0], step2[3]);
2447 step1[4] = step2[4];
2448 step1[7] = step2[7];
2449 step1[8] = vqaddq_s16(step2[8], step2[11]);
2450 step1[9] = vqaddq_s16(step2[9], step2[10]);
2451 step1[10] = vqsubq_s16(step2[9], step2[10]);
2452 step1[11] = vqsubq_s16(step2[8], step2[11]);
2453 step1[12] = vqsubq_s16(step2[15], step2[12]);
2454 step1[13] = vqsubq_s16(step2[14], step2[13]);
2455 step1[14] = vqaddq_s16(step2[14], step2[13]);
2456 step1[15] = vqaddq_s16(step2[15], step2[12]);
2457 step1[16] = step2[16];
2458 step1[17] = step2[17];
2459 step1[22] = step2[22];
2460 step1[23] = step2[23];
2461 step1[24] = step2[24];
2462 step1[25] = step2[25];
2463 step1[30] = step2[30];
2464 step1[31] = step2[31];
2465 step1[32] = vqaddq_s16(step2[32], step2[39]);
2466 step1[33] = vqaddq_s16(step2[33], step2[38]);
2467 step1[34] = vqaddq_s16(step2[34], step2[37]);
2468 step1[35] = vqaddq_s16(step2[35], step2[36]);
2469 step1[36] = vqsubq_s16(step2[35], step2[36]);
2470 step1[37] = vqsubq_s16(step2[34], step2[37]);
2471 step1[38] = vqsubq_s16(step2[33], step2[38]);
2472 step1[39] = vqsubq_s16(step2[32], step2[39]);
2473 step1[40] = vqsubq_s16(step2[47], step2[40]);
2474 step1[41] = vqsubq_s16(step2[46], step2[41]);
2475 step1[42] = vqsubq_s16(step2[45], step2[42]);
2476 step1[43] = vqsubq_s16(step2[44], step2[43]);
2477 step1[44] = vqaddq_s16(step2[43], step2[44]);
2478 step1[45] = vqaddq_s16(step2[42], step2[45]);
2479 step1[46] = vqaddq_s16(step2[41], step2[46]);
2480 step1[47] = vqaddq_s16(step2[40], step2[47]);
2481 step1[48] = vqaddq_s16(step2[48], step2[55]);
2482 step1[49] = vqaddq_s16(step2[49], step2[54]);
2483 step1[50] = vqaddq_s16(step2[50], step2[53]);
2484 step1[51] = vqaddq_s16(step2[51], step2[52]);
2485 step1[52] = vqsubq_s16(step2[51], step2[52]);
2486 step1[53] = vqsubq_s16(step2[50], step2[53]);
2487 step1[54] = vqsubq_s16(step2[49], step2[54]);
2488 step1[55] = vqsubq_s16(step2[48], step2[55]);
2489 step1[56] = vqsubq_s16(step2[63], step2[56]);
2490 step1[57] = vqsubq_s16(step2[62], step2[57]);
2491 step1[58] = vqsubq_s16(step2[61], step2[58]);
2492 step1[59] = vqsubq_s16(step2[60], step2[59]);
2493 step1[60] = vqaddq_s16(step2[59], step2[60]);
2494 step1[61] = vqaddq_s16(step2[58], step2[61]);
2495 step1[62] = vqaddq_s16(step2[57], step2[62]);
2496 step1[63] = vqaddq_s16(step2[56], step2[63]);
2497
2498 // stage 8
2499
2500 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
2501 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
2502 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
2503 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
2504 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
2505 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05302506 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
2507 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
2508 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
2509 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
sachin garg56f10202018-09-24 14:05:25 +00002510
2511 step2[0] = vqaddq_s16(step1[0], step1[7]);
2512 step2[1] = vqaddq_s16(step1[1], step1[6]);
2513 step2[2] = vqaddq_s16(step1[2], step1[5]);
2514 step2[3] = vqaddq_s16(step1[3], step1[4]);
2515 step2[4] = vqsubq_s16(step1[3], step1[4]);
2516 step2[5] = vqsubq_s16(step1[2], step1[5]);
2517 step2[6] = vqsubq_s16(step1[1], step1[6]);
2518 step2[7] = vqsubq_s16(step1[0], step1[7]);
2519 step2[8] = step1[8];
2520 step2[9] = step1[9];
2521 step2[14] = step1[14];
2522 step2[15] = step1[15];
2523 step2[16] = vqaddq_s16(step1[16], step1[23]);
2524 step2[17] = vqaddq_s16(step1[17], step1[22]);
2525 step2[18] = vqaddq_s16(step1[18], step1[21]);
2526 step2[19] = vqaddq_s16(step1[19], step1[20]);
2527 step2[20] = vqsubq_s16(step1[19], step1[20]);
2528 step2[21] = vqsubq_s16(step1[18], step1[21]);
2529 step2[22] = vqsubq_s16(step1[17], step1[22]);
2530 step2[23] = vqsubq_s16(step1[16], step1[23]);
2531 step2[24] = vqsubq_s16(step1[31], step1[24]);
2532 step2[25] = vqsubq_s16(step1[30], step1[25]);
2533 step2[26] = vqsubq_s16(step1[29], step1[26]);
2534 step2[27] = vqsubq_s16(step1[28], step1[27]);
2535 step2[28] = vqaddq_s16(step1[28], step1[27]);
2536 step2[29] = vqaddq_s16(step1[29], step1[26]);
2537 step2[30] = vqaddq_s16(step1[30], step1[25]);
2538 step2[31] = vqaddq_s16(step1[31], step1[24]);
2539 step2[32] = step1[32];
2540 step2[33] = step1[33];
2541 step2[34] = step1[34];
2542 step2[35] = step1[35];
2543 step2[44] = step1[44];
2544 step2[45] = step1[45];
2545 step2[46] = step1[46];
2546 step2[47] = step1[47];
2547 step2[48] = step1[48];
2548 step2[49] = step1[49];
2549 step2[50] = step1[50];
2550 step2[51] = step1[51];
2551 step2[60] = step1[60];
2552 step2[61] = step1[61];
2553 step2[62] = step1[62];
2554 step2[63] = step1[63];
2555
2556 // stage 9
2557 idct64_stage9_neon(step2, step1, cos_bit);
2558
2559 // stage 10
2560 idct64_stage10_neon(step1, step2, cos_bit);
2561
2562 // stage 11
2563
2564 out[0] = vqaddq_s16(step2[0], step2[63]);
2565 out[1] = vqaddq_s16(step2[1], step2[62]);
2566 out[2] = vqaddq_s16(step2[2], step2[61]);
2567 out[3] = vqaddq_s16(step2[3], step2[60]);
2568 out[4] = vqaddq_s16(step2[4], step2[59]);
2569 out[5] = vqaddq_s16(step2[5], step2[58]);
2570 out[6] = vqaddq_s16(step2[6], step2[57]);
2571 out[7] = vqaddq_s16(step2[7], step2[56]);
2572 out[8] = vqaddq_s16(step2[8], step2[55]);
2573 out[9] = vqaddq_s16(step2[9], step2[54]);
2574 out[10] = vqaddq_s16(step2[10], step2[53]);
2575 out[11] = vqaddq_s16(step2[11], step2[52]);
2576 out[12] = vqaddq_s16(step2[12], step2[51]);
2577 out[13] = vqaddq_s16(step2[13], step2[50]);
2578 out[14] = vqaddq_s16(step2[14], step2[49]);
2579 out[15] = vqaddq_s16(step2[15], step2[48]);
2580 out[16] = vqaddq_s16(step2[16], step2[47]);
2581 out[17] = vqaddq_s16(step2[17], step2[46]);
2582 out[18] = vqaddq_s16(step2[18], step2[45]);
2583 out[19] = vqaddq_s16(step2[19], step2[44]);
2584 out[20] = vqaddq_s16(step2[20], step2[43]);
2585 out[21] = vqaddq_s16(step2[21], step2[42]);
2586 out[22] = vqaddq_s16(step2[22], step2[41]);
2587 out[23] = vqaddq_s16(step2[23], step2[40]);
2588 out[24] = vqaddq_s16(step2[24], step2[39]);
2589 out[25] = vqaddq_s16(step2[25], step2[38]);
2590 out[26] = vqaddq_s16(step2[26], step2[37]);
2591 out[27] = vqaddq_s16(step2[27], step2[36]);
2592 out[28] = vqaddq_s16(step2[28], step2[35]);
2593 out[29] = vqaddq_s16(step2[29], step2[34]);
2594 out[30] = vqaddq_s16(step2[30], step2[33]);
2595 out[31] = vqaddq_s16(step2[31], step2[32]);
2596 out[32] = vqsubq_s16(step2[31], step2[32]);
2597 out[33] = vqsubq_s16(step2[30], step2[33]);
2598 out[34] = vqsubq_s16(step2[29], step2[34]);
2599 out[35] = vqsubq_s16(step2[28], step2[35]);
2600 out[36] = vqsubq_s16(step2[27], step2[36]);
2601 out[37] = vqsubq_s16(step2[26], step2[37]);
2602 out[38] = vqsubq_s16(step2[25], step2[38]);
2603 out[39] = vqsubq_s16(step2[24], step2[39]);
2604 out[40] = vqsubq_s16(step2[23], step2[40]);
2605 out[41] = vqsubq_s16(step2[22], step2[41]);
2606 out[42] = vqsubq_s16(step2[21], step2[42]);
2607 out[43] = vqsubq_s16(step2[20], step2[43]);
2608 out[44] = vqsubq_s16(step2[19], step2[44]);
2609 out[45] = vqsubq_s16(step2[18], step2[45]);
2610 out[46] = vqsubq_s16(step2[17], step2[46]);
2611 out[47] = vqsubq_s16(step2[16], step2[47]);
2612 out[48] = vqsubq_s16(step2[15], step2[48]);
2613 out[49] = vqsubq_s16(step2[14], step2[49]);
2614 out[50] = vqsubq_s16(step2[13], step2[50]);
2615 out[51] = vqsubq_s16(step2[12], step2[51]);
2616 out[52] = vqsubq_s16(step2[11], step2[52]);
2617 out[53] = vqsubq_s16(step2[10], step2[53]);
2618 out[54] = vqsubq_s16(step2[9], step2[54]);
2619 out[55] = vqsubq_s16(step2[8], step2[55]);
2620 out[56] = vqsubq_s16(step2[7], step2[56]);
2621 out[57] = vqsubq_s16(step2[6], step2[57]);
2622 out[58] = vqsubq_s16(step2[5], step2[58]);
2623 out[59] = vqsubq_s16(step2[4], step2[59]);
2624 out[60] = vqsubq_s16(step2[3], step2[60]);
2625 out[61] = vqsubq_s16(step2[2], step2[61]);
2626 out[62] = vqsubq_s16(step2[1], step2[62]);
2627 out[63] = vqsubq_s16(step2[0], step2[63]);
2628}
2629
static INLINE void idct64_low1_neon(int16x8_t *input, int16x8_t *out,
                                    int8_t cos_bit) {
  // Fast path for the 64-point inverse DCT when only the DC coefficient
  // (input[0]) is non-zero.  In that case stages 1-5 and 7-11 of the full
  // butterfly network are identities/duplications, and the whole transform
  // collapses to the single stage-6 scaling by cospi[32], with the result
  // broadcast to all 64 output rows.
  const int32_t *cospi = cospi_arr(cos_bit);

  // Multiply the DC row by cospi[32] in 32-bit precision, then round-shift
  // back down to 16 bits (INV_COS_BIT fractional bits).
  const int32x4_t dc_lo = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
  const int32x4_t dc_hi = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
  const int16x8_t dc = vcombine_s16(vrshrn_n_s32(dc_lo, INV_COS_BIT),
                                    vrshrn_n_s32(dc_hi, INV_COS_BIT));

  // Stages 7-11 reduce to copying the scaled DC value into every output.
  for (int i = 0; i < 64; ++i) {
    out[i] = dc;
  }
}
2718
// 64-point inverse DCT specialized for the case where only the first 8
// coefficients of `in` (in[0]..in[7]) may be non-zero.  Stages of the full
// idct64 butterfly whose inputs are known-zero are folded away: butterflies
// with one zero operand degenerate into plain copies/duplications, which is
// why several stages below are just slot-to-slot assignments instead of
// vqaddq/vqsubq pairs.  Writes all 64 rows of `out`.
//
// NOTE(review): the btf_16_neon / btf_16_lane_*_neon helpers are assumed to
// compute the usual 2x2 rotation (w0*a +/- w1*b with rounding by cos_bit),
// with the lane variants reading their two weights from lanes of the given
// int16x4_t constant — confirm against their definitions earlier in this
// file.
static INLINE void idct64_low8_neon(int16x8_t *in, int16x8_t *out,
                                    int8_t cos_bit) {
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step2[64], step1[64];

  // Pre-packed cosine weight vectors; each int16x4_t carries the lane pairs
  // consumed by the btf_16_lane_*_neon calls below.  c4..c6 hold negated
  // copies used by the mirrored (flipped-sign) butterflies.
  const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
                                      (int16_t)cospi[36], (int16_t)cospi[28]);
  const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
                                      (int16_t)cospi[52], (int16_t)cospi[12]);
  const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
                                      (int16_t)cospi[40], (int16_t)cospi[24]);
  const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
                                      (int16_t)cospi[16], (int16_t)cospi[48]);
  const int16x4_t c4 =
      set_s16x4_neon((int16_t)(-cospi[36]), (int16_t)(-cospi[28]),
                     (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
  const int16x4_t c5 =
      set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
                     (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
  const int16x4_t c6 =
      set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
                     (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));

  // stage 1
  // stage 2
  // (stage 1 is a pure permutation of the inputs; with only 8 live
  // coefficients the surviving slots are picked up directly from `in`.)

  step2[0] = in[0];
  step2[8] = in[4];
  step2[16] = in[2];
  step2[24] = in[6];

  // Odd-indexed inputs seed the 32..63 half via single rotations (the
  // partner input of each butterfly is zero).
  btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
  btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
  btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
  btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);

  // stage 3

  step1[0] = step2[0];
  step1[8] = step2[8];

  btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
  btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);

  // Zero partners: add/sub butterflies degenerate into duplications.
  step1[32] = step2[32];
  step1[33] = step2[32];
  step1[38] = step2[39];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[40];
  step1[46] = step2[47];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[48];
  step1[54] = step2[55];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[56];
  step1[62] = step2[63];
  step1[63] = step2[63];

  // stage 4

  step2[0] = step1[0];

  btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
  btf_16_lane_1_0_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
  btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
  btf_16_lane_3_2_neon(step1[46], step1[49], c4, &step2[46], &step2[49]);

  step2[16] = step1[16];
  step2[17] = step1[16];
  step2[22] = step1[23];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[24];
  step2[30] = step1[31];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[63] = step1[63];

  // stage 5

  step1[0] = step2[0];

  btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
  btf_16_lane_3_2_neon(step2[22], step2[25], c5, &step1[22], &step1[25]);

  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];

  step1[16] = step2[16];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[31] = step2[31];
  step1[32] = step2[32];
  step1[33] = step2[33];
  step1[34] = step2[33];
  step1[35] = step2[32];
  step1[36] = step2[39];
  step1[37] = step2[38];
  step1[38] = step2[38];
  step1[39] = step2[39];
  step1[40] = step2[40];
  step1[41] = step2[41];
  step1[42] = step2[41];
  step1[43] = step2[40];
  step1[44] = step2[47];
  step1[45] = step2[46];
  step1[46] = step2[46];
  step1[47] = step2[47];
  step1[48] = step2[48];
  step1[49] = step2[49];
  step1[50] = step2[49];
  step1[51] = step2[48];
  step1[52] = step2[55];
  step1[53] = step2[54];
  step1[54] = step2[54];
  step1[55] = step2[55];
  step1[56] = step2[56];
  step1[57] = step2[57];
  step1[58] = step2[57];
  step1[59] = step2[56];
  step1[60] = step2[63];
  step1[61] = step2[62];
  step1[62] = step2[62];
  step1[63] = step2[63];

  // stage 6 — first stage where both butterfly operands can be non-zero in
  // the 16..63 range, so real rotations resume.

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
  btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
  btf_16_lane_1_0_neon(step1[36], step1[59], c5, &step2[36], &step2[59]);
  btf_16_lane_1_0_neon(step1[37], step1[58], c5, &step2[37], &step2[58]);
  btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
  btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
  btf_16_lane_3_2_neon(step1[44], step1[51], c5, &step2[44], &step2[51]);
  btf_16_lane_3_2_neon(step1[45], step1[50], c5, &step2[45], &step2[50]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[17];
  step2[19] = step1[16];
  step2[20] = step1[23];
  step2[21] = step1[22];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[26] = step1[25];
  step2[27] = step1[24];
  step2[28] = step1[31];
  step2[29] = step1[30];
  step2[30] = step1[30];
  step2[31] = step1[31];
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[38] = step1[38];
  step2[39] = step1[39];
  step2[40] = step1[40];
  step2[41] = step1[41];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[54] = step1[54];
  step2[55] = step1[55];
  step2[56] = step1[56];
  step2[57] = step1[57];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 7

  btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
  btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
  btf_16_lane_3_2_neon(step2[20], step2[27], c6, &step1[20], &step1[27]);
  btf_16_lane_3_2_neon(step2[21], step2[26], c6, &step1[21], &step1[26]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[1];
  step1[3] = step2[0];
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[10] = step2[9];
  step1[11] = step2[8];
  step1[12] = step2[15];
  step1[13] = step2[14];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];
  // From here on the 32..63 half is fully populated; saturating add/sub
  // butterflies (vqaddq/vqsubq) run exactly as in the full idct64.
  step1[32] = vqaddq_s16(step2[32], step2[39]);
  step1[33] = vqaddq_s16(step2[33], step2[38]);
  step1[34] = vqaddq_s16(step2[34], step2[37]);
  step1[35] = vqaddq_s16(step2[35], step2[36]);
  step1[36] = vqsubq_s16(step2[35], step2[36]);
  step1[37] = vqsubq_s16(step2[34], step2[37]);
  step1[38] = vqsubq_s16(step2[33], step2[38]);
  step1[39] = vqsubq_s16(step2[32], step2[39]);
  step1[40] = vqsubq_s16(step2[47], step2[40]);
  step1[41] = vqsubq_s16(step2[46], step2[41]);
  step1[42] = vqsubq_s16(step2[45], step2[42]);
  step1[43] = vqsubq_s16(step2[44], step2[43]);
  step1[44] = vqaddq_s16(step2[43], step2[44]);
  step1[45] = vqaddq_s16(step2[42], step2[45]);
  step1[46] = vqaddq_s16(step2[41], step2[46]);
  step1[47] = vqaddq_s16(step2[40], step2[47]);
  step1[48] = vqaddq_s16(step2[48], step2[55]);
  step1[49] = vqaddq_s16(step2[49], step2[54]);
  step1[50] = vqaddq_s16(step2[50], step2[53]);
  step1[51] = vqaddq_s16(step2[51], step2[52]);
  step1[52] = vqsubq_s16(step2[51], step2[52]);
  step1[53] = vqsubq_s16(step2[50], step2[53]);
  step1[54] = vqsubq_s16(step2[49], step2[54]);
  step1[55] = vqsubq_s16(step2[48], step2[55]);
  step1[56] = vqsubq_s16(step2[63], step2[56]);
  step1[57] = vqsubq_s16(step2[62], step2[57]);
  step1[58] = vqsubq_s16(step2[61], step2[58]);
  step1[59] = vqsubq_s16(step2[60], step2[59]);
  step1[60] = vqaddq_s16(step2[59], step2[60]);
  step1[61] = vqaddq_s16(step2[58], step2[61]);
  step1[62] = vqaddq_s16(step2[57], step2[62]);
  step1[63] = vqaddq_s16(step2[56], step2[63]);

  // stage 8

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
  btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
  btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
  btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
  btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
  btf_16_lane_3_2_neon(step1[40], step1[55], c6, &step2[40], &step2[55]);
  btf_16_lane_3_2_neon(step1[41], step1[54], c6, &step2[41], &step2[54]);
  btf_16_lane_3_2_neon(step1[42], step1[53], c6, &step2[42], &step2[53]);
  btf_16_lane_3_2_neon(step1[43], step1[52], c6, &step2[43], &step2[52]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[3];
  step2[5] = step1[2];
  step2[6] = step1[1];
  step2[7] = step1[0];
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];
  step2[16] = vqaddq_s16(step1[16], step1[23]);
  step2[17] = vqaddq_s16(step1[17], step1[22]);
  step2[18] = vqaddq_s16(step1[18], step1[21]);
  step2[19] = vqaddq_s16(step1[19], step1[20]);
  step2[20] = vqsubq_s16(step1[19], step1[20]);
  step2[21] = vqsubq_s16(step1[18], step1[21]);
  step2[22] = vqsubq_s16(step1[17], step1[22]);
  step2[23] = vqsubq_s16(step1[16], step1[23]);
  step2[24] = vqsubq_s16(step1[31], step1[24]);
  step2[25] = vqsubq_s16(step1[30], step1[25]);
  step2[26] = vqsubq_s16(step1[29], step1[26]);
  step2[27] = vqsubq_s16(step1[28], step1[27]);
  step2[28] = vqaddq_s16(step1[28], step1[27]);
  step2[29] = vqaddq_s16(step1[29], step1[26]);
  step2[30] = vqaddq_s16(step1[30], step1[25]);
  step2[31] = vqaddq_s16(step1[31], step1[24]);
  step2[32] = step1[32];
  step2[33] = step1[33];
  step2[34] = step1[34];
  step2[35] = step1[35];
  step2[44] = step1[44];
  step2[45] = step1[45];
  step2[46] = step1[46];
  step2[47] = step1[47];
  step2[48] = step1[48];
  step2[49] = step1[49];
  step2[50] = step1[50];
  step2[51] = step1[51];
  step2[60] = step1[60];
  step2[61] = step1[61];
  step2[62] = step1[62];
  step2[63] = step1[63];

  // stage 9 — shared with the full idct64 path (all 64 slots live by now).
  idct64_stage9_neon(step2, step1, cos_bit);

  // stage 10
  idct64_stage10_neon(step1, step2, cos_bit);

  // stage 11 — final mirrored add/sub producing the 64 output rows.

  out[0] = vqaddq_s16(step2[0], step2[63]);
  out[1] = vqaddq_s16(step2[1], step2[62]);
  out[2] = vqaddq_s16(step2[2], step2[61]);
  out[3] = vqaddq_s16(step2[3], step2[60]);
  out[4] = vqaddq_s16(step2[4], step2[59]);
  out[5] = vqaddq_s16(step2[5], step2[58]);
  out[6] = vqaddq_s16(step2[6], step2[57]);
  out[7] = vqaddq_s16(step2[7], step2[56]);
  out[8] = vqaddq_s16(step2[8], step2[55]);
  out[9] = vqaddq_s16(step2[9], step2[54]);
  out[10] = vqaddq_s16(step2[10], step2[53]);
  out[11] = vqaddq_s16(step2[11], step2[52]);
  out[12] = vqaddq_s16(step2[12], step2[51]);
  out[13] = vqaddq_s16(step2[13], step2[50]);
  out[14] = vqaddq_s16(step2[14], step2[49]);
  out[15] = vqaddq_s16(step2[15], step2[48]);
  out[16] = vqaddq_s16(step2[16], step2[47]);
  out[17] = vqaddq_s16(step2[17], step2[46]);
  out[18] = vqaddq_s16(step2[18], step2[45]);
  out[19] = vqaddq_s16(step2[19], step2[44]);
  out[20] = vqaddq_s16(step2[20], step2[43]);
  out[21] = vqaddq_s16(step2[21], step2[42]);
  out[22] = vqaddq_s16(step2[22], step2[41]);
  out[23] = vqaddq_s16(step2[23], step2[40]);
  out[24] = vqaddq_s16(step2[24], step2[39]);
  out[25] = vqaddq_s16(step2[25], step2[38]);
  out[26] = vqaddq_s16(step2[26], step2[37]);
  out[27] = vqaddq_s16(step2[27], step2[36]);
  out[28] = vqaddq_s16(step2[28], step2[35]);
  out[29] = vqaddq_s16(step2[29], step2[34]);
  out[30] = vqaddq_s16(step2[30], step2[33]);
  out[31] = vqaddq_s16(step2[31], step2[32]);
  out[32] = vqsubq_s16(step2[31], step2[32]);
  out[33] = vqsubq_s16(step2[30], step2[33]);
  out[34] = vqsubq_s16(step2[29], step2[34]);
  out[35] = vqsubq_s16(step2[28], step2[35]);
  out[36] = vqsubq_s16(step2[27], step2[36]);
  out[37] = vqsubq_s16(step2[26], step2[37]);
  out[38] = vqsubq_s16(step2[25], step2[38]);
  out[39] = vqsubq_s16(step2[24], step2[39]);
  out[40] = vqsubq_s16(step2[23], step2[40]);
  out[41] = vqsubq_s16(step2[22], step2[41]);
  out[42] = vqsubq_s16(step2[21], step2[42]);
  out[43] = vqsubq_s16(step2[20], step2[43]);
  out[44] = vqsubq_s16(step2[19], step2[44]);
  out[45] = vqsubq_s16(step2[18], step2[45]);
  out[46] = vqsubq_s16(step2[17], step2[46]);
  out[47] = vqsubq_s16(step2[16], step2[47]);
  out[48] = vqsubq_s16(step2[15], step2[48]);
  out[49] = vqsubq_s16(step2[14], step2[49]);
  out[50] = vqsubq_s16(step2[13], step2[50]);
  out[51] = vqsubq_s16(step2[12], step2[51]);
  out[52] = vqsubq_s16(step2[11], step2[52]);
  out[53] = vqsubq_s16(step2[10], step2[53]);
  out[54] = vqsubq_s16(step2[9], step2[54]);
  out[55] = vqsubq_s16(step2[8], step2[55]);
  out[56] = vqsubq_s16(step2[7], step2[56]);
  out[57] = vqsubq_s16(step2[6], step2[57]);
  out[58] = vqsubq_s16(step2[5], step2[58]);
  out[59] = vqsubq_s16(step2[4], step2[59]);
  out[60] = vqsubq_s16(step2[3], step2[60]);
  out[61] = vqsubq_s16(step2[2], step2[61]);
  out[62] = vqsubq_s16(step2[1], step2[62]);
  out[63] = vqsubq_s16(step2[0], step2[63]);
}
3095
Yaowu Xueb5e4e22020-04-06 14:17:55 -07003096static INLINE void idct64_low16_neon(int16x8_t *in, int16x8_t *out,
Scott LaVarnwayed25b612022-02-17 13:28:23 -05003097 int8_t cos_bit) {
sachin garg56f10202018-09-24 14:05:25 +00003098 const int32_t *cospi = cospi_arr(cos_bit);
3099 int16x8_t step2[64], step1[64];
3100
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303101 const int16x4_t c0 = set_s16x4_neon((int16_t)cospi[4], (int16_t)cospi[60],
3102 (int16_t)cospi[36], (int16_t)cospi[28]);
3103 const int16x4_t c1 = set_s16x4_neon((int16_t)cospi[20], (int16_t)cospi[44],
3104 (int16_t)cospi[52], (int16_t)cospi[12]);
3105 const int16x4_t c2 = set_s16x4_neon((int16_t)cospi[8], (int16_t)cospi[56],
3106 (int16_t)cospi[40], (int16_t)cospi[24]);
3107 const int16x4_t c3 = set_s16x4_neon((int16_t)cospi[32], (int16_t)cospi[32],
3108 (int16_t)cospi[16], (int16_t)cospi[48]);
3109 const int16x4_t c4 =
3110 set_s16x4_neon((int16_t)(-cospi[4]), (int16_t)(-cospi[60]),
3111 (int16_t)(-cospi[36]), (int16_t)(-cospi[28]));
3112 const int16x4_t c5 =
3113 set_s16x4_neon((int16_t)(-cospi[20]), (int16_t)(-cospi[44]),
3114 (int16_t)(-cospi[52]), (int16_t)(-cospi[12]));
3115 const int16x4_t c6 =
3116 set_s16x4_neon((int16_t)(-cospi[8]), (int16_t)(-cospi[56]),
3117 (int16_t)(-cospi[40]), (int16_t)(-cospi[24]));
3118 const int16x4_t c7 =
3119 set_s16x4_neon((int16_t)(-cospi[32]), (int16_t)(-cospi[32]),
3120 (int16_t)(-cospi[16]), (int16_t)(-cospi[48]));
sachin garg56f10202018-09-24 14:05:25 +00003121
3122 // stage 1
3123 // stage 2
3124
3125 step2[0] = in[0];
3126 step2[4] = in[8];
3127 step2[8] = in[4];
3128 step2[12] = in[12];
3129 step2[16] = in[2];
3130 step2[20] = in[10];
3131 step2[24] = in[6];
3132 step2[28] = in[14];
3133
3134 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
3135 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
3136 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
3137 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
3138 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
3139 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
3140 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
3141 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
3142
3143 // stage 3
3144
3145 step1[0] = step2[0];
3146 step1[4] = step2[4];
3147 step1[8] = step2[8];
3148 step1[12] = step2[12];
3149
3150 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
3151 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
3152 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
3153 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
3154
3155 step1[32] = step2[32];
3156 step1[33] = step2[32];
3157 step1[34] = step2[35];
3158 step1[35] = step2[35];
3159 step1[36] = step2[36];
3160 step1[37] = step2[36];
3161 step1[38] = step2[39];
3162 step1[39] = step2[39];
3163 step1[40] = step2[40];
3164 step1[41] = step2[40];
3165 step1[42] = step2[43];
3166 step1[43] = step2[43];
3167 step1[44] = step2[44];
3168 step1[45] = step2[44];
3169 step1[46] = step2[47];
3170 step1[47] = step2[47];
3171 step1[48] = step2[48];
3172 step1[49] = step2[48];
3173 step1[50] = step2[51];
3174 step1[51] = step2[51];
3175 step1[52] = step2[52];
3176 step1[53] = step2[52];
3177 step1[54] = step2[55];
3178 step1[55] = step2[55];
3179 step1[56] = step2[56];
3180 step1[57] = step2[56];
3181 step1[58] = step2[59];
3182 step1[59] = step2[59];
3183 step1[60] = step2[60];
3184 step1[61] = step2[60];
3185 step1[62] = step2[63];
3186 step1[63] = step2[63];
3187
3188 // stage 4
3189
3190 step2[0] = step1[0];
3191 step2[4] = step1[4];
3192
3193 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
3194 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
3195 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303196 btf_16_lane_1_0_neon(step1[34], step1[61], c4, &step2[34], &step2[61]);
sachin garg56f10202018-09-24 14:05:25 +00003197 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303198 btf_16_lane_3_2_neon(step1[38], step1[57], c4, &step2[38], &step2[57]);
sachin garg56f10202018-09-24 14:05:25 +00003199 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303200 btf_16_lane_1_0_neon(step1[42], step1[53], c5, &step2[42], &step2[53]);
sachin garg56f10202018-09-24 14:05:25 +00003201 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303202 btf_16_lane_3_2_neon(step1[46], step1[49], c5, &step2[46], &step2[49]);
sachin garg56f10202018-09-24 14:05:25 +00003203
3204 step2[16] = step1[16];
3205 step2[17] = step1[16];
3206 step2[18] = step1[19];
3207 step2[19] = step1[19];
3208 step2[20] = step1[20];
3209 step2[21] = step1[20];
3210 step2[22] = step1[23];
3211 step2[23] = step1[23];
3212 step2[24] = step1[24];
3213 step2[25] = step1[24];
3214 step2[26] = step1[27];
3215 step2[27] = step1[27];
3216 step2[28] = step1[28];
3217 step2[29] = step1[28];
3218 step2[30] = step1[31];
3219 step2[31] = step1[31];
3220 step2[32] = step1[32];
3221 step2[35] = step1[35];
3222 step2[36] = step1[36];
3223 step2[39] = step1[39];
3224 step2[40] = step1[40];
3225 step2[43] = step1[43];
3226 step2[44] = step1[44];
3227 step2[47] = step1[47];
3228 step2[48] = step1[48];
3229 step2[51] = step1[51];
3230 step2[52] = step1[52];
3231 step2[55] = step1[55];
3232 step2[56] = step1[56];
3233 step2[59] = step1[59];
3234 step2[60] = step1[60];
3235 step2[63] = step1[63];
3236
3237 // stage 5
3238
3239 step1[0] = step2[0];
3240
3241 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
3242 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303243 btf_16_lane_1_0_neon(step2[18], step2[29], c6, &step1[18], &step1[29]);
sachin garg56f10202018-09-24 14:05:25 +00003244 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303245 btf_16_lane_3_2_neon(step2[22], step2[25], c6, &step1[22], &step1[25]);
sachin garg56f10202018-09-24 14:05:25 +00003246
3247 step1[8] = step2[8];
3248 step1[9] = step2[8];
3249 step1[10] = step2[11];
3250 step1[11] = step2[11];
3251 step1[12] = step2[12];
3252 step1[13] = step2[12];
3253 step1[14] = step2[15];
3254 step1[15] = step2[15];
3255 step1[16] = step2[16];
3256 step1[19] = step2[19];
3257 step1[20] = step2[20];
3258 step1[23] = step2[23];
3259 step1[24] = step2[24];
3260 step1[27] = step2[27];
3261 step1[28] = step2[28];
3262 step1[31] = step2[31];
3263 step1[32] = vqaddq_s16(step2[32], step2[35]);
3264 step1[33] = vqaddq_s16(step2[33], step2[34]);
3265 step1[34] = vqsubq_s16(step2[33], step2[34]);
3266 step1[35] = vqsubq_s16(step2[32], step2[35]);
3267 step1[36] = vqsubq_s16(step2[39], step2[36]);
3268 step1[37] = vqsubq_s16(step2[38], step2[37]);
3269 step1[38] = vqaddq_s16(step2[38], step2[37]);
3270 step1[39] = vqaddq_s16(step2[39], step2[36]);
3271 step1[40] = vqaddq_s16(step2[40], step2[43]);
3272 step1[41] = vqaddq_s16(step2[41], step2[42]);
3273 step1[42] = vqsubq_s16(step2[41], step2[42]);
3274 step1[43] = vqsubq_s16(step2[40], step2[43]);
3275 step1[44] = vqsubq_s16(step2[47], step2[44]);
3276 step1[45] = vqsubq_s16(step2[46], step2[45]);
3277 step1[46] = vqaddq_s16(step2[46], step2[45]);
3278 step1[47] = vqaddq_s16(step2[47], step2[44]);
3279 step1[48] = vqaddq_s16(step2[48], step2[51]);
3280 step1[49] = vqaddq_s16(step2[49], step2[50]);
3281 step1[50] = vqsubq_s16(step2[49], step2[50]);
3282 step1[51] = vqsubq_s16(step2[48], step2[51]);
3283 step1[52] = vqsubq_s16(step2[55], step2[52]);
3284 step1[53] = vqsubq_s16(step2[54], step2[53]);
3285 step1[54] = vqaddq_s16(step2[54], step2[53]);
3286 step1[55] = vqaddq_s16(step2[55], step2[52]);
3287 step1[56] = vqaddq_s16(step2[56], step2[59]);
3288 step1[57] = vqaddq_s16(step2[57], step2[58]);
3289 step1[58] = vqsubq_s16(step2[57], step2[58]);
3290 step1[59] = vqsubq_s16(step2[56], step2[59]);
3291 step1[60] = vqsubq_s16(step2[63], step2[60]);
3292 step1[61] = vqsubq_s16(step2[62], step2[61]);
3293 step1[62] = vqaddq_s16(step2[62], step2[61]);
3294 step1[63] = vqaddq_s16(step2[63], step2[60]);
3295
3296 // stage 6
3297
3298 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
3299 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303300 btf_16_lane_3_2_neon(step1[10], step1[13], c7, &step2[10], &step2[13]);
sachin garg56f10202018-09-24 14:05:25 +00003301 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
3302 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303303 btf_16_lane_1_0_neon(step1[36], step1[59], c6, &step2[36], &step2[59]);
3304 btf_16_lane_1_0_neon(step1[37], step1[58], c6, &step2[37], &step2[58]);
sachin garg56f10202018-09-24 14:05:25 +00003305 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
3306 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303307 btf_16_lane_3_2_neon(step1[44], step1[51], c6, &step2[44], &step2[51]);
3308 btf_16_lane_3_2_neon(step1[45], step1[50], c6, &step2[45], &step2[50]);
sachin garg56f10202018-09-24 14:05:25 +00003309
3310 step2[4] = step1[4];
3311 step2[5] = step1[4];
3312 step2[6] = step1[7];
3313 step2[7] = step1[7];
3314 step2[8] = step1[8];
3315 step2[11] = step1[11];
3316 step2[12] = step1[12];
3317 step2[15] = step1[15];
3318 step2[16] = vqaddq_s16(step1[16], step1[19]);
3319 step2[17] = vqaddq_s16(step1[17], step1[18]);
3320 step2[18] = vqsubq_s16(step1[17], step1[18]);
3321 step2[19] = vqsubq_s16(step1[16], step1[19]);
3322 step2[20] = vqsubq_s16(step1[23], step1[20]);
3323 step2[21] = vqsubq_s16(step1[22], step1[21]);
3324 step2[22] = vqaddq_s16(step1[22], step1[21]);
3325 step2[23] = vqaddq_s16(step1[23], step1[20]);
3326 step2[24] = vqaddq_s16(step1[24], step1[27]);
3327 step2[25] = vqaddq_s16(step1[25], step1[26]);
3328 step2[26] = vqsubq_s16(step1[25], step1[26]);
3329 step2[27] = vqsubq_s16(step1[24], step1[27]);
3330 step2[28] = vqsubq_s16(step1[31], step1[28]);
3331 step2[29] = vqsubq_s16(step1[30], step1[29]);
3332 step2[30] = vqaddq_s16(step1[30], step1[29]);
3333 step2[31] = vqaddq_s16(step1[31], step1[28]);
3334 step2[32] = step1[32];
3335 step2[33] = step1[33];
3336 step2[38] = step1[38];
3337 step2[39] = step1[39];
3338 step2[40] = step1[40];
3339 step2[41] = step1[41];
3340 step2[46] = step1[46];
3341 step2[47] = step1[47];
3342 step2[48] = step1[48];
3343 step2[49] = step1[49];
3344 step2[54] = step1[54];
3345 step2[55] = step1[55];
3346 step2[56] = step1[56];
3347 step2[57] = step1[57];
3348 step2[62] = step1[62];
3349 step2[63] = step1[63];
3350
3351 // stage 7
3352
3353 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
3354 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
3355 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303356 btf_16_lane_3_2_neon(step2[20], step2[27], c7, &step1[20], &step1[27]);
3357 btf_16_lane_3_2_neon(step2[21], step2[26], c7, &step1[21], &step1[26]);
sachin garg56f10202018-09-24 14:05:25 +00003358
3359 step1[0] = step2[0];
3360 step1[1] = step2[1];
3361 step1[2] = step2[1];
3362 step1[3] = step2[0];
3363 step1[4] = step2[4];
3364 step1[7] = step2[7];
3365 step1[8] = vqaddq_s16(step2[8], step2[11]);
3366 step1[9] = vqaddq_s16(step2[9], step2[10]);
3367 step1[10] = vqsubq_s16(step2[9], step2[10]);
3368 step1[11] = vqsubq_s16(step2[8], step2[11]);
3369 step1[12] = vqsubq_s16(step2[15], step2[12]);
3370 step1[13] = vqsubq_s16(step2[14], step2[13]);
3371 step1[14] = vqaddq_s16(step2[14], step2[13]);
3372 step1[15] = vqaddq_s16(step2[15], step2[12]);
3373 step1[16] = step2[16];
3374 step1[17] = step2[17];
3375 step1[22] = step2[22];
3376 step1[23] = step2[23];
3377 step1[24] = step2[24];
3378 step1[25] = step2[25];
3379 step1[30] = step2[30];
3380 step1[31] = step2[31];
3381 step1[32] = vqaddq_s16(step2[32], step2[39]);
3382 step1[33] = vqaddq_s16(step2[33], step2[38]);
3383 step1[34] = vqaddq_s16(step2[34], step2[37]);
3384 step1[35] = vqaddq_s16(step2[35], step2[36]);
3385 step1[36] = vqsubq_s16(step2[35], step2[36]);
3386 step1[37] = vqsubq_s16(step2[34], step2[37]);
3387 step1[38] = vqsubq_s16(step2[33], step2[38]);
3388 step1[39] = vqsubq_s16(step2[32], step2[39]);
3389 step1[40] = vqsubq_s16(step2[47], step2[40]);
3390 step1[41] = vqsubq_s16(step2[46], step2[41]);
3391 step1[42] = vqsubq_s16(step2[45], step2[42]);
3392 step1[43] = vqsubq_s16(step2[44], step2[43]);
3393 step1[44] = vqaddq_s16(step2[43], step2[44]);
3394 step1[45] = vqaddq_s16(step2[42], step2[45]);
3395 step1[46] = vqaddq_s16(step2[41], step2[46]);
3396 step1[47] = vqaddq_s16(step2[40], step2[47]);
3397 step1[48] = vqaddq_s16(step2[48], step2[55]);
3398 step1[49] = vqaddq_s16(step2[49], step2[54]);
3399 step1[50] = vqaddq_s16(step2[50], step2[53]);
3400 step1[51] = vqaddq_s16(step2[51], step2[52]);
3401 step1[52] = vqsubq_s16(step2[51], step2[52]);
3402 step1[53] = vqsubq_s16(step2[50], step2[53]);
3403 step1[54] = vqsubq_s16(step2[49], step2[54]);
3404 step1[55] = vqsubq_s16(step2[48], step2[55]);
3405 step1[56] = vqsubq_s16(step2[63], step2[56]);
3406 step1[57] = vqsubq_s16(step2[62], step2[57]);
3407 step1[58] = vqsubq_s16(step2[61], step2[58]);
3408 step1[59] = vqsubq_s16(step2[60], step2[59]);
3409 step1[60] = vqaddq_s16(step2[59], step2[60]);
3410 step1[61] = vqaddq_s16(step2[58], step2[61]);
3411 step1[62] = vqaddq_s16(step2[57], step2[62]);
3412 step1[63] = vqaddq_s16(step2[56], step2[63]);
3413
3414 // stage 8
3415
3416 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
3417 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
3418 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
3419 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
3420 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
3421 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303422 btf_16_lane_3_2_neon(step1[40], step1[55], c7, &step2[40], &step2[55]);
3423 btf_16_lane_3_2_neon(step1[41], step1[54], c7, &step2[41], &step2[54]);
3424 btf_16_lane_3_2_neon(step1[42], step1[53], c7, &step2[42], &step2[53]);
3425 btf_16_lane_3_2_neon(step1[43], step1[52], c7, &step2[43], &step2[52]);
sachin garg56f10202018-09-24 14:05:25 +00003426
3427 step2[0] = vqaddq_s16(step1[0], step1[7]);
3428 step2[1] = vqaddq_s16(step1[1], step1[6]);
3429 step2[2] = vqaddq_s16(step1[2], step1[5]);
3430 step2[3] = vqaddq_s16(step1[3], step1[4]);
3431 step2[4] = vqsubq_s16(step1[3], step1[4]);
3432 step2[5] = vqsubq_s16(step1[2], step1[5]);
3433 step2[6] = vqsubq_s16(step1[1], step1[6]);
3434 step2[7] = vqsubq_s16(step1[0], step1[7]);
3435 step2[8] = step1[8];
3436 step2[9] = step1[9];
3437 step2[14] = step1[14];
3438 step2[15] = step1[15];
3439 step2[16] = vqaddq_s16(step1[16], step1[23]);
3440 step2[17] = vqaddq_s16(step1[17], step1[22]);
3441 step2[18] = vqaddq_s16(step1[18], step1[21]);
3442 step2[19] = vqaddq_s16(step1[19], step1[20]);
3443 step2[20] = vqsubq_s16(step1[19], step1[20]);
3444 step2[21] = vqsubq_s16(step1[18], step1[21]);
3445 step2[22] = vqsubq_s16(step1[17], step1[22]);
3446 step2[23] = vqsubq_s16(step1[16], step1[23]);
3447 step2[24] = vqsubq_s16(step1[31], step1[24]);
3448 step2[25] = vqsubq_s16(step1[30], step1[25]);
3449 step2[26] = vqsubq_s16(step1[29], step1[26]);
3450 step2[27] = vqsubq_s16(step1[28], step1[27]);
3451 step2[28] = vqaddq_s16(step1[28], step1[27]);
3452 step2[29] = vqaddq_s16(step1[29], step1[26]);
3453 step2[30] = vqaddq_s16(step1[30], step1[25]);
3454 step2[31] = vqaddq_s16(step1[31], step1[24]);
3455 step2[32] = step1[32];
3456 step2[33] = step1[33];
3457 step2[34] = step1[34];
3458 step2[35] = step1[35];
3459 step2[44] = step1[44];
3460 step2[45] = step1[45];
3461 step2[46] = step1[46];
3462 step2[47] = step1[47];
3463 step2[48] = step1[48];
3464 step2[49] = step1[49];
3465 step2[50] = step1[50];
3466 step2[51] = step1[51];
3467 step2[60] = step1[60];
3468 step2[61] = step1[61];
3469 step2[62] = step1[62];
3470 step2[63] = step1[63];
3471
3472 // stage 9
3473 idct64_stage9_neon(step2, step1, cos_bit);
3474
3475 // stage 10
3476 idct64_stage10_neon(step1, step2, cos_bit);
3477
3478 // stage 11
3479
3480 out[0] = vqaddq_s16(step2[0], step2[63]);
3481 out[1] = vqaddq_s16(step2[1], step2[62]);
3482 out[2] = vqaddq_s16(step2[2], step2[61]);
3483 out[3] = vqaddq_s16(step2[3], step2[60]);
3484 out[4] = vqaddq_s16(step2[4], step2[59]);
3485 out[5] = vqaddq_s16(step2[5], step2[58]);
3486 out[6] = vqaddq_s16(step2[6], step2[57]);
3487 out[7] = vqaddq_s16(step2[7], step2[56]);
3488 out[8] = vqaddq_s16(step2[8], step2[55]);
3489 out[9] = vqaddq_s16(step2[9], step2[54]);
3490 out[10] = vqaddq_s16(step2[10], step2[53]);
3491 out[11] = vqaddq_s16(step2[11], step2[52]);
3492 out[12] = vqaddq_s16(step2[12], step2[51]);
3493 out[13] = vqaddq_s16(step2[13], step2[50]);
3494 out[14] = vqaddq_s16(step2[14], step2[49]);
3495 out[15] = vqaddq_s16(step2[15], step2[48]);
3496 out[16] = vqaddq_s16(step2[16], step2[47]);
3497 out[17] = vqaddq_s16(step2[17], step2[46]);
3498 out[18] = vqaddq_s16(step2[18], step2[45]);
3499 out[19] = vqaddq_s16(step2[19], step2[44]);
3500 out[20] = vqaddq_s16(step2[20], step2[43]);
3501 out[21] = vqaddq_s16(step2[21], step2[42]);
3502 out[22] = vqaddq_s16(step2[22], step2[41]);
3503 out[23] = vqaddq_s16(step2[23], step2[40]);
3504 out[24] = vqaddq_s16(step2[24], step2[39]);
3505 out[25] = vqaddq_s16(step2[25], step2[38]);
3506 out[26] = vqaddq_s16(step2[26], step2[37]);
3507 out[27] = vqaddq_s16(step2[27], step2[36]);
3508 out[28] = vqaddq_s16(step2[28], step2[35]);
3509 out[29] = vqaddq_s16(step2[29], step2[34]);
3510 out[30] = vqaddq_s16(step2[30], step2[33]);
3511 out[31] = vqaddq_s16(step2[31], step2[32]);
3512 out[32] = vqsubq_s16(step2[31], step2[32]);
3513 out[33] = vqsubq_s16(step2[30], step2[33]);
3514 out[34] = vqsubq_s16(step2[29], step2[34]);
3515 out[35] = vqsubq_s16(step2[28], step2[35]);
3516 out[36] = vqsubq_s16(step2[27], step2[36]);
3517 out[37] = vqsubq_s16(step2[26], step2[37]);
3518 out[38] = vqsubq_s16(step2[25], step2[38]);
3519 out[39] = vqsubq_s16(step2[24], step2[39]);
3520 out[40] = vqsubq_s16(step2[23], step2[40]);
3521 out[41] = vqsubq_s16(step2[22], step2[41]);
3522 out[42] = vqsubq_s16(step2[21], step2[42]);
3523 out[43] = vqsubq_s16(step2[20], step2[43]);
3524 out[44] = vqsubq_s16(step2[19], step2[44]);
3525 out[45] = vqsubq_s16(step2[18], step2[45]);
3526 out[46] = vqsubq_s16(step2[17], step2[46]);
3527 out[47] = vqsubq_s16(step2[16], step2[47]);
3528 out[48] = vqsubq_s16(step2[15], step2[48]);
3529 out[49] = vqsubq_s16(step2[14], step2[49]);
3530 out[50] = vqsubq_s16(step2[13], step2[50]);
3531 out[51] = vqsubq_s16(step2[12], step2[51]);
3532 out[52] = vqsubq_s16(step2[11], step2[52]);
3533 out[53] = vqsubq_s16(step2[10], step2[53]);
3534 out[54] = vqsubq_s16(step2[9], step2[54]);
3535 out[55] = vqsubq_s16(step2[8], step2[55]);
3536 out[56] = vqsubq_s16(step2[7], step2[56]);
3537 out[57] = vqsubq_s16(step2[6], step2[57]);
3538 out[58] = vqsubq_s16(step2[5], step2[58]);
3539 out[59] = vqsubq_s16(step2[4], step2[59]);
3540 out[60] = vqsubq_s16(step2[3], step2[60]);
3541 out[61] = vqsubq_s16(step2[2], step2[61]);
3542 out[62] = vqsubq_s16(step2[1], step2[62]);
3543 out[63] = vqsubq_s16(step2[0], step2[63]);
3544}
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303545
Venkat000f2f62018-07-05 12:03:05 +05303546// Functions for blocks with eob at DC and within
3547// topleft 8x8, 16x16, 32x32 corner
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303548static const transform_neon
3549 lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
3550 {
3551 { NULL, NULL, NULL, NULL },
3552 { NULL, NULL, NULL, NULL },
3553 { NULL, NULL, NULL, NULL },
3554 },
Yaowu Xueb5e4e22020-04-06 14:17:55 -07003555 { { idct8_low1_neon, idct8_neon, NULL, NULL },
3556 { iadst8_low1_neon, iadst8_neon, NULL, NULL },
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303557 { NULL, NULL, NULL, NULL } },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303558 {
Yaowu Xueb5e4e22020-04-06 14:17:55 -07003559 { idct16_low1_neon, idct16_low8_neon, idct16_neon, NULL },
3560 { iadst16_low1_neon, iadst16_low8_neon, iadst16_neon, NULL },
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303561 { NULL, NULL, NULL, NULL },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303562 },
Yaowu Xueb5e4e22020-04-06 14:17:55 -07003563 { { idct32_low1_neon, idct32_low8_neon, idct32_low16_neon, idct32_neon },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303564 { NULL, NULL, NULL, NULL },
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303565 { NULL, NULL, NULL, NULL } },
Yaowu Xueb5e4e22020-04-06 14:17:55 -07003566 { { idct64_low1_neon, idct64_low8_neon, idct64_low16_neon,
3567 idct64_low32_neon },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303568 { NULL, NULL, NULL, NULL },
3569 { NULL, NULL, NULL, NULL } }
3570 };
3571
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303572static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
3573 uint8_t *output, int stride,
3574 TX_TYPE tx_type,
3575 TX_SIZE tx_size, int eob) {
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303576 (void)tx_type;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303577 int16x8_t a[32 * 4];
3578 int16x8_t b[32 * 4];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303579 int eobx, eoby;
3580 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07003581 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303582 const int txw_idx = get_txw_idx(tx_size);
3583 const int txh_idx = get_txh_idx(tx_size);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303584 const int txfm_size_col = tx_size_wide[tx_size];
3585 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303586 lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
3587 0);
3588 lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
3589 0);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303590 const int buf_size_w_div8 = txfm_size_col >> 3;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303591 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303592 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
3593 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303594 const int32_t *input_1;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303595 int temp_b = 0;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303596
3597 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
3598 input_1 = input;
3599 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
3600 int k = j * 8 + i * txfm_size_col;
3601 load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303602 transpose_s16_8x8q(&a[k], &a[k]);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303603 input_1 += 8;
3604 }
3605 input += (txfm_size_col * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303606 if (abs(rect_type) == 1) {
3607 int y = i * txfm_size_col;
3608 round_shift_for_rect(&a[y], &a[y], txfm_size_col);
3609 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303610 identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
3611 txw_idx, txfm_size_col, -shift[0]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303612 for (int j = 0; j < buf_size_w_div8; ++j) {
3613 int k = j * 8 + i * txfm_size_col;
3614 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
3615 }
3616 temp_b += 8;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303617 }
3618 for (int j = 0; j < buf_size_w_div8; ++j) {
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303619 identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
3620 txh_idx, txfm_size_row, -shift[1]);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303621 }
3622 if (txfm_size_col >= 16) {
3623 for (int i = 0; i < (txfm_size_col >> 4); i++) {
3624 lowbd_add_flip_buffer_16xn_neon(
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303625 &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303626 }
3627 } else if (txfm_size_col == 8) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303628 lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303629 }
3630}
3631
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303632static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
3633 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
3634 TX_SIZE tx_size, int eob) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303635 int16x8_t a[16 * 2];
3636 int16x8_t b[16 * 2];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303637 int eobx, eoby, ud_flip, lr_flip;
3638 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07003639 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303640 const int txw_idx = get_txw_idx(tx_size);
3641 const int txh_idx = get_txh_idx(tx_size);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303642 const int txfm_size_col = tx_size_wide[tx_size];
3643 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303644 lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
3645 0);
3646 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303647 const int buf_size_w_div8 = txfm_size_col >> 3;
3648 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
3649 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
3650 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303651 const int32_t *input_1;
3652 int temp_b = 0;
3653 const transform_neon row_txfm =
3654 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303655
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303656 assert(row_txfm != NULL);
3657
3658 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3659
3660 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
3661 input_1 = input;
3662 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
3663 int k = j * 8 + i * txfm_size_col;
3664 load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
3665 transpose_s16_8x8q(&a[k], &a[k]);
3666 input_1 += 8;
3667 }
3668 input += (txfm_size_col * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303669 if (abs(rect_type) == 1) {
3670 int y = i * txfm_size_col;
3671 round_shift_for_rect(&a[y], &a[y], txfm_size_col);
3672 }
Scott LaVarnwayed25b612022-02-17 13:28:23 -05003673 row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], INV_COS_BIT);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303674 av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
3675 -shift[0]);
3676 if (lr_flip == 1) {
3677 for (int j = 0; j < buf_size_w_div8; ++j) {
3678 int k = j * 8 + i * txfm_size_col;
3679 flip_buf_ud_neon(&a[k], 8);
3680 transpose_s16_8x8q(
3681 &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
3682 }
3683 temp_b += 8;
3684 } else {
3685 for (int j = 0; j < buf_size_w_div8; ++j) {
3686 int k = j * 8 + i * txfm_size_col;
3687 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
3688 }
3689 temp_b += 8;
3690 }
3691 }
3692 for (int j = 0; j < buf_size_w_div8; ++j) {
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303693 identity_txfm_round_neon(&b[j * txfm_size_row], &b[j * txfm_size_row],
3694 txh_idx, txfm_size_row, -shift[1]);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303695 }
3696 if (txfm_size_col >= 16) {
3697 for (int i = 0; i < (txfm_size_col >> 4); i++) {
3698 lowbd_add_flip_buffer_16xn_neon(
3699 &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
3700 }
3701 } else if (txfm_size_col == 8) {
3702 lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
3703 }
3704}
3705
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303706static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
3707 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
3708 TX_SIZE tx_size, int eob) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303709 int16x8_t a[16 * 2];
3710 int16x8_t b[16 * 2];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303711 int eobx, eoby, ud_flip, lr_flip;
3712 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07003713 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303714 const int txw_idx = get_txw_idx(tx_size);
3715 const int txh_idx = get_txh_idx(tx_size);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303716 const int txfm_size_col = tx_size_wide[tx_size];
3717 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303718 lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
3719 0);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303720 const int buf_size_w_div8 = txfm_size_col >> 3;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303721 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303722 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
3723 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303724 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
3725 const int32_t *input_1;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303726 int temp_b = 0;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303727 const transform_neon col_txfm =
3728 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
3729
3730 assert(col_txfm != NULL);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303731
3732 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3733
3734 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
3735 input_1 = input;
3736 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
3737 int k = j * 8 + i * txfm_size_col;
3738 load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303739 transpose_s16_8x8q(&a[k], &a[k]);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303740 input_1 += 8;
3741 }
3742 input += (txfm_size_col * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303743 if (abs(rect_type) == 1) {
3744 int y = i * txfm_size_col;
3745 round_shift_for_rect(&a[y], &a[y], txfm_size_col);
3746 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303747 identity_txfm_round_neon(&a[i * txfm_size_col], &a[i * txfm_size_col],
3748 txw_idx, txfm_size_col, -shift[0]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303749 for (int j = 0; j < buf_size_w_div8; ++j) {
3750 int k = j * 8 + i * txfm_size_col;
3751 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
3752 }
3753 temp_b += 8;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303754 }
3755 for (int j = 0; j < buf_size_w_div8; ++j) {
Scott LaVarnwayed25b612022-02-17 13:28:23 -05003756 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303757 av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303758 -shift[1]);
3759 }
3760 if (txfm_size_col >= 16) {
3761 for (int i = 0; i < (txfm_size_col >> 4); i++) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303762 lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303763 output + 16 * i, stride, ud_flip,
3764 txfm_size_row);
3765 }
3766 } else if (txfm_size_col == 8) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303767 lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303768 }
3769}
3770
Venkat000f2f62018-07-05 12:03:05 +05303771static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
3772 uint8_t *output, int stride,
sachin garg56f10202018-09-24 14:05:25 +00003773 TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303774 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003775 TX_SIZE tx_size = TX_4X4;
Venkat000f2f62018-07-05 12:03:05 +05303776 DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
3777 int32_t *temp_in = txfm_buf;
3778
Yaowu Xua19e7622019-04-29 14:12:44 -07003779 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303780 const int txw_idx = get_txw_idx(tx_size);
3781 const int txh_idx = get_txh_idx(tx_size);
Venkat000f2f62018-07-05 12:03:05 +05303782 const int txfm_size_col = tx_size_wide[tx_size];
3783 const int txfm_size_row = tx_size_high[tx_size];
3784 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3785 int32_t *temp_out = temp_in + buf_offset;
3786 int32_t *buf = temp_out + buf_offset;
3787 int32_t *buf_ptr = buf;
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303788 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16, 16, 16 };
Venkat000f2f62018-07-05 12:03:05 +05303789 int r, bd = 8;
3790 const transform_1d_neon row_txfm =
3791 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3792 const transform_1d_neon col_txfm =
3793 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3794
3795 int ud_flip, lr_flip;
3796 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3797
3798 for (int i = 0; i < txfm_size_row; i++) {
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003799 row_txfm(input, buf_ptr, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303800
3801 input += txfm_size_col;
3802 buf_ptr += txfm_size_col;
3803 }
3804
3805 for (int c = 0; c < txfm_size_col; ++c) {
3806 if (lr_flip == 0) {
3807 for (r = 0; r < txfm_size_row; ++r)
3808 temp_in[r] = buf[r * txfm_size_col + c];
3809 } else {
3810 // flip left right
3811 for (r = 0; r < txfm_size_row; ++r)
3812 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
3813 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303814 clamp_buf(temp_in, txfm_size_row, bd + 8);
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003815 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303816 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
3817
3818 if (ud_flip == 0) {
3819 for (r = 0; r < txfm_size_row; ++r) {
3820 output[r * stride + c] =
3821 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
3822 }
3823 } else {
3824 // flip upside down
3825 for (r = 0; r < txfm_size_row; ++r) {
3826 output[r * stride + c] = highbd_clip_pixel_add(
3827 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
3828 }
3829 }
3830 }
3831}
3832
3833void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00003834 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303835 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003836 TX_SIZE tx_size = TX_4X8;
Venkat000f2f62018-07-05 12:03:05 +05303837 DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
3838 int32_t *temp_in = txfm_buf;
3839
Yaowu Xua19e7622019-04-29 14:12:44 -07003840 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303841 const int txw_idx = get_txw_idx(tx_size);
3842 const int txh_idx = get_txh_idx(tx_size);
Venkat000f2f62018-07-05 12:03:05 +05303843 const int txfm_size_col = tx_size_wide[tx_size];
3844 const int txfm_size_row = tx_size_high[tx_size];
3845 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3846 int32_t *temp_out = temp_in + buf_offset;
3847 int32_t *buf = temp_out + buf_offset;
3848 int32_t *buf_ptr = buf;
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303849 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
3850 16, 16, 16, 16 };
Venkat000f2f62018-07-05 12:03:05 +05303851 int r, bd = 8;
3852 const transform_1d_neon row_txfm =
3853 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3854 const transform_1d_neon col_txfm =
3855 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3856
3857 int ud_flip, lr_flip;
3858 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3859
3860 for (int i = 0; i < txfm_size_row; i++) {
3861 for (int j = 0; j < txfm_size_col; j++)
3862 temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
3863
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003864 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303865 input += txfm_size_col;
3866 buf_ptr += txfm_size_col;
3867 }
3868
3869 for (int c = 0; c < txfm_size_col; ++c) {
3870 if (lr_flip == 0) {
3871 for (r = 0; r < txfm_size_row; ++r)
3872 temp_in[r] = buf[r * txfm_size_col + c];
3873 } else {
3874 // flip left right
3875 for (r = 0; r < txfm_size_row; ++r)
3876 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
3877 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303878 clamp_buf(temp_in, txfm_size_row, bd + 8);
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003879 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303880 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
3881
3882 if (ud_flip == 0) {
3883 for (r = 0; r < txfm_size_row; ++r) {
3884 output[r * stride + c] =
3885 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
3886 }
3887 } else {
3888 // flip upside down
3889 for (r = 0; r < txfm_size_row; ++r) {
3890 output[r * stride + c] = highbd_clip_pixel_add(
3891 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
3892 }
3893 }
3894 }
3895}
3896
3897void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00003898 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303899 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003900 TX_SIZE tx_size = TX_8X4;
Venkat000f2f62018-07-05 12:03:05 +05303901 DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
3902 int32_t *temp_in = txfm_buf;
3903
Yaowu Xua19e7622019-04-29 14:12:44 -07003904 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303905 const int txw_idx = get_txw_idx(tx_size);
3906 const int txh_idx = get_txh_idx(tx_size);
Venkat000f2f62018-07-05 12:03:05 +05303907 const int txfm_size_col = tx_size_wide[tx_size];
3908 const int txfm_size_row = tx_size_high[tx_size];
3909 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3910 int32_t *temp_out = temp_in + buf_offset;
3911 int32_t *buf = temp_out + buf_offset;
3912 int32_t *buf_ptr = buf;
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303913 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16,
3914 16, 16, 16, 16 };
Venkat000f2f62018-07-05 12:03:05 +05303915 int r, bd = 8;
3916 const transform_1d_neon row_txfm =
3917 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3918 const transform_1d_neon col_txfm =
3919 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3920
3921 int ud_flip, lr_flip;
3922 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3923
3924 for (int i = 0; i < txfm_size_row; i++) {
3925 for (int j = 0; j < txfm_size_col; j++)
3926 temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
3927
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003928 row_txfm(temp_in, buf_ptr, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303929 input += txfm_size_col;
3930 buf_ptr += txfm_size_col;
3931 }
3932
3933 for (int c = 0; c < txfm_size_col; ++c) {
3934 if (lr_flip == 0) {
3935 for (r = 0; r < txfm_size_row; ++r)
3936 temp_in[r] = buf[r * txfm_size_col + c];
3937 } else {
3938 // flip left right
3939 for (r = 0; r < txfm_size_row; ++r)
3940 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
3941 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303942 clamp_buf(temp_in, txfm_size_row, bd + 8);
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003943 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303944 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
3945
3946 if (ud_flip == 0) {
3947 for (r = 0; r < txfm_size_row; ++r) {
3948 output[r * stride + c] =
3949 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
3950 }
3951 } else {
3952 // flip upside down
3953 for (r = 0; r < txfm_size_row; ++r) {
3954 output[r * stride + c] = highbd_clip_pixel_add(
3955 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
3956 }
3957 }
3958 }
3959}
3960
3961void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00003962 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303963 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003964 TX_SIZE tx_size = TX_4X16;
Venkat000f2f62018-07-05 12:03:05 +05303965 DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
3966 int32_t *temp_in = txfm_buf;
3967
Yaowu Xua19e7622019-04-29 14:12:44 -07003968 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303969 const int txw_idx = get_txw_idx(tx_size);
3970 const int txh_idx = get_txh_idx(tx_size);
Venkat000f2f62018-07-05 12:03:05 +05303971 const int txfm_size_col = tx_size_wide[tx_size];
3972 const int txfm_size_row = tx_size_high[tx_size];
3973 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3974 int32_t *temp_out = temp_in + buf_offset;
3975 int32_t *buf = temp_out + buf_offset;
3976 int32_t *buf_ptr = buf;
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05303977 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
3978 16, 16, 16, 16, 16 };
Venkat000f2f62018-07-05 12:03:05 +05303979 int r, bd = 8;
3980 const transform_1d_neon row_txfm =
3981 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3982 const transform_1d_neon col_txfm =
3983 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3984
3985 int ud_flip, lr_flip;
3986 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3987
3988 for (int i = 0; i < txfm_size_row; i++) {
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05003989 row_txfm(input, buf_ptr, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05303990 av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
3991 input += txfm_size_col;
3992 buf_ptr += txfm_size_col;
3993 }
3994
3995 for (int c = 0; c < txfm_size_col; ++c) {
3996 if (lr_flip == 0) {
3997 for (r = 0; r < txfm_size_row; ++r)
3998 temp_in[r] = buf[r * txfm_size_col + c];
3999 } else {
4000 // flip left right
4001 for (r = 0; r < txfm_size_row; ++r)
4002 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
4003 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05304004 clamp_buf(temp_in, txfm_size_row, bd + 8);
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05004005 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05304006 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
4007
4008 if (ud_flip == 0) {
4009 for (r = 0; r < txfm_size_row; ++r) {
4010 output[r * stride + c] =
4011 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
4012 }
4013 } else {
4014 // flip upside down
4015 for (r = 0; r < txfm_size_row; ++r) {
4016 output[r * stride + c] = highbd_clip_pixel_add(
4017 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
4018 }
4019 }
4020 }
4021}
4022
4023void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00004024 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05304025 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00004026 TX_SIZE tx_size = TX_16X4;
Venkat000f2f62018-07-05 12:03:05 +05304027 DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
4028 int32_t *temp_in = txfm_buf;
4029
Yaowu Xua19e7622019-04-29 14:12:44 -07004030 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05304031 const int txw_idx = get_txw_idx(tx_size);
4032 const int txh_idx = get_txh_idx(tx_size);
Venkat000f2f62018-07-05 12:03:05 +05304033 const int txfm_size_col = tx_size_wide[tx_size];
4034 const int txfm_size_row = tx_size_high[tx_size];
4035 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
4036 int32_t *temp_out = temp_in + buf_offset;
4037 int32_t *buf = temp_out + buf_offset;
4038 int32_t *buf_ptr = buf;
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05304039 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16, 16, 16, 16, 16,
4040 16, 16, 16, 16, 16 };
Venkat000f2f62018-07-05 12:03:05 +05304041 int r, bd = 8;
4042 const transform_1d_neon row_txfm =
4043 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
4044 const transform_1d_neon col_txfm =
4045 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
4046
4047 int ud_flip, lr_flip;
4048 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4049
4050 for (int i = 0; i < txfm_size_row; i++) {
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05004051 row_txfm(input, buf_ptr, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05304052 av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
4053 input += txfm_size_col;
4054 buf_ptr += txfm_size_col;
4055 }
4056
4057 for (int c = 0; c < txfm_size_col; ++c) {
4058 if (lr_flip == 0) {
4059 for (r = 0; r < txfm_size_row; ++r)
4060 temp_in[r] = buf[r * txfm_size_col + c];
4061 } else {
4062 // flip left right
4063 for (r = 0; r < txfm_size_row; ++r)
4064 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
4065 }
Sachin Kumar Garg37b9a652019-04-24 20:43:27 +05304066 clamp_buf(temp_in, txfm_size_row, bd + 8);
Scott LaVarnwaya08d3f62022-02-14 14:26:11 -05004067 col_txfm(temp_in, temp_out, INV_COS_BIT, stage_range);
Venkat000f2f62018-07-05 12:03:05 +05304068 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
4069
4070 if (ud_flip == 0) {
4071 for (r = 0; r < txfm_size_row; ++r) {
4072 output[r * stride + c] =
4073 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
4074 }
4075 } else {
4076 // flip upside down
4077 for (r = 0; r < txfm_size_row; ++r) {
4078 output[r * stride + c] = highbd_clip_pixel_add(
4079 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
4080 }
4081 }
4082 }
4083}
4084
// Generic NEON 2-D inverse transform + add for tx types where both the row
// and the column 1-D transforms are non-identity (the `default` case of the
// universe dispatcher). Rows are transformed in 8-high strips, transposed
// into column-major strips, column-transformed, and clip-added into
// `output`.
static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
    const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
    TX_SIZE tx_size, int eob) {
  // `a`: row-major 8x8 tiles for the row pass; `b`: transposed
  // (column-major) strips for the column pass. Sized for the largest
  // supported transform (64-wide, 8 registers per 64 coefficients).
  int16x8_t a[64 * 8];
  int16x8_t b[64 * 8];
  int eobx, eoby, ud_flip, lr_flip;
  // Split the scan-order end-of-block into last nonzero column / row.
  get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
  const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
  const int txw_idx = get_txw_idx(tx_size);
  const int txh_idx = get_txh_idx(tx_size);
  const int txfm_size_col = tx_size_wide[tx_size];
  const int txfm_size_row = tx_size_high[tx_size];
  const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
  const int buf_size_w_div8 = txfm_size_col >> 3;
  // Number of 8-row / 8-column strips that actually contain nonzero data.
  const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
  const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
  // 64-wide transforms only carry 32 coefficient columns in `input`, so the
  // load stride is capped at 32.
  const int input_stride = AOMMIN(32, txfm_size_col);
  // Select row/column transform variants specialized for how much of the
  // input is nonzero.
  const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
  const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
  const int32_t *input_1;
  int temp_b = 0;  // running register offset into `b` (advances 8 per strip)

  const transform_neon row_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
  const transform_neon col_txfm =
      lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];

  assert(col_txfm != NULL);
  assert(row_txfm != NULL);

  get_flip_cfg(tx_type, &ud_flip, &lr_flip);

  // Row pass over the nonzero 8-high strips.
  for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
    input_1 = input;
    // Load 8x8 tiles of 32-bit coefficients as 16-bit and transpose each
    // tile in place so a whole row lies in one register.
    for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
      int k = j * 8 + i * txfm_size_col;
      load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride);
      transpose_s16_8x8q(&a[k], &a[k]);
      input_1 += 8;
    }
    input += (input_stride * 8);
    // Rectangular sizes with a 2:1 aspect need an extra 1/sqrt(2) scaling.
    if (abs(rect_type) == 1) {
      int y = i * txfm_size_col;
      round_shift_for_rect(&a[y], &a[y], input_stride);
    }
    // 1-D row transform on the strip, then round by -shift[0].
    row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], INV_COS_BIT);
    av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
                                  -shift[0]);
    if (lr_flip == 1) {
      // Left-right flip: reverse lanes within each tile and store tiles in
      // reversed strip order so columns come out mirrored.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        int k = j * 8 + i * txfm_size_col;
        flip_buf_ud_neon(&a[k], 8);
        transpose_s16_8x8q(
            &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
      }
      temp_b += 8;
    } else {
      // Transpose each tile into `b`, column-major per 8-wide strip.
      for (int j = 0; j < buf_size_w_div8; ++j) {
        int k = j * 8 + i * txfm_size_col;
        transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
      }
      temp_b += 8;
    }
  }
  // Column pass: 1-D transform per 8-wide strip, then round by -shift[1].
  for (int j = 0; j < buf_size_w_div8; ++j) {
    col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], INV_COS_BIT);
    av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
                                  -shift[1]);
  }

  // Reconstruction: clip-add residual into the destination, honoring the
  // upside-down flip (left-right was already applied above).
  if (txfm_size_col >= 16) {
    for (int i = 0; i < (txfm_size_col >> 4); i++) {
      lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
                                      output + 16 * i, stride, ud_flip,
                                      txfm_size_row);
    }
  } else if (txfm_size_col == 8) {
    lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
  }
}
4165
Venkat000f2f62018-07-05 12:03:05 +05304166static INLINE void lowbd_inv_txfm2d_add_universe_neon(
4167 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
4168 TX_SIZE tx_size, int eob) {
4169 switch (tx_type) {
4170 case IDTX:
4171 lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
4172 eob);
4173 break;
4174
4175 case H_DCT:
4176 case H_ADST:
4177 case H_FLIPADST:
4178 lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
4179 tx_size, eob);
4180 break;
4181
4182 case V_DCT:
4183 case V_ADST:
4184 case V_FLIPADST:
4185 lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
4186 tx_size, eob);
4187 break;
4188
4189 default:
4190 lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
4191 tx_size, eob);
4192 break;
4193 }
4194}
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304195
Venkat000f2f62018-07-05 12:03:05 +05304196void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
4197 int stride, TX_TYPE tx_type, TX_SIZE tx_size,
4198 int eob) {
Venkat000f2f62018-07-05 12:03:05 +05304199 switch (tx_size) {
4200 case TX_4X4:
sachin garg56f10202018-09-24 14:05:25 +00004201 lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304202 break;
4203
4204 case TX_4X8:
sachin garg56f10202018-09-24 14:05:25 +00004205 lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304206 break;
4207
4208 case TX_8X4:
sachin garg56f10202018-09-24 14:05:25 +00004209 lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304210 break;
4211
4212 case TX_4X16:
sachin garg56f10202018-09-24 14:05:25 +00004213 lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304214 break;
4215
4216 case TX_16X4:
sachin garg56f10202018-09-24 14:05:25 +00004217 lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304218 break;
4219
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05304220 default:
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304221 lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
Venkat000f2f62018-07-05 12:03:05 +05304222 tx_size, eob);
Venkat000f2f62018-07-05 12:03:05 +05304223 break;
4224 }
4225}
4226void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
4227 const TxfmParam *txfm_param) {
4228 const TX_TYPE tx_type = txfm_param->tx_type;
4229 if (!txfm_param->lossless) {
4230 av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
4231 txfm_param->tx_size, txfm_param->eob);
4232 } else {
4233 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
4234 }
4235}