/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
#include "config/av1_rtcd.h"

#include "av1/common/av1_inv_txfm1d.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
#include "av1/common/av1_txfm.h"
#include "av1/common/enums.h"
#include "av1/common/idct.h"
#include "av1/common/arm/av1_inv_txfm_neon.h"
#include "av1/common/arm/transpose_neon.h"

// 1D itx types
typedef enum ATTRIBUTE_PACKED {
  IDCT_1D,
  IADST_1D,
  IFLIPADST_1D = IADST_1D,
  IIDENTITY_1D,
  ITX_TYPES_1D,
} ITX_TYPE_1D;

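// vitx_1d_tab/hitx_1d_tab below map each 2D TX_TYPE to the 1D transform run
// in the vertical and horizontal pass respectively. FLIPADST reuses the
// IADST kernel (IFLIPADST_1D == IADST_1D); the flip itself is applied when
// the output buffer is written (see the *_flip_buffer_* helpers).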
static const ITX_TYPE_1D vitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IADST_1D,     IDCT_1D,      IADST_1D,
  IFLIPADST_1D, IDCT_1D,      IFLIPADST_1D, IADST_1D,
  IFLIPADST_1D, IIDENTITY_1D, IDCT_1D,      IIDENTITY_1D,
  IADST_1D,     IIDENTITY_1D, IFLIPADST_1D, IIDENTITY_1D,
};

static const ITX_TYPE_1D hitx_1d_tab[TX_TYPES] = {
  IDCT_1D,      IDCT_1D,      IADST_1D,     IADST_1D,
  IDCT_1D,      IFLIPADST_1D, IFLIPADST_1D, IFLIPADST_1D,
  IADST_1D,     IIDENTITY_1D, IIDENTITY_1D, IDCT_1D,
  IIDENTITY_1D, IADST_1D,     IIDENTITY_1D, IFLIPADST_1D,
};

// 1D functions, indexed by [TX_SIZE][ITX_TYPE_1D]. The 32- and 64-point
// rows only provide IDCT kernels.
static const transform_1d_neon lowbd_txfm_all_1d_arr[TX_SIZES][ITX_TYPES_1D] = {
  { av1_idct4_new, av1_iadst4_new, av1_iidentity4_c },
  { av1_idct8_new, av1_iadst8_new, av1_iidentity8_c },
  { av1_idct16_new, av1_iadst16_new, av1_iidentity16_c },
  { av1_idct32_new, NULL, NULL },
  { av1_idct64_new, NULL, NULL },
};

static INLINE void lowbd_add_flip_buffer_8xn_neon(int16x8_t *in,
                                                  uint8_t *output, int stride,
                                                  int flipud,
                                                  const int height) {
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  int16x8_t temp_output;
  for (int i = 0; i < height; ++i, j += step) {
    temp_output = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(output)));
    temp_output = vaddq_s16(temp_output, in[j]);
    vst1_u8(output, vqmovun_s16(temp_output));
    output += stride;
  }
}

static INLINE uint8x16_t lowbd_get_recon_16x16_neon(const uint8x16_t pred,
                                                    int16x8_t res0,
                                                    int16x8_t res1) {
  int16x8_t temp_output[2];
  uint8x16_t temp_output_8q;
  temp_output[0] = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pred)));
  temp_output[0] = vaddq_s16(temp_output[0], res0);
  temp_output[1] = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pred)));
  temp_output[1] = vaddq_s16(temp_output[1], res1);
  temp_output_8q =
      vcombine_u8(vqmovun_s16(temp_output[0]), vqmovun_s16(temp_output[1]));
  return temp_output_8q;
}

static INLINE void lowbd_add_flip_buffer_16xn_neon(int16x8_t *in,
                                                   uint8_t *output, int stride,
                                                   int flipud, int height) {
  uint8x16_t temp_output_8q;
  int j = flipud ? (height - 1) : 0;
  const int step = flipud ? -1 : 1;
  for (int i = 0; i < height; ++i, j += step) {
    temp_output_8q = vld1q_u8(output + i * stride);
    temp_output_8q =
        lowbd_get_recon_16x16_neon(temp_output_8q, in[j], in[j + height]);
    vst1q_u8((output + i * stride), temp_output_8q);
  }
}

static INLINE void lowbd_inv_txfm2d_memset_neon(int16x8_t *a, int size,
                                                int value) {
  for (int i = 0; i < size; i++) {
    a[i] = vdupq_n_s16((int16_t)value);
  }
}

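// Per-lane scalar sketch of the btf_16_lane_<a>_<b> helpers that follow,
// with the cospi constants packed into lanes of c:
//   *t0 = round_shift(in0 * c[a] + in1 * c[b], INV_COS_BIT)
//   *t1 = round_shift(in0 * c[b] - in1 * c[a], INV_COS_BIT)
// The widening 16x16->32 bit multiplies keep full precision until the
// rounding narrow back to 16 bits.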
static INLINE void btf_16_lane_0_1_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

static INLINE void btf_16_lane_1_0_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 0);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 0);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 1);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 1);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

static INLINE void btf_16_lane_2_3_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

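// Butterfly for the case where only the first input is nonzero (used by the
// low-coefficient variants further down):
//   *t0 = round_shift(in0 * coef1, INV_COS_BIT)
//   *t1 = round_shift(in0 * coef2, INV_COS_BIT)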
static INLINE void btf_16_neon(const int16x8_t in0, int16_t coef1,
                               int16_t coef2, int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0_l, s0_h, s1_l, s1_h;
  int16x4_t v0[2], v1[2];

  s0_l = vmull_n_s16(vget_low_s16(in0), coef1);
  s0_h = vmull_n_s16(vget_high_s16(in0), coef1);
  s1_l = vmull_n_s16(vget_low_s16(in0), coef2);
  s1_h = vmull_n_s16(vget_high_s16(in0), coef2);

  v0[0] = vrshrn_n_s32(s0_l, INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0_h, INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1_l, INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1_h, INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

static INLINE void btf_16_lane_3_2_neon(const int16x8_t in0,
                                        const int16x8_t in1, const int16x4_t c,
                                        int16x8_t *t0, int16x8_t *t1) {
  int32x4_t s0[2], s1[2];
  int16x4_t v0[2], v1[2];

  s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
  s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
  s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
  s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);

  s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
  s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
  s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
  s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);

  v0[0] = vrshrn_n_s32(s0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(s0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(s1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(s1[1], INV_COS_BIT);

  *t0 = vcombine_s16(v0[0], v0[1]);
  *t1 = vcombine_s16(v1[0], v1[1]);
}

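// In-place rotation by c[0] (the callers pass cospi[32], i.e. cos(pi/4)):
//   t0 = (x[0] + x[1]) * c[0], t1 = (x[0] - x[1]) * c[0]
//   x[0] = round_shift(t0, INV_COS_BIT), x[1] = round_shift(t1, INV_COS_BIT)
// computed multiply-first in 32 bits; see the overflow note inside.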
static INLINE void btf_16_half_neon(int16x8_t *const x, const int16x4_t c) {
  int32x4_t t0[2], t1[2];
  int16x4_t v0[2], v1[2];

  // Don't add/sub before multiply, which will overflow in iadst8.
  const int32x4_t x0_lo = vmull_lane_s16(vget_low_s16(x[0]), c, 0);
  const int32x4_t x0_hi = vmull_lane_s16(vget_high_s16(x[0]), c, 0);
  const int32x4_t x1_lo = vmull_lane_s16(vget_low_s16(x[1]), c, 0);
  const int32x4_t x1_hi = vmull_lane_s16(vget_high_s16(x[1]), c, 0);

  t0[0] = vaddq_s32(x0_lo, x1_lo);
  t0[1] = vaddq_s32(x0_hi, x1_hi);
  t1[0] = vsubq_s32(x0_lo, x1_lo);
  t1[1] = vsubq_s32(x0_hi, x1_hi);

  v0[0] = vrshrn_n_s32(t0[0], INV_COS_BIT);
  v0[1] = vrshrn_n_s32(t0[1], INV_COS_BIT);
  v1[0] = vrshrn_n_s32(t1[0], INV_COS_BIT);
  v1[1] = vrshrn_n_s32(t1[1], INV_COS_BIT);

  x[0] = vcombine_s16(v0[0], v0[1]);
  x[1] = vcombine_s16(v1[0], v1[1]);
}

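// Pack four 32-bit cospi constants into one int16x4 for the lane-indexed
// multiplies above. Reading through an (int16_t *) keeps only the low half
// of each constant, which is lossless here (cospi magnitudes fit in 13 bits)
// but does assume a little-endian target.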
static INLINE int16x4_t create_s16x4_neon(int16_t *const c0, int16_t *const c1,
                                          int16_t *const c2,
                                          int16_t *const c3) {
  int16x4_t val = vdup_n_s16((int16_t)0);
  val = vld1_lane_s16(c0, val, 0);
  val = vld1_lane_s16(c1, val, 1);
  val = vld1_lane_s16(c2, val, 2);
  val = vld1_lane_s16(c3, val, 3);
  return val;
}

static INLINE void iadst8_new_neon(int16x8_t *const in, int16x8_t *out,
                                   int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
                        (int16_t *)(cospi + 20), (int16_t *)(cospi + 44));
  const int16x4_t c1 =
      create_s16x4_neon((int16_t *)(cospi + 36), (int16_t *)(cospi + 28),
                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
  const int16x4_t c2 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  int16x8_t x[8];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;

  // Stage 1
  x[0] = in[7];
  x[1] = in[0];
  x[2] = in[5];
  x[3] = in[2];
  x[4] = in[3];
  x[5] = in[4];
  x[6] = in[1];
  x[7] = in[6];

  // Stage 2
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);

  // Stage 3
  x[0] = vqaddq_s16(s0, s4);
  x[1] = vqaddq_s16(s1, s5);
  x[2] = vqaddq_s16(s2, s6);
  x[3] = vqaddq_s16(s3, s7);
  x[4] = vqsubq_s16(s0, s4);
  x[5] = vqsubq_s16(s1, s5);
  x[6] = vqsubq_s16(s2, s6);
  x[7] = vqsubq_s16(s3, s7);

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  s2 = x[2];
  s3 = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c2, &s7, &s6);

  // Stage 5
  x[0] = vqaddq_s16(s0, s2);
  x[1] = vqaddq_s16(s1, s3);
  x[2] = vqsubq_s16(s0, s2);
  x[3] = vqsubq_s16(s1, s3);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);

  // Stage 6
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7
  out[0] = x[0];
  out[1] = vnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vnegq_s16(x[1]);
}

static INLINE void iadst8_low1_new_neon(int16x8_t *const in, int16x8_t *out,
                                        int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c2 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  int16x8_t x[8];
  int16x8_t s0, s1, s4, s5;

  // Stage 1
  x[1] = in[0];

  // Stage 2
  btf_16_neon(x[1], cospi[60], -cospi[4], &s0, &s1);

  // Stage 3
  x[0] = s0;
  x[1] = s1;
  x[4] = s0;
  x[5] = s1;

  // Stage 4
  s0 = x[0];
  s1 = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c2, &s4, &s5);

  // Stage 5
  x[0] = s0;
  x[1] = s1;
  x[2] = s0;
  x[3] = s1;
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;

  // Stage 6
  btf_16_half_neon(x + 2, c2);
  btf_16_half_neon(x + 6, c2);

  // Stage 7
  out[0] = x[0];
  out[1] = vnegq_s16(x[4]);
  out[2] = x[6];
  out[3] = vnegq_s16(x[2]);
  out[4] = x[3];
  out[5] = vnegq_s16(x[7]);
  out[6] = x[5];
  out[7] = vnegq_s16(x[1]);
}

static INLINE void idct8_new_neon(int16x8_t *in, int16x8_t *out,
                                  int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[8], step2[8];
  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
  const int16x4_t c2 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  // stage 2
  btf_16_lane_0_1_neon(in[1], in[7], c0, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(in[5], in[3], c0, &step1[6], &step1[5]);

  // stage 3
  btf_16_lane_0_1_neon(in[0], in[4], c2, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(in[2], in[6], c2, &step2[3], &step2[2]);
  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);

  // stage 4
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  btf_16_lane_0_1_neon(step2[6], step2[5], c2, &step1[6], &step1[5]);

  // stage 5
  out[0] = vqaddq_s16(step1[0], step2[7]);
  out[1] = vqaddq_s16(step1[1], step1[6]);
  out[2] = vqaddq_s16(step1[2], step1[5]);
  out[3] = vqaddq_s16(step1[3], step2[4]);
  out[4] = vqsubq_s16(step1[3], step2[4]);
  out[5] = vqsubq_s16(step1[2], step1[5]);
  out[6] = vqsubq_s16(step1[1], step1[6]);
  out[7] = vqsubq_s16(step1[0], step2[7]);
}

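// DC-only fast path: with just in[0] nonzero, the 8-point IDCT collapses to
// a single scale by cospi[32] (1/sqrt(2) in Q12) broadcast to every output.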
static INLINE void idct8_low1_new_neon(int16x8_t *in, int16x8_t *out,
                                       int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 1
  // stage 2
  // stage 3
  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);

  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 4
  // stage 5
  out[0] = step1;
  out[1] = step1;
  out[2] = step1;
  out[3] = step1;
  out[4] = step1;
  out[5] = step1;
  out[6] = step1;
  out[7] = step1;
}

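// vrshlq_s16 with a negated shift count is a rounding shift right, so this
// is round_shift(arr[i], bit) applied across the whole array.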
void av1_round_shift_array_16_neon(int16x8_t *arr, int size, int bit) {
  assert(!(size % 4));
  if (!bit) return;
  const int16x8_t dup_bits_n_16x8 = vdupq_n_s16((int16_t)(-bit));
  for (int i = 0; i < size; i++) {
    arr[i] = vrshlq_s16(arr[i], dup_bits_n_16x8);
  }
}

static INLINE void flip_buf_ud_neon(int16x8_t *input, int size) {
  int16x8_t temp[8];
  for (int i = 0; i < size; ++i) {
    temp[i] = input[size - 1 - i];
  }
  for (int i = 0; i < size; ++i) {
    input[i] = temp[i];
  }
}

static INLINE void load_buffer_32bit_to_16bit_neon(const int32_t *input,
                                                   int16x8_t *const a,
                                                   int out_size) {
  for (int i = 0; i < 8; ++i) {
    a[i] = vcombine_s16(vmovn_s32(vld1q_s32(input)),
                        vmovn_s32(vld1q_s32(input + 4)));
    input += out_size;
  }
}

static INLINE void identity8_new_neon(int16x8_t *input, int16x8_t *output,
                                      int8_t cos_bit, int bit) {
  (void)bit;
  (void)cos_bit;

  output[0] = vmulq_n_s16(input[0], (int16_t)2);
  output[1] = vmulq_n_s16(input[1], (int16_t)2);
  output[2] = vmulq_n_s16(input[2], (int16_t)2);
  output[3] = vmulq_n_s16(input[3], (int16_t)2);
  output[4] = vmulq_n_s16(input[4], (int16_t)2);
  output[5] = vmulq_n_s16(input[5], (int16_t)2);
  output[6] = vmulq_n_s16(input[6], (int16_t)2);
  output[7] = vmulq_n_s16(input[7], (int16_t)2);
}

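// Rectangular (w != h) transforms carry an extra 1/sqrt(2) scale factor:
// NewInvSqrt2 / (1 << NewSqrt2Bits) ~= 0.7071.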
static INLINE void round_shift_for_rect(int16x8_t *input, int16x8_t *output,
                                        int size) {
  int32x4_t out_low, out_high;
  int16x4_t low, high;

  for (int z = 0; z < size; ++z) {
    out_low = vmull_n_s16(vget_low_s16(input[z]), (int16_t)NewInvSqrt2);
    out_high = vmull_n_s16(vget_high_s16(input[z]), (int16_t)NewInvSqrt2);

    low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
    high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);

    output[z] = vcombine_s16(low, high);
  }
}

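// The 16-point identity scales by 2 * sqrt(2): 2 * NewSqrt2 in
// Q(NewSqrt2Bits), narrowed with saturation since the product can exceed
// the 16-bit range.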
static INLINE void identity16_new_neon(int16x8_t *input, int16x8_t *output,
                                       int8_t cos_bit, int bit) {
  (void)bit;
  (void)cos_bit;

  int32x4_t out_low, out_high;
  int16x4_t low, high;
  int16_t scale = (int16_t)(2 * NewSqrt2);

  for (int z = 0; z < 16; ++z) {
    out_low = vmull_n_s16(vget_low_s16(input[z]), scale);
    out_high = vmull_n_s16(vget_high_s16(input[z]), scale);

    low = vqrshrn_n_s32(out_low, (int32_t)NewSqrt2Bits);
    high = vqrshrn_n_s32(out_high, (int32_t)NewSqrt2Bits);

    output[z] = vcombine_s16(low, high);
  }
}

static INLINE void identity32_new_neon(int16x8_t *input, int16x8_t *output,
                                       int8_t cos_bit, int bit) {
  (void)bit;
  (void)cos_bit;

  for (int z = 0; z < 32; ++z) {
    output[z] = vmulq_n_s16(input[z], (int16_t)4);
  }
}

static INLINE void idct16_low1_new_neon(int16x8_t *in, int16x8_t *out,
                                        int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 4

  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 6
  // stage 7
  out[0] = step1;
  out[1] = step1;
  out[2] = step1;
  out[3] = step1;
  out[4] = step1;
  out[5] = step1;
  out[6] = step1;
  out[7] = step1;
  out[8] = step1;
  out[9] = step1;
  out[10] = step1;
  out[11] = step1;
  out[12] = step1;
  out[13] = step1;
  out[14] = step1;
  out[15] = step1;
}

static INLINE void idct16_new_neon(int16x8_t *in, int16x8_t *out,
                                   int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];

  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
  const int16x4_t c1 =
      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
  const int16x4_t c2 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
  const int16x4_t c3 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  // stage 2

  btf_16_lane_0_1_neon(in[1], in[15], c0, &step2[15], &step2[8]);
  btf_16_lane_2_3_neon(in[9], in[7], c0, &step2[14], &step2[9]);
  btf_16_lane_0_1_neon(in[5], in[11], c1, &step2[13], &step2[10]);
  btf_16_lane_2_3_neon(in[13], in[3], c1, &step2[12], &step2[11]);

  step2[0] = in[0];
  step2[1] = in[8];
  step2[2] = in[4];
  step2[3] = in[12];
  step2[4] = in[2];
  step2[5] = in[10];
  step2[6] = in[6];
  step2[7] = in[14];

  // stage 3

  btf_16_lane_0_1_neon(step2[4], step2[7], c2, &step1[7], &step1[4]);
  btf_16_lane_2_3_neon(step2[5], step2[6], c2, &step1[6], &step1[5]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_lane_0_1_neon(step1[0], step1[1], c3, &step2[0], &step2[1]);
  btf_16_lane_2_3_neon(step1[2], step1[3], c3, &step2[3], &step2[2]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
                       &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);

  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6

  btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}

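// Variant assuming at most the first 8 inputs are nonzero (low eob), which
// lets the stage-2/3 rotations collapse into single-input btf_16_neon calls.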
static INLINE void idct16_low8_new_neon(int16x8_t *in, int16x8_t *out,
                                        int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[16], step2[16];
  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[2] = in[4];
  step2[4] = in[2];
  step2[6] = in[6];

  btf_16_neon(in[1], cospi[60], cospi[4], &step2[8], &step2[15]);
  btf_16_neon(in[7], -cospi[36], cospi[28], &step2[9], &step2[14]);
  btf_16_neon(in[5], cospi[44], cospi[20], &step2[10], &step2[13]);
  btf_16_neon(in[3], -cospi[52], cospi[12], &step2[11], &step2[12]);

  // stage 3

  btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
  btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);

  step1[0] = step2[0];
  step1[2] = step2[2];
  step1[8] = vqaddq_s16(step2[8], step2[9]);
  step1[9] = vqsubq_s16(step2[8], step2[9]);
  step1[10] = vqsubq_s16(step2[11], step2[10]);
  step1[11] = vqaddq_s16(step2[11], step2[10]);
  step1[12] = vqaddq_s16(step2[12], step2[13]);
  step1[13] = vqsubq_s16(step2[12], step2[13]);
  step1[14] = vqsubq_s16(step2[15], step2[14]);
  step1[15] = vqaddq_s16(step2[15], step2[14]);

  // stage 4

  btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
  btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
  btf_16_lane_2_3_neon(step1[14], step1[9], c0, &step2[14], &step2[9]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c0,
                       &step2[10], &step2[13]);

  step2[4] = vqaddq_s16(step1[4], step1[5]);
  step2[5] = vqsubq_s16(step1[4], step1[5]);
  step2[6] = vqsubq_s16(step1[7], step1[6]);
  step2[7] = vqaddq_s16(step1[7], step1[6]);
  step2[8] = step1[8];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[15] = step1[15];

  // stage 5

  btf_16_lane_0_1_neon(step2[6], step2[5], c0, &step1[6], &step1[5]);
  step1[0] = vqaddq_s16(step2[0], step2[3]);
  step1[1] = vqaddq_s16(step2[1], step2[2]);
  step1[2] = vqsubq_s16(step2[1], step2[2]);
  step1[3] = vqsubq_s16(step2[0], step2[3]);
  step1[4] = step2[4];
  step1[7] = step2[7];
  step1[8] = vqaddq_s16(step2[8], step2[11]);
  step1[9] = vqaddq_s16(step2[9], step2[10]);
  step1[10] = vqsubq_s16(step2[9], step2[10]);
  step1[11] = vqsubq_s16(step2[8], step2[11]);
  step1[12] = vqsubq_s16(step2[15], step2[12]);
  step1[13] = vqsubq_s16(step2[14], step2[13]);
  step1[14] = vqaddq_s16(step2[14], step2[13]);
  step1[15] = vqaddq_s16(step2[15], step2[12]);

  // stage 6
  btf_16_lane_0_1_neon(step1[13], step1[10], c0, &step2[13], &step2[10]);
  btf_16_lane_0_1_neon(step1[12], step1[11], c0, &step2[12], &step2[11]);

  step2[0] = vqaddq_s16(step1[0], step1[7]);
  step2[1] = vqaddq_s16(step1[1], step1[6]);
  step2[2] = vqaddq_s16(step1[2], step1[5]);
  step2[3] = vqaddq_s16(step1[3], step1[4]);
  step2[4] = vqsubq_s16(step1[3], step1[4]);
  step2[5] = vqsubq_s16(step1[2], step1[5]);
  step2[6] = vqsubq_s16(step1[1], step1[6]);
  step2[7] = vqsubq_s16(step1[0], step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7

  out[0] = vqaddq_s16(step2[0], step2[15]);
  out[1] = vqaddq_s16(step2[1], step2[14]);
  out[2] = vqaddq_s16(step2[2], step2[13]);
  out[3] = vqaddq_s16(step2[3], step2[12]);
  out[4] = vqaddq_s16(step2[4], step2[11]);
  out[5] = vqaddq_s16(step2[5], step2[10]);
  out[6] = vqaddq_s16(step2[6], step2[9]);
  out[7] = vqaddq_s16(step2[7], step2[8]);
  out[8] = vqsubq_s16(step2[7], step2[8]);
  out[9] = vqsubq_s16(step2[6], step2[9]);
  out[10] = vqsubq_s16(step2[5], step2[10]);
  out[11] = vqsubq_s16(step2[4], step2[11]);
  out[12] = vqsubq_s16(step2[3], step2[12]);
  out[13] = vqsubq_s16(step2[2], step2[13]);
  out[14] = vqsubq_s16(step2[1], step2[14]);
  out[15] = vqsubq_s16(step2[0], step2[15]);
}

static INLINE void iadst16_new_neon(int16x8_t *const in, int16x8_t *out,
                                    int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
                        (int16_t *)(cospi + 10), (int16_t *)(cospi + 54));
  const int16x4_t c1 =
      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
                        (int16_t *)(cospi + 26), (int16_t *)(cospi + 38));
  const int16x4_t c2 =
      create_s16x4_neon((int16_t *)(cospi + 34), (int16_t *)(cospi + 30),
                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
  const int16x4_t c3 =
      create_s16x4_neon((int16_t *)(cospi + 50), (int16_t *)(cospi + 14),
                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
  const int16x4_t c4 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));

  const int16x4_t c =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1
  x[0] = in[15];
  x[1] = in[0];
  x[2] = in[13];
  x[3] = in[2];
  x[4] = in[11];
  x[5] = in[4];
  x[6] = in[9];
  x[7] = in[6];
  x[8] = in[7];
  x[9] = in[8];
  x[10] = in[5];
  x[11] = in[10];
  x[12] = in[3];
  x[13] = in[12];
  x[14] = in[1];
  x[15] = in[14];

  // Stage 2
  btf_16_lane_0_1_neon(x[0], x[1], c0, &s0, &s1);
  btf_16_lane_2_3_neon(x[2], x[3], c0, &s2, &s3);
  btf_16_lane_0_1_neon(x[4], x[5], c1, &s4, &s5);
  btf_16_lane_2_3_neon(x[6], x[7], c1, &s6, &s7);
  btf_16_lane_0_1_neon(x[8], x[9], c2, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c2, &s10, &s11);
  btf_16_lane_0_1_neon(x[12], x[13], c3, &s12, &s13);
  btf_16_lane_2_3_neon(x[14], x[15], c3, &s14, &s15);

  // Stage 3
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);

  // Stage 5
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // Stage 6
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);

  // Stage 7
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8
  btf_16_half_neon(x + 2, c);
  btf_16_half_neon(x + 6, c);
  btf_16_half_neon(x + 10, c);
  btf_16_half_neon(x + 14, c);

  // Stage 9
  out[0] = x[0];
  out[1] = vnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vnegq_s16(x[1]);
}

static INLINE void iadst16_low1_new_neon(int16x8_t *const in, int16x8_t *out,
                                         int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  const int16x4_t c4 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
  const int16x4_t c =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  int16x8_t x[16];
  int16x8_t t[10];
  int16x8_t s0, s1, s4, s5;
  int16x8_t s8, s9, s12, s13;

  // Stage 1
  x[1] = in[0];

  // Stage 2
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);

  // Stage 3
  x[0] = s0;
  x[1] = s1;
  x[8] = s0;
  x[9] = s1;

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);

  // Stage 5
  x[0] = t[0];
  x[1] = t[1];
  x[4] = t[0];
  x[5] = t[1];
  x[8] = s8;
  x[9] = s9;
  x[12] = s8;
  x[13] = s9;

  // Stage 6
  t[0] = x[0];
  t[1] = x[1];
  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
  t[8] = x[8];
  t[9] = x[9];
  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);

  // Stage 7
  x[0] = t[0];
  x[1] = t[1];
  x[2] = t[0];
  x[3] = t[1];
  x[4] = s4;
  x[5] = s5;
  x[6] = s4;
  x[7] = s5;
  x[8] = t[8];
  x[9] = t[9];
  x[10] = t[8];
  x[11] = t[9];
  x[12] = s12;
  x[13] = s13;
  x[14] = s12;
  x[15] = s13;

  // Stage 8
  btf_16_half_neon(x + 2, c);
  btf_16_half_neon(x + 6, c);
  btf_16_half_neon(x + 10, c);
  btf_16_half_neon(x + 14, c);

  // Stage 9
  out[0] = x[0];
  out[1] = vnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vnegq_s16(x[1]);
}

static INLINE void iadst16_low8_new_neon(int16x8_t *const in, int16x8_t *out,
                                         int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);

  const int16x4_t c4 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
  const int16x4_t c =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  int16x8_t x[16];
  int16x8_t t[14];
  int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
  int16x8_t s8, s9, s10, s11, s12, s13, s14, s15;

  // Stage 1
  x[1] = in[0];
  x[3] = in[2];
  x[5] = in[4];
  x[7] = in[6];
  x[8] = in[7];
  x[10] = in[5];
  x[12] = in[3];
  x[14] = in[1];

  // Stage 2
  btf_16_neon(x[1], cospi[62], -cospi[2], &s0, &s1);
  btf_16_neon(x[3], cospi[54], -cospi[10], &s2, &s3);
  btf_16_neon(x[5], cospi[46], -cospi[18], &s4, &s5);
  btf_16_neon(x[7], cospi[38], -cospi[26], &s6, &s7);

  btf_16_neon(x[8], cospi[34], cospi[30], &s8, &s9);
  btf_16_neon(x[10], cospi[42], cospi[22], &s10, &s11);
  btf_16_neon(x[12], cospi[50], cospi[14], &s12, &s13);
  btf_16_neon(x[14], cospi[58], cospi[6], &s14, &s15);

  // Stage 3
  x[0] = vqaddq_s16(s0, s8);
  x[1] = vqaddq_s16(s1, s9);
  x[2] = vqaddq_s16(s2, s10);
  x[3] = vqaddq_s16(s3, s11);
  x[4] = vqaddq_s16(s4, s12);
  x[5] = vqaddq_s16(s5, s13);
  x[6] = vqaddq_s16(s6, s14);
  x[7] = vqaddq_s16(s7, s15);
  x[8] = vqsubq_s16(s0, s8);
  x[9] = vqsubq_s16(s1, s9);
  x[10] = vqsubq_s16(s2, s10);
  x[11] = vqsubq_s16(s3, s11);
  x[12] = vqsubq_s16(s4, s12);
  x[13] = vqsubq_s16(s5, s13);
  x[14] = vqsubq_s16(s6, s14);
  x[15] = vqsubq_s16(s7, s15);

  // Stage 4
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  t[4] = x[4];
  t[5] = x[5];
  t[6] = x[6];
  t[7] = x[7];
  btf_16_lane_0_1_neon(x[8], x[9], c4, &s8, &s9);
  btf_16_lane_2_3_neon(x[10], x[11], c4, &s10, &s11);
  btf_16_lane_1_0_neon(x[13], x[12], c4, &s13, &s12);
  btf_16_lane_3_2_neon(x[15], x[14], c4, &s15, &s14);

  // Stage 5
  x[0] = vqaddq_s16(t[0], t[4]);
  x[1] = vqaddq_s16(t[1], t[5]);
  x[2] = vqaddq_s16(t[2], t[6]);
  x[3] = vqaddq_s16(t[3], t[7]);
  x[4] = vqsubq_s16(t[0], t[4]);
  x[5] = vqsubq_s16(t[1], t[5]);
  x[6] = vqsubq_s16(t[2], t[6]);
  x[7] = vqsubq_s16(t[3], t[7]);
  x[8] = vqaddq_s16(s8, s12);
  x[9] = vqaddq_s16(s9, s13);
  x[10] = vqaddq_s16(s10, s14);
  x[11] = vqaddq_s16(s11, s15);
  x[12] = vqsubq_s16(s8, s12);
  x[13] = vqsubq_s16(s9, s13);
  x[14] = vqsubq_s16(s10, s14);
  x[15] = vqsubq_s16(s11, s15);

  // Stage 6
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  btf_16_lane_2_3_neon(x[4], x[5], c, &s4, &s5);
  btf_16_lane_3_2_neon(x[7], x[6], c, &s7, &s6);
  t[8] = x[8];
  t[9] = x[9];
  t[10] = x[10];
  t[11] = x[11];
  btf_16_lane_2_3_neon(x[12], x[13], c, &s12, &s13);
  btf_16_lane_3_2_neon(x[15], x[14], c, &s15, &s14);

  // Stage 7
  x[0] = vqaddq_s16(t[0], t[2]);
  x[1] = vqaddq_s16(t[1], t[3]);
  x[2] = vqsubq_s16(t[0], t[2]);
  x[3] = vqsubq_s16(t[1], t[3]);
  x[4] = vqaddq_s16(s4, s6);
  x[5] = vqaddq_s16(s5, s7);
  x[6] = vqsubq_s16(s4, s6);
  x[7] = vqsubq_s16(s5, s7);
  x[8] = vqaddq_s16(t[8], t[10]);
  x[9] = vqaddq_s16(t[9], t[11]);
  x[10] = vqsubq_s16(t[8], t[10]);
  x[11] = vqsubq_s16(t[9], t[11]);
  x[12] = vqaddq_s16(s12, s14);
  x[13] = vqaddq_s16(s13, s15);
  x[14] = vqsubq_s16(s12, s14);
  x[15] = vqsubq_s16(s13, s15);

  // Stage 8
  btf_16_half_neon(x + 2, c);
  btf_16_half_neon(x + 6, c);
  btf_16_half_neon(x + 10, c);
  btf_16_half_neon(x + 14, c);

  // Stage 9
  out[0] = x[0];
  out[1] = vnegq_s16(x[8]);
  out[2] = x[12];
  out[3] = vnegq_s16(x[4]);
  out[4] = x[6];
  out[5] = vnegq_s16(x[14]);
  out[6] = x[10];
  out[7] = vnegq_s16(x[2]);
  out[8] = x[3];
  out[9] = vnegq_s16(x[11]);
  out[10] = x[15];
  out[11] = vnegq_s16(x[7]);
  out[12] = x[5];
  out[13] = vnegq_s16(x[13]);
  out[14] = x[9];
  out[15] = vnegq_s16(x[1]);
}

static INLINE void idct32_new_neon(int16x8_t *in, int16x8_t *out,
                                   int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];

  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 2), (int16_t *)(cospi + 62),
                        (int16_t *)(cospi + 34), (int16_t *)(cospi + 30));
  const int16x4_t c1 =
      create_s16x4_neon((int16_t *)(cospi + 18), (int16_t *)(cospi + 46),
                        (int16_t *)(cospi + 50), (int16_t *)(cospi + 14));
  const int16x4_t c2 =
      create_s16x4_neon((int16_t *)(cospi + 10), (int16_t *)(cospi + 54),
                        (int16_t *)(cospi + 42), (int16_t *)(cospi + 22));
  const int16x4_t c3 =
      create_s16x4_neon((int16_t *)(cospi + 26), (int16_t *)(cospi + 38),
                        (int16_t *)(cospi + 58), (int16_t *)(cospi + 6));
  const int16x4_t c4 =
      create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
                        (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
  const int16x4_t c5 =
      create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
                        (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
  const int16x4_t c6 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
  const int16x4_t c7 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  // stage 2

  btf_16_lane_0_1_neon(in[1], in[31], c0, &step2[31], &step2[16]);
  btf_16_lane_2_3_neon(in[17], in[15], c0, &step2[30], &step2[17]);
  btf_16_lane_0_1_neon(in[9], in[23], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(in[25], in[7], c1, &step2[28], &step2[19]);
  btf_16_lane_0_1_neon(in[5], in[27], c2, &step2[27], &step2[20]);
  btf_16_lane_2_3_neon(in[21], in[11], c2, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(in[13], in[19], c3, &step2[25], &step2[22]);
  btf_16_lane_2_3_neon(in[29], in[3], c3, &step2[24], &step2[23]);

  step2[0] = in[0];
  step2[1] = in[16];
  step2[2] = in[8];
  step2[3] = in[24];
  step2[4] = in[4];
  step2[5] = in[20];
  step2[6] = in[12];
  step2[7] = in[28];
  step2[8] = in[2];
  step2[9] = in[18];
  step2[10] = in[10];
  step2[11] = in[26];
  step2[12] = in[6];
  step2[13] = in[22];
  step2[14] = in[14];
  step2[15] = in[30];

  // stage 3

  btf_16_lane_0_1_neon(step2[8], step2[15], c4, &step1[15], &step1[8]);
  btf_16_lane_2_3_neon(step2[9], step2[14], c4, &step1[14], &step1[9]);
  btf_16_lane_0_1_neon(step2[10], step2[13], c5, &step1[13], &step1[10]);
  btf_16_lane_2_3_neon(step2[11], step2[12], c5, &step1[12], &step1[11]);

  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];
  step1[4] = step2[4];
  step1[5] = step2[5];
  step1[6] = step2[6];
  step1[7] = step2[7];

  step1[16] = vqaddq_s16(step2[16], step2[17]);
  step1[17] = vqsubq_s16(step2[16], step2[17]);
  step1[18] = vqsubq_s16(step2[19], step2[18]);
  step1[19] = vqaddq_s16(step2[19], step2[18]);
  step1[20] = vqaddq_s16(step2[20], step2[21]);
  step1[21] = vqsubq_s16(step2[20], step2[21]);
  step1[22] = vqsubq_s16(step2[23], step2[22]);
  step1[23] = vqaddq_s16(step2[23], step2[22]);
  step1[24] = vqaddq_s16(step2[24], step2[25]);
  step1[25] = vqsubq_s16(step2[24], step2[25]);
  step1[26] = vqsubq_s16(step2[27], step2[26]);
  step1[27] = vqaddq_s16(step2[27], step2[26]);
  step1[28] = vqaddq_s16(step2[28], step2[29]);
  step1[29] = vqsubq_s16(step2[28], step2[29]);
  step1[30] = vqsubq_s16(step2[31], step2[30]);
  step1[31] = vqaddq_s16(step2[31], step2[30]);

  // stage 4

  btf_16_lane_0_1_neon(step1[4], step1[7], c6, &step2[7], &step2[4]);
  btf_16_lane_2_3_neon(step1[5], step1[6], c6, &step2[6], &step2[5]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c6, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c6,
                       &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c6, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c6,
                       &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[8] = vqaddq_s16(step1[8], step1[9]);
  step2[9] = vqsubq_s16(step1[8], step1[9]);
  step2[10] = vqsubq_s16(step1[11], step1[10]);
  step2[11] = vqaddq_s16(step1[11], step1[10]);
  step2[12] = vqaddq_s16(step1[12], step1[13]);
  step2[13] = vqsubq_s16(step1[12], step1[13]);
  step2[14] = vqsubq_s16(step1[15], step1[14]);
  step2[15] = vqaddq_s16(step1[15], step1[14]);
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  btf_16_lane_0_1_neon(step2[0], step2[1], c7, &step1[0], &step1[1]);
  btf_16_lane_2_3_neon(step2[2], step2[3], c7, &step1[3], &step1[2]);
  btf_16_lane_2_3_neon(step2[14], step2[9], c7, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c7,
                       &step1[10], &step1[13]);

  step1[4] = vqaddq_s16(step2[4], step2[5]);
  step1[5] = vqsubq_s16(step2[4], step2[5]);
  step1[6] = vqsubq_s16(step2[7], step2[6]);
  step1[7] = vqaddq_s16(step2[7], step2[6]);
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c7, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c7, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c7, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c7,
                       &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c7,
                       &step2[21], &step2[26]);

  step2[0] = vqaddq_s16(step1[0], step1[3]);
  step2[1] = vqaddq_s16(step1[1], step1[2]);
  step2[2] = vqsubq_s16(step1[1], step1[2]);
  step2[3] = vqsubq_s16(step1[0], step1[3]);
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c7, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c7, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
  step1[28] = vqaddq_s16(step2[27], step2[28]);
  step1[29] = vqaddq_s16(step2[26], step2[29]);
  step1[30] = vqaddq_s16(step2[25], step2[30]);
  step1[31] = vqaddq_s16(step2[24], step2[31]);

  // stage 8

  btf_16_lane_0_1_neon(step1[27], step1[20], c7, &step2[27], &step2[20]);
  btf_16_lane_0_1_neon(step1[26], step1[21], c7, &step2[26], &step2[21]);
  btf_16_lane_0_1_neon(step1[25], step1[22], c7, &step2[25], &step2[22]);
  btf_16_lane_0_1_neon(step1[24], step1[23], c7, &step2[24], &step2[23]);

  step2[0] = vqaddq_s16(step1[0], step1[15]);
  step2[1] = vqaddq_s16(step1[1], step1[14]);
  step2[2] = vqaddq_s16(step1[2], step1[13]);
  step2[3] = vqaddq_s16(step1[3], step1[12]);
  step2[4] = vqaddq_s16(step1[4], step1[11]);
  step2[5] = vqaddq_s16(step1[5], step1[10]);
  step2[6] = vqaddq_s16(step1[6], step1[9]);
  step2[7] = vqaddq_s16(step1[7], step1[8]);
  step2[8] = vqsubq_s16(step1[7], step1[8]);
  step2[9] = vqsubq_s16(step1[6], step1[9]);
  step2[10] = vqsubq_s16(step1[5], step1[10]);
  step2[11] = vqsubq_s16(step1[4], step1[11]);
  step2[12] = vqsubq_s16(step1[3], step1[12]);
  step2[13] = vqsubq_s16(step1[2], step1[13]);
  step2[14] = vqsubq_s16(step1[1], step1[14]);
  step2[15] = vqsubq_s16(step1[0], step1[15]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[18] = step1[18];
  step2[19] = step1[19];
  step2[28] = step1[28];
  step2[29] = step1[29];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 9

  out[0] = vqaddq_s16(step2[0], step2[31]);
  out[1] = vqaddq_s16(step2[1], step2[30]);
  out[2] = vqaddq_s16(step2[2], step2[29]);
  out[3] = vqaddq_s16(step2[3], step2[28]);
  out[4] = vqaddq_s16(step2[4], step2[27]);
  out[5] = vqaddq_s16(step2[5], step2[26]);
  out[6] = vqaddq_s16(step2[6], step2[25]);
  out[7] = vqaddq_s16(step2[7], step2[24]);
  out[8] = vqaddq_s16(step2[8], step2[23]);
  out[9] = vqaddq_s16(step2[9], step2[22]);
  out[10] = vqaddq_s16(step2[10], step2[21]);
  out[11] = vqaddq_s16(step2[11], step2[20]);
  out[12] = vqaddq_s16(step2[12], step2[19]);
  out[13] = vqaddq_s16(step2[13], step2[18]);
  out[14] = vqaddq_s16(step2[14], step2[17]);
  out[15] = vqaddq_s16(step2[15], step2[16]);
  out[16] = vqsubq_s16(step2[15], step2[16]);
  out[17] = vqsubq_s16(step2[14], step2[17]);
  out[18] = vqsubq_s16(step2[13], step2[18]);
  out[19] = vqsubq_s16(step2[12], step2[19]);
  out[20] = vqsubq_s16(step2[11], step2[20]);
  out[21] = vqsubq_s16(step2[10], step2[21]);
  out[22] = vqsubq_s16(step2[9], step2[22]);
  out[23] = vqsubq_s16(step2[8], step2[23]);
  out[24] = vqsubq_s16(step2[7], step2[24]);
  out[25] = vqsubq_s16(step2[6], step2[25]);
  out[26] = vqsubq_s16(step2[5], step2[26]);
  out[27] = vqsubq_s16(step2[4], step2[27]);
  out[28] = vqsubq_s16(step2[3], step2[28]);
  out[29] = vqsubq_s16(step2[2], step2[29]);
  out[30] = vqsubq_s16(step2[1], step2[30]);
  out[31] = vqsubq_s16(step2[0], step2[31]);
}

static INLINE void idct32_low1_new_neon(int16x8_t *in, int16x8_t *out,
                                        int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1;
  int32x4_t t32[2];

  // stage 1
  // stage 2
  // stage 3
  // stage 4
  // stage 5

  t32[0] = vmull_n_s16(vget_low_s16(in[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(in[0]), (int16_t)cospi[32]);
  step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                       vrshrn_n_s32(t32[1], INV_COS_BIT));

  // stage 6
  // stage 7
  // stage 8
  // stage 9

  out[0] = step1;
  out[1] = step1;
  out[2] = step1;
  out[3] = step1;
  out[4] = step1;
  out[5] = step1;
  out[6] = step1;
  out[7] = step1;
  out[8] = step1;
  out[9] = step1;
  out[10] = step1;
  out[11] = step1;
  out[12] = step1;
  out[13] = step1;
  out[14] = step1;
  out[15] = step1;
  out[16] = step1;
  out[17] = step1;
  out[18] = step1;
  out[19] = step1;
  out[20] = step1;
  out[21] = step1;
  out[22] = step1;
  out[23] = step1;
  out[24] = step1;
  out[25] = step1;
  out[26] = step1;
  out[27] = step1;
  out[28] = step1;
  out[29] = step1;
  out[30] = step1;
  out[31] = step1;
}

static INLINE void idct32_low8_new_neon(int16x8_t *in, int16x8_t *out,
                                        int8_t cos_bit, int bit) {
  (void)bit;
  const int32_t *cospi = cospi_arr(cos_bit);
  int16x8_t step1[32], step2[32];
  int32x4_t t32[16];
  const int16x4_t c0 =
      create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
                        (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
  const int16x4_t c1 =
      create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
                        (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));

  // stage 1
  // stage 2

  step2[0] = in[0];
  step2[4] = in[4];
  step2[8] = in[2];
  step2[12] = in[6];

  btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
  btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
  btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
  btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);

  // stage 3
  step1[0] = step2[0];
  step1[4] = step2[4];

  btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
  btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);

  step1[16] = step2[16];
  step1[17] = step2[16];
  step1[18] = step2[19];
  step1[19] = step2[19];
  step1[20] = step2[20];
  step1[21] = step2[20];
  step1[22] = step2[23];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[24];
  step1[26] = step2[27];
  step1[27] = step2[27];
  step1[28] = step2[28];
  step1[29] = step2[28];
  step1[30] = step2[31];
  step1[31] = step2[31];

  // stage 4

  btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
  btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
  btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
                       &step2[18], &step2[29]);
  btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
                       &step2[22], &step2[25]);

  step2[0] = step1[0];
  step2[8] = step1[8];
  step2[9] = step1[8];
  step2[10] = step1[11];
  step2[11] = step1[11];
  step2[12] = step1[12];
  step2[13] = step1[12];
  step2[14] = step1[15];
  step2[15] = step1[15];
  step2[16] = step1[16];
  step2[19] = step1[19];
  step2[20] = step1[20];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[27] = step1[27];
  step2[28] = step1[28];
  step2[31] = step1[31];

  // stage 5

  t32[0] = vmull_n_s16(vget_low_s16(step2[0]), (int16_t)cospi[32]);
  t32[1] = vmull_n_s16(vget_high_s16(step2[0]), (int16_t)cospi[32]);
  step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
                          vrshrn_n_s32(t32[1], INV_COS_BIT));

  btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
  btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
                       &step1[10], &step1[13]);

  step1[4] = step2[4];
  step1[5] = step2[4];
  step1[6] = step2[7];
  step1[7] = step2[7];
  step1[8] = step2[8];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[19]);
  step1[17] = vqaddq_s16(step2[17], step2[18]);
  step1[18] = vqsubq_s16(step2[17], step2[18]);
  step1[19] = vqsubq_s16(step2[16], step2[19]);
  step1[20] = vqsubq_s16(step2[23], step2[20]);
  step1[21] = vqsubq_s16(step2[22], step2[21]);
  step1[22] = vqaddq_s16(step2[22], step2[21]);
  step1[23] = vqaddq_s16(step2[23], step2[20]);
  step1[24] = vqaddq_s16(step2[24], step2[27]);
  step1[25] = vqaddq_s16(step2[25], step2[26]);
  step1[26] = vqsubq_s16(step2[25], step2[26]);
  step1[27] = vqsubq_s16(step2[24], step2[27]);
  step1[28] = vqsubq_s16(step2[31], step2[28]);
  step1[29] = vqsubq_s16(step2[30], step2[29]);
  step1[30] = vqaddq_s16(step2[30], step2[29]);
  step1[31] = vqaddq_s16(step2[31], step2[28]);

  // stage 6

  btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
  btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
  btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
                       &step2[20], &step2[27]);
  btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
                       &step2[21], &step2[26]);

  step2[0] = step1[0];
  step2[1] = step1[0];
  step2[2] = step1[0];
  step2[3] = step1[0];
  step2[4] = step1[4];
  step2[7] = step1[7];
  step2[8] = vqaddq_s16(step1[8], step1[11]);
  step2[9] = vqaddq_s16(step1[9], step1[10]);
  step2[10] = vqsubq_s16(step1[9], step1[10]);
  step2[11] = vqsubq_s16(step1[8], step1[11]);
  step2[12] = vqsubq_s16(step1[15], step1[12]);
  step2[13] = vqsubq_s16(step1[14], step1[13]);
  step2[14] = vqaddq_s16(step1[14], step1[13]);
  step2[15] = vqaddq_s16(step1[15], step1[12]);
  step2[16] = step1[16];
  step2[17] = step1[17];
  step2[22] = step1[22];
  step2[23] = step1[23];
  step2[24] = step1[24];
  step2[25] = step1[25];
  step2[30] = step1[30];
  step2[31] = step1[31];

  // stage 7

  btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
  btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);

  step1[0] = vqaddq_s16(step2[0], step2[7]);
  step1[1] = vqaddq_s16(step2[1], step2[6]);
  step1[2] = vqaddq_s16(step2[2], step2[5]);
  step1[3] = vqaddq_s16(step2[3], step2[4]);
  step1[4] = vqsubq_s16(step2[3], step2[4]);
  step1[5] = vqsubq_s16(step2[2], step2[5]);
  step1[6] = vqsubq_s16(step2[1], step2[6]);
  step1[7] = vqsubq_s16(step2[0], step2[7]);
  step1[8] = step2[8];
  step1[9] = step2[9];
  step1[14] = step2[14];
  step1[15] = step2[15];
  step1[16] = vqaddq_s16(step2[16], step2[23]);
  step1[17] = vqaddq_s16(step2[17], step2[22]);
  step1[18] = vqaddq_s16(step2[18], step2[21]);
  step1[19] = vqaddq_s16(step2[19], step2[20]);
  step1[20] = vqsubq_s16(step2[19], step2[20]);
  step1[21] = vqsubq_s16(step2[18], step2[21]);
  step1[22] = vqsubq_s16(step2[17], step2[22]);
  step1[23] = vqsubq_s16(step2[16], step2[23]);
  step1[24] = vqsubq_s16(step2[31], step2[24]);
  step1[25] = vqsubq_s16(step2[30], step2[25]);
  step1[26] = vqsubq_s16(step2[29], step2[26]);
  step1[27] = vqsubq_s16(step2[28], step2[27]);
1752 step1[28] = vqaddq_s16(step2[27], step2[28]);
1753 step1[29] = vqaddq_s16(step2[26], step2[29]);
1754 step1[30] = vqaddq_s16(step2[25], step2[30]);
1755 step1[31] = vqaddq_s16(step2[24], step2[31]);
1756
1757 // stage 8
1758
1759 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
1760 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
1761 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
1762 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
1763
1764 step2[0] = vqaddq_s16(step1[0], step1[15]);
1765 step2[1] = vqaddq_s16(step1[1], step1[14]);
1766 step2[2] = vqaddq_s16(step1[2], step1[13]);
1767 step2[3] = vqaddq_s16(step1[3], step1[12]);
1768 step2[4] = vqaddq_s16(step1[4], step1[11]);
1769 step2[5] = vqaddq_s16(step1[5], step1[10]);
1770 step2[6] = vqaddq_s16(step1[6], step1[9]);
1771 step2[7] = vqaddq_s16(step1[7], step1[8]);
1772 step2[8] = vqsubq_s16(step1[7], step1[8]);
1773 step2[9] = vqsubq_s16(step1[6], step1[9]);
1774 step2[10] = vqsubq_s16(step1[5], step1[10]);
1775 step2[11] = vqsubq_s16(step1[4], step1[11]);
1776 step2[12] = vqsubq_s16(step1[3], step1[12]);
1777 step2[13] = vqsubq_s16(step1[2], step1[13]);
1778 step2[14] = vqsubq_s16(step1[1], step1[14]);
1779 step2[15] = vqsubq_s16(step1[0], step1[15]);
1780 step2[16] = step1[16];
1781 step2[17] = step1[17];
1782 step2[18] = step1[18];
1783 step2[19] = step1[19];
1784 step2[28] = step1[28];
1785 step2[29] = step1[29];
1786 step2[30] = step1[30];
1787 step2[31] = step1[31];
1788
1789 // stage 9
1790
1791 out[0] = vqaddq_s16(step2[0], step2[31]);
1792 out[1] = vqaddq_s16(step2[1], step2[30]);
1793 out[2] = vqaddq_s16(step2[2], step2[29]);
1794 out[3] = vqaddq_s16(step2[3], step2[28]);
1795 out[4] = vqaddq_s16(step2[4], step2[27]);
1796 out[5] = vqaddq_s16(step2[5], step2[26]);
1797 out[6] = vqaddq_s16(step2[6], step2[25]);
1798 out[7] = vqaddq_s16(step2[7], step2[24]);
1799 out[8] = vqaddq_s16(step2[8], step2[23]);
1800 out[9] = vqaddq_s16(step2[9], step2[22]);
1801 out[10] = vqaddq_s16(step2[10], step2[21]);
1802 out[11] = vqaddq_s16(step2[11], step2[20]);
1803 out[12] = vqaddq_s16(step2[12], step2[19]);
1804 out[13] = vqaddq_s16(step2[13], step2[18]);
1805 out[14] = vqaddq_s16(step2[14], step2[17]);
1806 out[15] = vqaddq_s16(step2[15], step2[16]);
1807 out[16] = vqsubq_s16(step2[15], step2[16]);
1808 out[17] = vqsubq_s16(step2[14], step2[17]);
1809 out[18] = vqsubq_s16(step2[13], step2[18]);
1810 out[19] = vqsubq_s16(step2[12], step2[19]);
1811 out[20] = vqsubq_s16(step2[11], step2[20]);
1812 out[21] = vqsubq_s16(step2[10], step2[21]);
1813 out[22] = vqsubq_s16(step2[9], step2[22]);
1814 out[23] = vqsubq_s16(step2[8], step2[23]);
1815 out[24] = vqsubq_s16(step2[7], step2[24]);
1816 out[25] = vqsubq_s16(step2[6], step2[25]);
1817 out[26] = vqsubq_s16(step2[5], step2[26]);
1818 out[27] = vqsubq_s16(step2[4], step2[27]);
1819 out[28] = vqsubq_s16(step2[3], step2[28]);
1820 out[29] = vqsubq_s16(step2[2], step2[29]);
1821 out[30] = vqsubq_s16(step2[1], step2[30]);
1822 out[31] = vqsubq_s16(step2[0], step2[31]);
1823}
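// For reference: a scalar model of the two butterfly shapes used throughout
// this file (illustrative only; btf_ref_*_c are hypothetical names, and the
// NEON helpers are assumed to implement the standard AV1 round-shifted
// rotation independently on each of the eight lanes).
//
// Single-input form, as in btf_16_neon(x, w0, w1, &o0, &o1): the rotation
// partner is zero, so it degenerates to two scalings.
static INLINE void btf_ref_single_c(int16_t x, int32_t w0, int32_t w1,
                                    int16_t *o0, int16_t *o1) {
  const int32_t r = 1 << (INV_COS_BIT - 1);  // rounding bias
  *o0 = (int16_t)(((int32_t)x * w0 + r) >> INV_COS_BIT);
  *o1 = (int16_t)(((int32_t)x * w1 + r) >> INV_COS_BIT);
}

// Two-input form, as in btf_16_lane_a_b_neon(x, y, c, &o0, &o1), taking
// ca = c[a] and cb = c[b]:
static INLINE void btf_ref_pair_c(int16_t x, int16_t y, int32_t ca,
                                  int32_t cb, int16_t *o0, int16_t *o1) {
  const int32_t r = 1 << (INV_COS_BIT - 1);
  *o0 = (int16_t)(((int32_t)x * ca + (int32_t)y * cb + r) >> INV_COS_BIT);
  *o1 = (int16_t)(((int32_t)x * cb - (int32_t)y * ca + r) >> INV_COS_BIT);
}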
1824
1825static INLINE void idct32_low16_new_neon(int16x8_t *in, int16x8_t *out,
1826 int8_t cos_bit, int bit) {
1827 (void)bit;
1828 const int32_t *cospi = cospi_arr(cos_bit);
1829 int16x8_t step1[32], step2[32];
1830 int32x4_t t32[16];
1831 const int16x4_t c0 =
1832 create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
1833 (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
1834 const int16x4_t c1 =
1835 create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
1836 (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
1837
1838 // stage 1
1839 // stage 2
1840
1841 btf_16_neon(in[1], cospi[62], cospi[2], &step2[16], &step2[31]);
1842 btf_16_neon(in[15], -cospi[34], cospi[30], &step2[17], &step2[30]);
1843 btf_16_neon(in[9], cospi[46], cospi[18], &step2[18], &step2[29]);
1844 btf_16_neon(in[7], -cospi[50], cospi[14], &step2[19], &step2[28]);
1845 btf_16_neon(in[5], cospi[54], cospi[10], &step2[20], &step2[27]);
1846 btf_16_neon(in[11], -cospi[42], cospi[22], &step2[21], &step2[26]);
1847 btf_16_neon(in[13], cospi[38], cospi[26], &step2[22], &step2[25]);
1848 btf_16_neon(in[3], -cospi[58], cospi[6], &step2[23], &step2[24]);
1849
1850 step2[0] = in[0];
1851 step2[2] = in[8];
1852 step2[4] = in[4];
1853 step2[6] = in[12];
1854 step2[8] = in[2];
1855 step2[10] = in[10];
1856 step2[12] = in[6];
1857 step2[14] = in[14];
1858
1859 // stage 3
1860
1861 btf_16_neon(step2[8], cospi[60], cospi[4], &step1[8], &step1[15]);
1862 btf_16_neon(step2[14], -cospi[36], cospi[28], &step1[9], &step1[14]);
1863 btf_16_neon(step2[10], cospi[44], cospi[20], &step1[10], &step1[13]);
1864 btf_16_neon(step2[12], -cospi[52], cospi[12], &step1[11], &step1[12]);
1865
1866 step1[0] = step2[0];
1867 step1[2] = step2[2];
1868 step1[4] = step2[4];
1869 step1[6] = step2[6];
1870 step1[16] = vqaddq_s16(step2[16], step2[17]);
1871 step1[17] = vqsubq_s16(step2[16], step2[17]);
1872 step1[18] = vqsubq_s16(step2[19], step2[18]);
1873 step1[19] = vqaddq_s16(step2[19], step2[18]);
1874 step1[20] = vqaddq_s16(step2[20], step2[21]);
1875 step1[21] = vqsubq_s16(step2[20], step2[21]);
1876 step1[22] = vqsubq_s16(step2[23], step2[22]);
1877 step1[23] = vqaddq_s16(step2[23], step2[22]);
1878 step1[24] = vqaddq_s16(step2[24], step2[25]);
1879 step1[25] = vqsubq_s16(step2[24], step2[25]);
1880 step1[26] = vqsubq_s16(step2[27], step2[26]);
1881 step1[27] = vqaddq_s16(step2[27], step2[26]);
1882 step1[28] = vqaddq_s16(step2[28], step2[29]);
1883 step1[29] = vqsubq_s16(step2[28], step2[29]);
1884 step1[30] = vqsubq_s16(step2[31], step2[30]);
1885 step1[31] = vqaddq_s16(step2[31], step2[30]);
1886
1887 // stage 4
1888
1889 btf_16_neon(step1[4], cospi[56], cospi[8], &step2[4], &step2[7]);
1890 btf_16_neon(step1[6], -cospi[40], cospi[24], &step2[5], &step2[6]);
1891 btf_16_lane_0_1_neon(step1[30], step1[17], c0, &step2[30], &step2[17]);
1892 btf_16_lane_1_0_neon(vnegq_s16(step1[18]), vnegq_s16(step1[29]), c0,
1893 &step2[18], &step2[29]);
1894 btf_16_lane_2_3_neon(step1[26], step1[21], c0, &step2[26], &step2[21]);
1895 btf_16_lane_3_2_neon(vnegq_s16(step1[22]), vnegq_s16(step1[25]), c0,
1896 &step2[22], &step2[25]);
1897
1898 step2[0] = step1[0];
1899 step2[2] = step1[2];
1900 step2[8] = vqaddq_s16(step1[8], step1[9]);
1901 step2[9] = vqsubq_s16(step1[8], step1[9]);
1902 step2[10] = vqsubq_s16(step1[11], step1[10]);
1903 step2[11] = vqaddq_s16(step1[11], step1[10]);
1904 step2[12] = vqaddq_s16(step1[12], step1[13]);
1905 step2[13] = vqsubq_s16(step1[12], step1[13]);
1906 step2[14] = vqsubq_s16(step1[15], step1[14]);
1907 step2[15] = vqaddq_s16(step1[15], step1[14]);
1908 step2[16] = step1[16];
1909 step2[19] = step1[19];
1910 step2[20] = step1[20];
1911 step2[23] = step1[23];
1912 step2[24] = step1[24];
1913 step2[27] = step1[27];
1914 step2[28] = step1[28];
1915 step2[31] = step1[31];
1916
1917 // stage 5
1918
1919 t32[0] = vmull_n_s16(vget_low_s16(step2[0]), cospi[32]);
1920 t32[1] = vmull_n_s16(vget_high_s16(step2[0]), cospi[32]);
1921
1922 step1[0] = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
1923 vrshrn_n_s32(t32[1], INV_COS_BIT));
1924
1925 btf_16_neon(step2[2], cospi[48], cospi[16], &step1[2], &step1[3]);
1926 btf_16_lane_2_3_neon(step2[14], step2[9], c1, &step1[14], &step1[9]);
1927 btf_16_lane_3_2_neon(vnegq_s16(step2[10]), vnegq_s16(step2[13]), c1,
1928 &step1[10], &step1[13]);
1929
1930 step1[4] = vqaddq_s16(step2[4], step2[5]);
1931 step1[5] = vqsubq_s16(step2[4], step2[5]);
1932 step1[6] = vqsubq_s16(step2[7], step2[6]);
1933 step1[7] = vqaddq_s16(step2[7], step2[6]);
1934 step1[8] = step2[8];
1935 step1[11] = step2[11];
1936 step1[12] = step2[12];
1937 step1[15] = step2[15];
1938 step1[16] = vqaddq_s16(step2[16], step2[19]);
1939 step1[17] = vqaddq_s16(step2[17], step2[18]);
1940 step1[18] = vqsubq_s16(step2[17], step2[18]);
1941 step1[19] = vqsubq_s16(step2[16], step2[19]);
1942 step1[20] = vqsubq_s16(step2[23], step2[20]);
1943 step1[21] = vqsubq_s16(step2[22], step2[21]);
1944 step1[22] = vqaddq_s16(step2[22], step2[21]);
1945 step1[23] = vqaddq_s16(step2[23], step2[20]);
1946 step1[24] = vqaddq_s16(step2[24], step2[27]);
1947 step1[25] = vqaddq_s16(step2[25], step2[26]);
1948 step1[26] = vqsubq_s16(step2[25], step2[26]);
1949 step1[27] = vqsubq_s16(step2[24], step2[27]);
1950 step1[28] = vqsubq_s16(step2[31], step2[28]);
1951 step1[29] = vqsubq_s16(step2[30], step2[29]);
1952 step1[30] = vqaddq_s16(step2[30], step2[29]);
1953 step1[31] = vqaddq_s16(step2[31], step2[28]);
1954
1955 // stage 6
1956
1957 btf_16_lane_0_1_neon(step1[6], step1[5], c1, &step2[6], &step2[5]);
1958 btf_16_lane_2_3_neon(step1[29], step1[18], c1, &step2[29], &step2[18]);
1959 btf_16_lane_2_3_neon(step1[28], step1[19], c1, &step2[28], &step2[19]);
1960 btf_16_lane_3_2_neon(vnegq_s16(step1[20]), vnegq_s16(step1[27]), c1,
1961 &step2[20], &step2[27]);
1962 btf_16_lane_3_2_neon(vnegq_s16(step1[21]), vnegq_s16(step1[26]), c1,
1963 &step2[21], &step2[26]);
1964
1965 step2[0] = vqaddq_s16(step1[0], step1[3]);
1966 step2[1] = vqaddq_s16(step1[0], step1[2]);
1967 step2[2] = vqsubq_s16(step1[0], step1[2]);
1968 step2[3] = vqsubq_s16(step1[0], step1[3]);
1969 step2[4] = step1[4];
1970 step2[7] = step1[7];
1971 step2[8] = vqaddq_s16(step1[8], step1[11]);
1972 step2[9] = vqaddq_s16(step1[9], step1[10]);
1973 step2[10] = vqsubq_s16(step1[9], step1[10]);
1974 step2[11] = vqsubq_s16(step1[8], step1[11]);
1975 step2[12] = vqsubq_s16(step1[15], step1[12]);
1976 step2[13] = vqsubq_s16(step1[14], step1[13]);
1977 step2[14] = vqaddq_s16(step1[14], step1[13]);
1978 step2[15] = vqaddq_s16(step1[15], step1[12]);
1979 step2[16] = step1[16];
1980 step2[17] = step1[17];
1981 step2[22] = step1[22];
1982 step2[23] = step1[23];
1983 step2[24] = step1[24];
1984 step2[25] = step1[25];
1985 step2[30] = step1[30];
1986 step2[31] = step1[31];
1987
1988 // stage 7
1989
1990 btf_16_lane_0_1_neon(step2[13], step2[10], c1, &step1[13], &step1[10]);
1991 btf_16_lane_0_1_neon(step2[12], step2[11], c1, &step1[12], &step1[11]);
1992
1993 step1[0] = vqaddq_s16(step2[0], step2[7]);
1994 step1[1] = vqaddq_s16(step2[1], step2[6]);
1995 step1[2] = vqaddq_s16(step2[2], step2[5]);
1996 step1[3] = vqaddq_s16(step2[3], step2[4]);
1997 step1[4] = vqsubq_s16(step2[3], step2[4]);
1998 step1[5] = vqsubq_s16(step2[2], step2[5]);
1999 step1[6] = vqsubq_s16(step2[1], step2[6]);
2000 step1[7] = vqsubq_s16(step2[0], step2[7]);
2001 step1[8] = step2[8];
2002 step1[9] = step2[9];
2003 step1[14] = step2[14];
2004 step1[15] = step2[15];
2005 step1[16] = vqaddq_s16(step2[16], step2[23]);
2006 step1[17] = vqaddq_s16(step2[17], step2[22]);
2007 step1[18] = vqaddq_s16(step2[18], step2[21]);
2008 step1[19] = vqaddq_s16(step2[19], step2[20]);
2009 step1[20] = vqsubq_s16(step2[19], step2[20]);
2010 step1[21] = vqsubq_s16(step2[18], step2[21]);
2011 step1[22] = vqsubq_s16(step2[17], step2[22]);
2012 step1[23] = vqsubq_s16(step2[16], step2[23]);
2013 step1[24] = vqsubq_s16(step2[31], step2[24]);
2014 step1[25] = vqsubq_s16(step2[30], step2[25]);
2015 step1[26] = vqsubq_s16(step2[29], step2[26]);
2016 step1[27] = vqsubq_s16(step2[28], step2[27]);
2017 step1[28] = vqaddq_s16(step2[27], step2[28]);
2018 step1[29] = vqaddq_s16(step2[26], step2[29]);
2019 step1[30] = vqaddq_s16(step2[25], step2[30]);
2020 step1[31] = vqaddq_s16(step2[24], step2[31]);
2021
2022 // stage 8
2023
2024 btf_16_lane_0_1_neon(step1[27], step1[20], c1, &step2[27], &step2[20]);
2025 btf_16_lane_0_1_neon(step1[26], step1[21], c1, &step2[26], &step2[21]);
2026 btf_16_lane_0_1_neon(step1[25], step1[22], c1, &step2[25], &step2[22]);
2027 btf_16_lane_0_1_neon(step1[24], step1[23], c1, &step2[24], &step2[23]);
2028
2029 step2[0] = vqaddq_s16(step1[0], step1[15]);
2030 step2[1] = vqaddq_s16(step1[1], step1[14]);
2031 step2[2] = vqaddq_s16(step1[2], step1[13]);
2032 step2[3] = vqaddq_s16(step1[3], step1[12]);
2033 step2[4] = vqaddq_s16(step1[4], step1[11]);
2034 step2[5] = vqaddq_s16(step1[5], step1[10]);
2035 step2[6] = vqaddq_s16(step1[6], step1[9]);
2036 step2[7] = vqaddq_s16(step1[7], step1[8]);
2037 step2[8] = vqsubq_s16(step1[7], step1[8]);
2038 step2[9] = vqsubq_s16(step1[6], step1[9]);
2039 step2[10] = vqsubq_s16(step1[5], step1[10]);
2040 step2[11] = vqsubq_s16(step1[4], step1[11]);
2041 step2[12] = vqsubq_s16(step1[3], step1[12]);
2042 step2[13] = vqsubq_s16(step1[2], step1[13]);
2043 step2[14] = vqsubq_s16(step1[1], step1[14]);
2044 step2[15] = vqsubq_s16(step1[0], step1[15]);
2045 step2[16] = step1[16];
2046 step2[17] = step1[17];
2047 step2[18] = step1[18];
2048 step2[19] = step1[19];
2049 step2[28] = step1[28];
2050 step2[29] = step1[29];
2051 step2[30] = step1[30];
2052 step2[31] = step1[31];
2053
2054 // stage 9
2055
2056 out[0] = vqaddq_s16(step2[0], step2[31]);
2057 out[1] = vqaddq_s16(step2[1], step2[30]);
2058 out[2] = vqaddq_s16(step2[2], step2[29]);
2059 out[3] = vqaddq_s16(step2[3], step2[28]);
2060 out[4] = vqaddq_s16(step2[4], step2[27]);
2061 out[5] = vqaddq_s16(step2[5], step2[26]);
2062 out[6] = vqaddq_s16(step2[6], step2[25]);
2063 out[7] = vqaddq_s16(step2[7], step2[24]);
2064 out[8] = vqaddq_s16(step2[8], step2[23]);
2065 out[9] = vqaddq_s16(step2[9], step2[22]);
2066 out[10] = vqaddq_s16(step2[10], step2[21]);
2067 out[11] = vqaddq_s16(step2[11], step2[20]);
2068 out[12] = vqaddq_s16(step2[12], step2[19]);
2069 out[13] = vqaddq_s16(step2[13], step2[18]);
2070 out[14] = vqaddq_s16(step2[14], step2[17]);
2071 out[15] = vqaddq_s16(step2[15], step2[16]);
2072 out[16] = vqsubq_s16(step2[15], step2[16]);
2073 out[17] = vqsubq_s16(step2[14], step2[17]);
2074 out[18] = vqsubq_s16(step2[13], step2[18]);
2075 out[19] = vqsubq_s16(step2[12], step2[19]);
2076 out[20] = vqsubq_s16(step2[11], step2[20]);
2077 out[21] = vqsubq_s16(step2[10], step2[21]);
2078 out[22] = vqsubq_s16(step2[9], step2[22]);
2079 out[23] = vqsubq_s16(step2[8], step2[23]);
2080 out[24] = vqsubq_s16(step2[7], step2[24]);
2081 out[25] = vqsubq_s16(step2[6], step2[25]);
2082 out[26] = vqsubq_s16(step2[5], step2[26]);
2083 out[27] = vqsubq_s16(step2[4], step2[27]);
2084 out[28] = vqsubq_s16(step2[3], step2[28]);
2085 out[29] = vqsubq_s16(step2[2], step2[29]);
2086 out[30] = vqsubq_s16(step2[1], step2[30]);
2087 out[31] = vqsubq_s16(step2[0], step2[31]);
2088}
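// Note: the low1/low8/low16 variants above differ only in how many leading
// input coefficients they assume to be non-zero; butterflies whose inputs
// would be all-zero are pruned, with duplicated step values standing in for
// the skipped add/sub pairs.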
sachin garg56f10202018-09-24 14:05:25 +00002089static INLINE void idct64_stage9_neon(int16x8_t *step2, int16x8_t *step1,
2090 int8_t cos_bit) {
2091 const int32_t *cospi = cospi_arr(cos_bit);
2092 const int16x4_t c3 =
2093 create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
2094 (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
2095
2096 btf_16_lane_0_1_neon(step2[27], step2[20], c3, &step1[27], &step1[20]);
2097 btf_16_lane_0_1_neon(step2[26], step2[21], c3, &step1[26], &step1[21]);
2098 btf_16_lane_0_1_neon(step2[25], step2[22], c3, &step1[25], &step1[22]);
2099 btf_16_lane_0_1_neon(step2[24], step2[23], c3, &step1[24], &step1[23]);
2100
2101 step1[0] = vqaddq_s16(step2[0], step2[15]);
2102 step1[1] = vqaddq_s16(step2[1], step2[14]);
2103 step1[2] = vqaddq_s16(step2[2], step2[13]);
2104 step1[3] = vqaddq_s16(step2[3], step2[12]);
2105 step1[4] = vqaddq_s16(step2[4], step2[11]);
2106 step1[5] = vqaddq_s16(step2[5], step2[10]);
2107 step1[6] = vqaddq_s16(step2[6], step2[9]);
2108 step1[7] = vqaddq_s16(step2[7], step2[8]);
2109 step1[8] = vqsubq_s16(step2[7], step2[8]);
2110 step1[9] = vqsubq_s16(step2[6], step2[9]);
2111 step1[10] = vqsubq_s16(step2[5], step2[10]);
2112 step1[11] = vqsubq_s16(step2[4], step2[11]);
2113 step1[12] = vqsubq_s16(step2[3], step2[12]);
2114 step1[13] = vqsubq_s16(step2[2], step2[13]);
2115 step1[14] = vqsubq_s16(step2[1], step2[14]);
2116 step1[15] = vqsubq_s16(step2[0], step2[15]);
2117 step1[16] = step2[16];
2118 step1[17] = step2[17];
2119 step1[18] = step2[18];
2120 step1[19] = step2[19];
2121 step1[28] = step2[28];
2122 step1[29] = step2[29];
2123 step1[30] = step2[30];
2124 step1[31] = step2[31];
2125 step1[32] = vqaddq_s16(step2[32], step2[47]);
2126 step1[33] = vqaddq_s16(step2[33], step2[46]);
2127 step1[34] = vqaddq_s16(step2[34], step2[45]);
2128 step1[35] = vqaddq_s16(step2[35], step2[44]);
2129 step1[36] = vqaddq_s16(step2[36], step2[43]);
2130 step1[37] = vqaddq_s16(step2[37], step2[42]);
2131 step1[38] = vqaddq_s16(step2[38], step2[41]);
2132 step1[39] = vqaddq_s16(step2[39], step2[40]);
2133 step1[40] = vqsubq_s16(step2[39], step2[40]);
2134 step1[41] = vqsubq_s16(step2[38], step2[41]);
2135 step1[42] = vqsubq_s16(step2[37], step2[42]);
2136 step1[43] = vqsubq_s16(step2[36], step2[43]);
2137 step1[44] = vqsubq_s16(step2[35], step2[44]);
2138 step1[45] = vqsubq_s16(step2[34], step2[45]);
2139 step1[46] = vqsubq_s16(step2[33], step2[46]);
2140 step1[47] = vqsubq_s16(step2[32], step2[47]);
2141 step1[48] = vqsubq_s16(step2[63], step2[48]);
2142 step1[49] = vqsubq_s16(step2[62], step2[49]);
2143 step1[50] = vqsubq_s16(step2[61], step2[50]);
2144 step1[51] = vqsubq_s16(step2[60], step2[51]);
2145 step1[52] = vqsubq_s16(step2[59], step2[52]);
2146 step1[53] = vqsubq_s16(step2[58], step2[53]);
2147 step1[54] = vqsubq_s16(step2[57], step2[54]);
2148 step1[55] = vqsubq_s16(step2[56], step2[55]);
2149 step1[56] = vqaddq_s16(step2[56], step2[55]);
2150 step1[57] = vqaddq_s16(step2[57], step2[54]);
2151 step1[58] = vqaddq_s16(step2[58], step2[53]);
2152 step1[59] = vqaddq_s16(step2[59], step2[52]);
2153 step1[60] = vqaddq_s16(step2[60], step2[51]);
2154 step1[61] = vqaddq_s16(step2[61], step2[50]);
2155 step1[62] = vqaddq_s16(step2[62], step2[49]);
2156 step1[63] = vqaddq_s16(step2[63], step2[48]);
2157}
2158
2159static INLINE void idct64_stage10_neon(int16x8_t *step1, int16x8_t *step2,
2160 int8_t cos_bit) {
2161 const int32_t *cospi = cospi_arr(cos_bit);
2162 const int16x4_t c3 =
2163 create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
2164 (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
2165
2166 btf_16_lane_0_1_neon(step1[55], step1[40], c3, &step2[55], &step2[40]);
2167 btf_16_lane_0_1_neon(step1[54], step1[41], c3, &step2[54], &step2[41]);
2168 btf_16_lane_0_1_neon(step1[53], step1[42], c3, &step2[53], &step2[42]);
2169 btf_16_lane_0_1_neon(step1[52], step1[43], c3, &step2[52], &step2[43]);
2170 btf_16_lane_0_1_neon(step1[51], step1[44], c3, &step2[51], &step2[44]);
2171 btf_16_lane_0_1_neon(step1[50], step1[45], c3, &step2[50], &step2[45]);
2172 btf_16_lane_0_1_neon(step1[49], step1[46], c3, &step2[49], &step2[46]);
2173 btf_16_lane_0_1_neon(step1[48], step1[47], c3, &step2[48], &step2[47]);
2174
2175 step2[0] = vqaddq_s16(step1[0], step1[31]);
2176 step2[1] = vqaddq_s16(step1[1], step1[30]);
2177 step2[2] = vqaddq_s16(step1[2], step1[29]);
2178 step2[3] = vqaddq_s16(step1[3], step1[28]);
2179 step2[4] = vqaddq_s16(step1[4], step1[27]);
2180 step2[5] = vqaddq_s16(step1[5], step1[26]);
2181 step2[6] = vqaddq_s16(step1[6], step1[25]);
2182 step2[7] = vqaddq_s16(step1[7], step1[24]);
2183 step2[8] = vqaddq_s16(step1[8], step1[23]);
2184 step2[9] = vqaddq_s16(step1[9], step1[22]);
2185 step2[10] = vqaddq_s16(step1[10], step1[21]);
2186 step2[11] = vqaddq_s16(step1[11], step1[20]);
2187 step2[12] = vqaddq_s16(step1[12], step1[19]);
2188 step2[13] = vqaddq_s16(step1[13], step1[18]);
2189 step2[14] = vqaddq_s16(step1[14], step1[17]);
2190 step2[15] = vqaddq_s16(step1[15], step1[16]);
2191 step2[16] = vqsubq_s16(step1[15], step1[16]);
2192 step2[17] = vqsubq_s16(step1[14], step1[17]);
2193 step2[18] = vqsubq_s16(step1[13], step1[18]);
2194 step2[19] = vqsubq_s16(step1[12], step1[19]);
2195 step2[20] = vqsubq_s16(step1[11], step1[20]);
2196 step2[21] = vqsubq_s16(step1[10], step1[21]);
2197 step2[22] = vqsubq_s16(step1[9], step1[22]);
2198 step2[23] = vqsubq_s16(step1[8], step1[23]);
2199 step2[24] = vqsubq_s16(step1[7], step1[24]);
2200 step2[25] = vqsubq_s16(step1[6], step1[25]);
2201 step2[26] = vqsubq_s16(step1[5], step1[26]);
2202 step2[27] = vqsubq_s16(step1[4], step1[27]);
2203 step2[28] = vqsubq_s16(step1[3], step1[28]);
2204 step2[29] = vqsubq_s16(step1[2], step1[29]);
2205 step2[30] = vqsubq_s16(step1[1], step1[30]);
2206 step2[31] = vqsubq_s16(step1[0], step1[31]);
2207 step2[32] = step1[32];
2208 step2[33] = step1[33];
2209 step2[34] = step1[34];
2210 step2[35] = step1[35];
2211 step2[36] = step1[36];
2212 step2[37] = step1[37];
2213 step2[38] = step1[38];
2214 step2[39] = step1[39];
2215 step2[56] = step1[56];
2216 step2[57] = step1[57];
2217 step2[58] = step1[58];
2218 step2[59] = step1[59];
2219 step2[60] = step1[60];
2220 step2[61] = step1[61];
2221 step2[62] = step1[62];
2222 step2[63] = step1[63];
2223}
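// Note: stages 9 and 10 are identical across the idct64 variants, so they
// are factored into the two helpers above and reused by the idct64_low*
// functions below (the DC-only path skips them entirely).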
2224
2225static INLINE void idct64_low32_new_neon(int16x8_t *in, int16x8_t *out,
2226 int8_t cos_bit, int bit) {
2227 (void)bit;
2228 const int32_t *cospi = cospi_arr(cos_bit);
2229 int16x8_t step2[64], step1[64];
2230 const int16x4_t c0 =
2231 create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
2232 (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
2233 const int16x4_t c1 =
2234 create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
2235 (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
2236 const int16x4_t c2 =
2237 create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
2238 (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
2239 const int16x4_t c3 =
2240 create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
2241 (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
2242
2243 // stage 1
2244 // stage 2
2245
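  // Note: stage 2 routes the even-frequency inputs straight through
  // (reordered for the butterfly network) and rotates the odd-frequency
  // inputs into the long-tail terms step2[32..63] below.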
2246 step2[0] = in[0];
2247 step2[2] = in[16];
2248 step2[4] = in[8];
2249 step2[6] = in[24];
2250 step2[8] = in[4];
2251 step2[10] = in[20];
2252 step2[12] = in[12];
2253 step2[14] = in[28];
2254 step2[16] = in[2];
2255 step2[18] = in[18];
2256 step2[20] = in[10];
2257 step2[22] = in[26];
2258 step2[24] = in[6];
2259 step2[26] = in[22];
2260 step2[28] = in[14];
2261 step2[30] = in[30];
2262
2263 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
2264 btf_16_neon(in[31], -cospi[33], cospi[31], &step2[33], &step2[62]);
2265 btf_16_neon(in[17], cospi[47], cospi[17], &step2[34], &step2[61]);
2266 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
2267 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
2268 btf_16_neon(in[23], -cospi[41], cospi[23], &step2[37], &step2[58]);
2269 btf_16_neon(in[25], cospi[39], cospi[25], &step2[38], &step2[57]);
2270 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
2271 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
2272 btf_16_neon(in[27], -cospi[37], cospi[27], &step2[41], &step2[54]);
2273 btf_16_neon(in[21], cospi[43], cospi[21], &step2[42], &step2[53]);
2274 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
2275 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
2276 btf_16_neon(in[19], -cospi[45], cospi[19], &step2[45], &step2[50]);
2277 btf_16_neon(in[29], cospi[35], cospi[29], &step2[46], &step2[49]);
2278 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
2279
2280 // stage 3
2281
2282 step1[0] = step2[0];
2283 step1[2] = step2[2];
2284 step1[4] = step2[4];
2285 step1[6] = step2[6];
2286 step1[8] = step2[8];
2287 step1[10] = step2[10];
2288 step1[12] = step2[12];
2289 step1[14] = step2[14];
2290
2291 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
2292 btf_16_neon(step2[30], -cospi[34], cospi[30], &step1[17], &step1[30]);
2293 btf_16_neon(step2[18], cospi[46], cospi[18], &step1[18], &step1[29]);
2294 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
2295 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
2296 btf_16_neon(step2[26], -cospi[42], cospi[22], &step1[21], &step1[26]);
2297 btf_16_neon(step2[22], cospi[38], cospi[26], &step1[22], &step1[25]);
2298 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
2299
2300 step1[32] = vqaddq_s16(step2[32], step2[33]);
2301 step1[33] = vqsubq_s16(step2[32], step2[33]);
2302 step1[34] = vqsubq_s16(step2[35], step2[34]);
2303 step1[35] = vqaddq_s16(step2[35], step2[34]);
2304 step1[36] = vqaddq_s16(step2[36], step2[37]);
2305 step1[37] = vqsubq_s16(step2[36], step2[37]);
2306 step1[38] = vqsubq_s16(step2[39], step2[38]);
2307 step1[39] = vqaddq_s16(step2[39], step2[38]);
2308 step1[40] = vqaddq_s16(step2[40], step2[41]);
2309 step1[41] = vqsubq_s16(step2[40], step2[41]);
2310 step1[42] = vqsubq_s16(step2[43], step2[42]);
2311 step1[43] = vqaddq_s16(step2[43], step2[42]);
2312 step1[44] = vqaddq_s16(step2[44], step2[45]);
2313 step1[45] = vqsubq_s16(step2[44], step2[45]);
2314 step1[46] = vqsubq_s16(step2[47], step2[46]);
2315 step1[47] = vqaddq_s16(step2[47], step2[46]);
2316 step1[48] = vqaddq_s16(step2[48], step2[49]);
2317 step1[49] = vqsubq_s16(step2[48], step2[49]);
2318 step1[50] = vqsubq_s16(step2[51], step2[50]);
2319 step1[51] = vqaddq_s16(step2[51], step2[50]);
2320 step1[52] = vqaddq_s16(step2[52], step2[53]);
2321 step1[53] = vqsubq_s16(step2[52], step2[53]);
2322 step1[54] = vqsubq_s16(step2[55], step2[54]);
2323 step1[55] = vqaddq_s16(step2[55], step2[54]);
2324 step1[56] = vqaddq_s16(step2[56], step2[57]);
2325 step1[57] = vqsubq_s16(step2[56], step2[57]);
2326 step1[58] = vqsubq_s16(step2[59], step2[58]);
2327 step1[59] = vqaddq_s16(step2[59], step2[58]);
2328 step1[60] = vqaddq_s16(step2[60], step2[61]);
2329 step1[61] = vqsubq_s16(step2[60], step2[61]);
2330 step1[62] = vqsubq_s16(step2[63], step2[62]);
2331 step1[63] = vqaddq_s16(step2[63], step2[62]);
2332
2333 // stage 4
2334
2335 step2[0] = step1[0];
2336 step2[2] = step1[2];
2337 step2[4] = step1[4];
2338 step2[6] = step1[6];
2339
2340 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
2341 btf_16_neon(step1[14], -cospi[36], cospi[28], &step2[9], &step2[14]);
2342 btf_16_neon(step1[10], cospi[44], cospi[20], &step2[10], &step2[13]);
2343 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
2344 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
2345 btf_16_lane_1_0_neon(vnegq_s16(step1[34]), vnegq_s16(step1[61]), c0,
2346 &step2[34], &step2[61]);
2347 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
2348 btf_16_lane_3_2_neon(vnegq_s16(step1[38]), vnegq_s16(step1[57]), c0,
2349 &step2[38], &step2[57]);
2350 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
2351 btf_16_lane_1_0_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c1,
2352 &step2[42], &step2[53]);
2353 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
2354 btf_16_lane_3_2_neon(vnegq_s16(step1[46]), vnegq_s16(step1[49]), c1,
2355 &step2[46], &step2[49]);
2356
2357 step2[16] = vqaddq_s16(step1[16], step1[17]);
2358 step2[17] = vqsubq_s16(step1[16], step1[17]);
2359 step2[18] = vqsubq_s16(step1[19], step1[18]);
2360 step2[19] = vqaddq_s16(step1[19], step1[18]);
2361 step2[20] = vqaddq_s16(step1[20], step1[21]);
2362 step2[21] = vqsubq_s16(step1[20], step1[21]);
2363 step2[22] = vqsubq_s16(step1[23], step1[22]);
2364 step2[23] = vqaddq_s16(step1[23], step1[22]);
2365 step2[24] = vqaddq_s16(step1[24], step1[25]);
2366 step2[25] = vqsubq_s16(step1[24], step1[25]);
2367 step2[26] = vqsubq_s16(step1[27], step1[26]);
2368 step2[27] = vqaddq_s16(step1[27], step1[26]);
2369 step2[28] = vqaddq_s16(step1[28], step1[29]);
2370 step2[29] = vqsubq_s16(step1[28], step1[29]);
2371 step2[30] = vqsubq_s16(step1[31], step1[30]);
2372 step2[31] = vqaddq_s16(step1[31], step1[30]);
2373 step2[32] = step1[32];
2374 step2[35] = step1[35];
2375 step2[36] = step1[36];
2376 step2[39] = step1[39];
2377 step2[40] = step1[40];
2378 step2[43] = step1[43];
2379 step2[44] = step1[44];
2380 step2[47] = step1[47];
2381 step2[48] = step1[48];
2382 step2[51] = step1[51];
2383 step2[52] = step1[52];
2384 step2[55] = step1[55];
2385 step2[56] = step1[56];
2386 step2[59] = step1[59];
2387 step2[60] = step1[60];
2388 step2[63] = step1[63];
2389
2390 // stage 5
2391
2392 step1[0] = step2[0];
2393 step1[2] = step2[2];
2394
2395 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
2396 btf_16_neon(step2[6], -cospi[40], cospi[24], &step1[5], &step1[6]);
2397 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
2398 btf_16_lane_1_0_neon(vnegq_s16(step2[18]), vnegq_s16(step2[29]), c2,
2399 &step1[18], &step1[29]);
2400 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
2401 btf_16_lane_3_2_neon(vnegq_s16(step2[22]), vnegq_s16(step2[25]), c2,
2402 &step1[22], &step1[25]);
2403
2404 step1[8] = vqaddq_s16(step2[8], step2[9]);
2405 step1[9] = vqsubq_s16(step2[8], step2[9]);
2406 step1[10] = vqsubq_s16(step2[11], step2[10]);
2407 step1[11] = vqaddq_s16(step2[11], step2[10]);
2408 step1[12] = vqaddq_s16(step2[12], step2[13]);
2409 step1[13] = vqsubq_s16(step2[12], step2[13]);
2410 step1[14] = vqsubq_s16(step2[15], step2[14]);
2411 step1[15] = vqaddq_s16(step2[15], step2[14]);
2412 step1[16] = step2[16];
2413 step1[19] = step2[19];
2414 step1[20] = step2[20];
2415 step1[23] = step2[23];
2416 step1[24] = step2[24];
2417 step1[27] = step2[27];
2418 step1[28] = step2[28];
2419 step1[31] = step2[31];
2420 step1[32] = vqaddq_s16(step2[32], step2[35]);
2421 step1[33] = vqaddq_s16(step2[33], step2[34]);
2422 step1[34] = vqsubq_s16(step2[33], step2[34]);
2423 step1[35] = vqsubq_s16(step2[32], step2[35]);
2424 step1[36] = vqsubq_s16(step2[39], step2[36]);
2425 step1[37] = vqsubq_s16(step2[38], step2[37]);
2426 step1[38] = vqaddq_s16(step2[38], step2[37]);
2427 step1[39] = vqaddq_s16(step2[39], step2[36]);
2428 step1[40] = vqaddq_s16(step2[40], step2[43]);
2429 step1[41] = vqaddq_s16(step2[41], step2[42]);
2430 step1[42] = vqsubq_s16(step2[41], step2[42]);
2431 step1[43] = vqsubq_s16(step2[40], step2[43]);
2432 step1[44] = vqsubq_s16(step2[47], step2[44]);
2433 step1[45] = vqsubq_s16(step2[46], step2[45]);
2434 step1[46] = vqaddq_s16(step2[46], step2[45]);
2435 step1[47] = vqaddq_s16(step2[47], step2[44]);
2436 step1[48] = vqaddq_s16(step2[48], step2[51]);
2437 step1[49] = vqaddq_s16(step2[49], step2[50]);
2438 step1[50] = vqsubq_s16(step2[49], step2[50]);
2439 step1[51] = vqsubq_s16(step2[48], step2[51]);
2440 step1[52] = vqsubq_s16(step2[55], step2[52]);
2441 step1[53] = vqsubq_s16(step2[54], step2[53]);
2442 step1[54] = vqaddq_s16(step2[54], step2[53]);
2443 step1[55] = vqaddq_s16(step2[55], step2[52]);
2444 step1[56] = vqaddq_s16(step2[56], step2[59]);
2445 step1[57] = vqaddq_s16(step2[57], step2[58]);
2446 step1[58] = vqsubq_s16(step2[57], step2[58]);
2447 step1[59] = vqsubq_s16(step2[56], step2[59]);
2448 step1[60] = vqsubq_s16(step2[63], step2[60]);
2449 step1[61] = vqsubq_s16(step2[62], step2[61]);
2450 step1[62] = vqaddq_s16(step2[62], step2[61]);
2451 step1[63] = vqaddq_s16(step2[63], step2[60]);
2452
2453 // stage 6
2454
2455 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
2456 btf_16_neon(step1[2], cospi[48], cospi[16], &step2[2], &step2[3]);
2457 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
2458 btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
2459 &step2[10], &step2[13]);
2460 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
2461 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
2462 btf_16_lane_1_0_neon(vnegq_s16(step1[36]), vnegq_s16(step1[59]), c2,
2463 &step2[36], &step2[59]);
2464 btf_16_lane_1_0_neon(vnegq_s16(step1[37]), vnegq_s16(step1[58]), c2,
2465 &step2[37], &step2[58]);
2466 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
2467 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
2468 btf_16_lane_3_2_neon(vnegq_s16(step1[44]), vnegq_s16(step1[51]), c2,
2469 &step2[44], &step2[51]);
2470 btf_16_lane_3_2_neon(vnegq_s16(step1[45]), vnegq_s16(step1[50]), c2,
2471 &step2[45], &step2[50]);
2472
2473 step2[4] = vqaddq_s16(step1[4], step1[5]);
2474 step2[5] = vqsubq_s16(step1[4], step1[5]);
2475 step2[6] = vqsubq_s16(step1[7], step1[6]);
2476 step2[7] = vqaddq_s16(step1[7], step1[6]);
2477 step2[8] = step1[8];
2478 step2[11] = step1[11];
2479 step2[12] = step1[12];
2480 step2[15] = step1[15];
2481 step2[16] = vqaddq_s16(step1[16], step1[19]);
2482 step2[17] = vqaddq_s16(step1[17], step1[18]);
2483 step2[18] = vqsubq_s16(step1[17], step1[18]);
2484 step2[19] = vqsubq_s16(step1[16], step1[19]);
2485 step2[20] = vqsubq_s16(step1[23], step1[20]);
2486 step2[21] = vqsubq_s16(step1[22], step1[21]);
2487 step2[22] = vqaddq_s16(step1[22], step1[21]);
2488 step2[23] = vqaddq_s16(step1[23], step1[20]);
2489 step2[24] = vqaddq_s16(step1[24], step1[27]);
2490 step2[25] = vqaddq_s16(step1[25], step1[26]);
2491 step2[26] = vqsubq_s16(step1[25], step1[26]);
2492 step2[27] = vqsubq_s16(step1[24], step1[27]);
2493 step2[28] = vqsubq_s16(step1[31], step1[28]);
2494 step2[29] = vqsubq_s16(step1[30], step1[29]);
2495 step2[30] = vqaddq_s16(step1[30], step1[29]);
2496 step2[31] = vqaddq_s16(step1[31], step1[28]);
2497 step2[32] = step1[32];
2498 step2[33] = step1[33];
2499 step2[38] = step1[38];
2500 step2[39] = step1[39];
2501 step2[40] = step1[40];
2502 step2[41] = step1[41];
2503 step2[46] = step1[46];
2504 step2[47] = step1[47];
2505 step2[48] = step1[48];
2506 step2[49] = step1[49];
2507 step2[54] = step1[54];
2508 step2[55] = step1[55];
2509 step2[56] = step1[56];
2510 step2[57] = step1[57];
2511 step2[62] = step1[62];
2512 step2[63] = step1[63];
2513
2514 // stage 7
2515
2516 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
2517 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
2518 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
2519 btf_16_lane_3_2_neon(vnegq_s16(step2[20]), vnegq_s16(step2[27]), c3,
2520 &step1[20], &step1[27]);
2521 btf_16_lane_3_2_neon(vnegq_s16(step2[21]), vnegq_s16(step2[26]), c3,
2522 &step1[21], &step1[26]);
2523
2524 step1[0] = vqaddq_s16(step2[0], step2[3]);
2525 step1[1] = vqaddq_s16(step2[1], step2[2]);
2526 step1[2] = vqsubq_s16(step2[1], step2[2]);
2527 step1[3] = vqsubq_s16(step2[0], step2[3]);
2528 step1[4] = step2[4];
2529 step1[7] = step2[7];
2530 step1[8] = vqaddq_s16(step2[8], step2[11]);
2531 step1[9] = vqaddq_s16(step2[9], step2[10]);
2532 step1[10] = vqsubq_s16(step2[9], step2[10]);
2533 step1[11] = vqsubq_s16(step2[8], step2[11]);
2534 step1[12] = vqsubq_s16(step2[15], step2[12]);
2535 step1[13] = vqsubq_s16(step2[14], step2[13]);
2536 step1[14] = vqaddq_s16(step2[14], step2[13]);
2537 step1[15] = vqaddq_s16(step2[15], step2[12]);
2538 step1[16] = step2[16];
2539 step1[17] = step2[17];
2540 step1[22] = step2[22];
2541 step1[23] = step2[23];
2542 step1[24] = step2[24];
2543 step1[25] = step2[25];
2544 step1[30] = step2[30];
2545 step1[31] = step2[31];
2546 step1[32] = vqaddq_s16(step2[32], step2[39]);
2547 step1[33] = vqaddq_s16(step2[33], step2[38]);
2548 step1[34] = vqaddq_s16(step2[34], step2[37]);
2549 step1[35] = vqaddq_s16(step2[35], step2[36]);
2550 step1[36] = vqsubq_s16(step2[35], step2[36]);
2551 step1[37] = vqsubq_s16(step2[34], step2[37]);
2552 step1[38] = vqsubq_s16(step2[33], step2[38]);
2553 step1[39] = vqsubq_s16(step2[32], step2[39]);
2554 step1[40] = vqsubq_s16(step2[47], step2[40]);
2555 step1[41] = vqsubq_s16(step2[46], step2[41]);
2556 step1[42] = vqsubq_s16(step2[45], step2[42]);
2557 step1[43] = vqsubq_s16(step2[44], step2[43]);
2558 step1[44] = vqaddq_s16(step2[43], step2[44]);
2559 step1[45] = vqaddq_s16(step2[42], step2[45]);
2560 step1[46] = vqaddq_s16(step2[41], step2[46]);
2561 step1[47] = vqaddq_s16(step2[40], step2[47]);
2562 step1[48] = vqaddq_s16(step2[48], step2[55]);
2563 step1[49] = vqaddq_s16(step2[49], step2[54]);
2564 step1[50] = vqaddq_s16(step2[50], step2[53]);
2565 step1[51] = vqaddq_s16(step2[51], step2[52]);
2566 step1[52] = vqsubq_s16(step2[51], step2[52]);
2567 step1[53] = vqsubq_s16(step2[50], step2[53]);
2568 step1[54] = vqsubq_s16(step2[49], step2[54]);
2569 step1[55] = vqsubq_s16(step2[48], step2[55]);
2570 step1[56] = vqsubq_s16(step2[63], step2[56]);
2571 step1[57] = vqsubq_s16(step2[62], step2[57]);
2572 step1[58] = vqsubq_s16(step2[61], step2[58]);
2573 step1[59] = vqsubq_s16(step2[60], step2[59]);
2574 step1[60] = vqaddq_s16(step2[59], step2[60]);
2575 step1[61] = vqaddq_s16(step2[58], step2[61]);
2576 step1[62] = vqaddq_s16(step2[57], step2[62]);
2577 step1[63] = vqaddq_s16(step2[56], step2[63]);
2578
2579 // stage 8
2580
2581 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
2582 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
2583 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
2584 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
2585 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
2586 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
2587 btf_16_lane_3_2_neon(vnegq_s16(step1[40]), vnegq_s16(step1[55]), c3,
2588 &step2[40], &step2[55]);
2589 btf_16_lane_3_2_neon(vnegq_s16(step1[41]), vnegq_s16(step1[54]), c3,
2590 &step2[41], &step2[54]);
2591 btf_16_lane_3_2_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c3,
2592 &step2[42], &step2[53]);
2593 btf_16_lane_3_2_neon(vnegq_s16(step1[43]), vnegq_s16(step1[52]), c3,
2594 &step2[43], &step2[52]);
2595
2596 step2[0] = vqaddq_s16(step1[0], step1[7]);
2597 step2[1] = vqaddq_s16(step1[1], step1[6]);
2598 step2[2] = vqaddq_s16(step1[2], step1[5]);
2599 step2[3] = vqaddq_s16(step1[3], step1[4]);
2600 step2[4] = vqsubq_s16(step1[3], step1[4]);
2601 step2[5] = vqsubq_s16(step1[2], step1[5]);
2602 step2[6] = vqsubq_s16(step1[1], step1[6]);
2603 step2[7] = vqsubq_s16(step1[0], step1[7]);
2604 step2[8] = step1[8];
2605 step2[9] = step1[9];
2606 step2[14] = step1[14];
2607 step2[15] = step1[15];
2608 step2[16] = vqaddq_s16(step1[16], step1[23]);
2609 step2[17] = vqaddq_s16(step1[17], step1[22]);
2610 step2[18] = vqaddq_s16(step1[18], step1[21]);
2611 step2[19] = vqaddq_s16(step1[19], step1[20]);
2612 step2[20] = vqsubq_s16(step1[19], step1[20]);
2613 step2[21] = vqsubq_s16(step1[18], step1[21]);
2614 step2[22] = vqsubq_s16(step1[17], step1[22]);
2615 step2[23] = vqsubq_s16(step1[16], step1[23]);
2616 step2[24] = vqsubq_s16(step1[31], step1[24]);
2617 step2[25] = vqsubq_s16(step1[30], step1[25]);
2618 step2[26] = vqsubq_s16(step1[29], step1[26]);
2619 step2[27] = vqsubq_s16(step1[28], step1[27]);
2620 step2[28] = vqaddq_s16(step1[28], step1[27]);
2621 step2[29] = vqaddq_s16(step1[29], step1[26]);
2622 step2[30] = vqaddq_s16(step1[30], step1[25]);
2623 step2[31] = vqaddq_s16(step1[31], step1[24]);
2624 step2[32] = step1[32];
2625 step2[33] = step1[33];
2626 step2[34] = step1[34];
2627 step2[35] = step1[35];
2628 step2[44] = step1[44];
2629 step2[45] = step1[45];
2630 step2[46] = step1[46];
2631 step2[47] = step1[47];
2632 step2[48] = step1[48];
2633 step2[49] = step1[49];
2634 step2[50] = step1[50];
2635 step2[51] = step1[51];
2636 step2[60] = step1[60];
2637 step2[61] = step1[61];
2638 step2[62] = step1[62];
2639 step2[63] = step1[63];
2640
2641 // stage 9
2642 idct64_stage9_neon(step2, step1, cos_bit);
2643
2644 // stage 10
2645 idct64_stage10_neon(step1, step2, cos_bit);
2646
2647 // stage 11
2648
2649 out[0] = vqaddq_s16(step2[0], step2[63]);
2650 out[1] = vqaddq_s16(step2[1], step2[62]);
2651 out[2] = vqaddq_s16(step2[2], step2[61]);
2652 out[3] = vqaddq_s16(step2[3], step2[60]);
2653 out[4] = vqaddq_s16(step2[4], step2[59]);
2654 out[5] = vqaddq_s16(step2[5], step2[58]);
2655 out[6] = vqaddq_s16(step2[6], step2[57]);
2656 out[7] = vqaddq_s16(step2[7], step2[56]);
2657 out[8] = vqaddq_s16(step2[8], step2[55]);
2658 out[9] = vqaddq_s16(step2[9], step2[54]);
2659 out[10] = vqaddq_s16(step2[10], step2[53]);
2660 out[11] = vqaddq_s16(step2[11], step2[52]);
2661 out[12] = vqaddq_s16(step2[12], step2[51]);
2662 out[13] = vqaddq_s16(step2[13], step2[50]);
2663 out[14] = vqaddq_s16(step2[14], step2[49]);
2664 out[15] = vqaddq_s16(step2[15], step2[48]);
2665 out[16] = vqaddq_s16(step2[16], step2[47]);
2666 out[17] = vqaddq_s16(step2[17], step2[46]);
2667 out[18] = vqaddq_s16(step2[18], step2[45]);
2668 out[19] = vqaddq_s16(step2[19], step2[44]);
2669 out[20] = vqaddq_s16(step2[20], step2[43]);
2670 out[21] = vqaddq_s16(step2[21], step2[42]);
2671 out[22] = vqaddq_s16(step2[22], step2[41]);
2672 out[23] = vqaddq_s16(step2[23], step2[40]);
2673 out[24] = vqaddq_s16(step2[24], step2[39]);
2674 out[25] = vqaddq_s16(step2[25], step2[38]);
2675 out[26] = vqaddq_s16(step2[26], step2[37]);
2676 out[27] = vqaddq_s16(step2[27], step2[36]);
2677 out[28] = vqaddq_s16(step2[28], step2[35]);
2678 out[29] = vqaddq_s16(step2[29], step2[34]);
2679 out[30] = vqaddq_s16(step2[30], step2[33]);
2680 out[31] = vqaddq_s16(step2[31], step2[32]);
2681 out[32] = vqsubq_s16(step2[31], step2[32]);
2682 out[33] = vqsubq_s16(step2[30], step2[33]);
2683 out[34] = vqsubq_s16(step2[29], step2[34]);
2684 out[35] = vqsubq_s16(step2[28], step2[35]);
2685 out[36] = vqsubq_s16(step2[27], step2[36]);
2686 out[37] = vqsubq_s16(step2[26], step2[37]);
2687 out[38] = vqsubq_s16(step2[25], step2[38]);
2688 out[39] = vqsubq_s16(step2[24], step2[39]);
2689 out[40] = vqsubq_s16(step2[23], step2[40]);
2690 out[41] = vqsubq_s16(step2[22], step2[41]);
2691 out[42] = vqsubq_s16(step2[21], step2[42]);
2692 out[43] = vqsubq_s16(step2[20], step2[43]);
2693 out[44] = vqsubq_s16(step2[19], step2[44]);
2694 out[45] = vqsubq_s16(step2[18], step2[45]);
2695 out[46] = vqsubq_s16(step2[17], step2[46]);
2696 out[47] = vqsubq_s16(step2[16], step2[47]);
2697 out[48] = vqsubq_s16(step2[15], step2[48]);
2698 out[49] = vqsubq_s16(step2[14], step2[49]);
2699 out[50] = vqsubq_s16(step2[13], step2[50]);
2700 out[51] = vqsubq_s16(step2[12], step2[51]);
2701 out[52] = vqsubq_s16(step2[11], step2[52]);
2702 out[53] = vqsubq_s16(step2[10], step2[53]);
2703 out[54] = vqsubq_s16(step2[9], step2[54]);
2704 out[55] = vqsubq_s16(step2[8], step2[55]);
2705 out[56] = vqsubq_s16(step2[7], step2[56]);
2706 out[57] = vqsubq_s16(step2[6], step2[57]);
2707 out[58] = vqsubq_s16(step2[5], step2[58]);
2708 out[59] = vqsubq_s16(step2[4], step2[59]);
2709 out[60] = vqsubq_s16(step2[3], step2[60]);
2710 out[61] = vqsubq_s16(step2[2], step2[61]);
2711 out[62] = vqsubq_s16(step2[1], step2[62]);
2712 out[63] = vqsubq_s16(step2[0], step2[63]);
2713}
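// For reference: stage 11 is the same saturating butterfly in every idct64
// variant. A compact scalar model (illustrative only; hypothetical helper
// names; assumes INT16_MAX/INT16_MIN from <stdint.h>, which arm_neon.h
// already pulls in):
static INLINE int16_t sat16_ref_c(int32_t v) {
  // Mirrors the saturation performed by vqaddq_s16/vqsubq_s16.
  if (v > INT16_MAX) return INT16_MAX;
  if (v < INT16_MIN) return INT16_MIN;
  return (int16_t)v;
}

static INLINE void idct64_stage11_ref_c(const int16_t *step2, int16_t *out) {
  for (int i = 0; i < 32; ++i) {
    const int32_t a = step2[i];
    const int32_t b = step2[63 - i];
    out[i] = sat16_ref_c(a + b);       // top half: sums
    out[63 - i] = sat16_ref_c(a - b);  // bottom half: mirrored differences
  }
}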
2714
2715static INLINE void idct64_low1_new_neon(int16x8_t *input, int16x8_t *out,
2716 int8_t cos_bit, int bit) {
2717 (void)bit;
2718 const int32_t *cospi = cospi_arr(cos_bit);
2719 int16x8_t step1;
2720 int32x4_t t32[2];
2721
2722 // stage 1
2723 // stage 2
2724 // stage 3
2725 // stage 4
2726 // stage 5
2727 // stage 6
2728
2729 t32[0] = vmull_n_s16(vget_low_s16(input[0]), cospi[32]);
2730 t32[1] = vmull_n_s16(vget_high_s16(input[0]), cospi[32]);
2731
2732 step1 = vcombine_s16(vrshrn_n_s32(t32[0], INV_COS_BIT),
2733 vrshrn_n_s32(t32[1], INV_COS_BIT));
2734 // stage 7
2735 // stage 8
2736 // stage 9
2737 // stage 10
2738 // stage 11
2739 out[0] = step1;
2740 out[1] = step1;
2741 out[2] = step1;
2742 out[3] = step1;
2743 out[4] = step1;
2744 out[5] = step1;
2745 out[6] = step1;
2746 out[7] = step1;
2747 out[8] = step1;
2748 out[9] = step1;
2749 out[10] = step1;
2750 out[11] = step1;
2751 out[12] = step1;
2752 out[13] = step1;
2753 out[14] = step1;
2754 out[15] = step1;
2755 out[16] = step1;
2756 out[17] = step1;
2757 out[18] = step1;
2758 out[19] = step1;
2759 out[20] = step1;
2760 out[21] = step1;
2761 out[22] = step1;
2762 out[23] = step1;
2763 out[24] = step1;
2764 out[25] = step1;
2765 out[26] = step1;
2766 out[27] = step1;
2767 out[28] = step1;
2768 out[29] = step1;
2769 out[30] = step1;
2770 out[31] = step1;
2771 out[32] = step1;
2772 out[33] = step1;
2773 out[34] = step1;
2774 out[35] = step1;
2775 out[36] = step1;
2776 out[37] = step1;
2777 out[38] = step1;
2778 out[39] = step1;
2779 out[40] = step1;
2780 out[41] = step1;
2781 out[42] = step1;
2782 out[43] = step1;
2783 out[44] = step1;
2784 out[45] = step1;
2785 out[46] = step1;
2786 out[47] = step1;
2787 out[48] = step1;
2788 out[49] = step1;
2789 out[50] = step1;
2790 out[51] = step1;
2791 out[52] = step1;
2792 out[53] = step1;
2793 out[54] = step1;
2794 out[55] = step1;
2795 out[56] = step1;
2796 out[57] = step1;
2797 out[58] = step1;
2798 out[59] = step1;
2799 out[60] = step1;
2800 out[61] = step1;
2801 out[62] = step1;
2802 out[63] = step1;
2803}
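// Note: as with the idct32 DC-only path, the function above reduces to a
// single cospi[32] scaling broadcast to all 64 outputs; see the scalar
// sketch following the idct32 low1 tail earlier in this file.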
2804
2805static INLINE void idct64_low8_new_neon(int16x8_t *in, int16x8_t *out,
2806 int8_t cos_bit, int bit) {
2807 (void)bit;
2808 const int32_t *cospi = cospi_arr(cos_bit);
2809 int16x8_t step2[64], step1[64];
2810
2811 const int16x4_t c0 =
2812 create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
2813 (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
2814 const int16x4_t c1 =
2815 create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
2816 (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
2817 const int16x4_t c2 =
2818 create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
2819 (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
2820 const int16x4_t c3 =
2821 create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
2822 (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
2823
2824 // stage 1
2825 // stage 2
2826
2827 step2[0] = in[0];
2828 step2[8] = in[4];
2829 step2[16] = in[2];
2830 step2[24] = in[6];
2831
2832 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
2833 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
2834 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
2835 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
2836
2837 // stage 3
2838
2839 step1[0] = step2[0];
2840 step1[8] = step2[8];
2841
2842 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
2843 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
2844
2845 step1[32] = step2[32];
2846 step1[33] = step2[32];
2847 step1[38] = step2[39];
2848 step1[39] = step2[39];
2849 step1[40] = step2[40];
2850 step1[41] = step2[40];
2851 step1[46] = step2[47];
2852 step1[47] = step2[47];
2853 step1[48] = step2[48];
2854 step1[49] = step2[48];
2855 step1[54] = step2[55];
2856 step1[55] = step2[55];
2857 step1[56] = step2[56];
2858 step1[57] = step2[56];
2859 step1[62] = step2[63];
2860 step1[63] = step2[63];
2861
2862 // stage 4
2863
2864 step2[0] = step1[0];
2865
2866 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
2867 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
2868 btf_16_lane_3_2_neon(vnegq_s16(step1[38]), vnegq_s16(step1[57]), c0,
2869 &step2[38], &step2[57]);
2870 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
2871 btf_16_lane_3_2_neon(vnegq_s16(step1[46]), vnegq_s16(step1[49]), c1,
2872 &step2[46], &step2[49]);
2873
2874 step2[16] = step1[16];
2875 step2[17] = step1[16];
2876 step2[22] = step1[23];
2877 step2[23] = step1[23];
2878 step2[24] = step1[24];
2879 step2[25] = step1[24];
2880 step2[30] = step1[31];
2881 step2[31] = step1[31];
2882 step2[32] = step1[32];
2883 step2[39] = step1[39];
2884 step2[40] = step1[40];
2885 step2[47] = step1[47];
2886 step2[48] = step1[48];
2887 step2[55] = step1[55];
2888 step2[56] = step1[56];
2889 step2[63] = step1[63];
2890
2891 // stage 5
2892
2893 step1[0] = step2[0];
2894
2895 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
2896 btf_16_lane_3_2_neon(vnegq_s16(step2[22]), vnegq_s16(step2[25]), c2,
2897 &step1[22], &step1[25]);
2898
2899 step1[8] = step2[8];
2900 step1[9] = step2[8];
2901 step1[14] = step2[15];
2902 step1[15] = step2[15];
2903
2904 step1[16] = step2[16];
2905 step1[23] = step2[23];
2906 step1[24] = step2[24];
2907 step1[31] = step2[31];
2908 step1[32] = step2[32];
2909 step1[33] = step2[33];
2910 step1[34] = step2[33];
2911 step1[35] = step2[32];
2912 step1[36] = step2[39];
2913 step1[37] = step2[38];
2914 step1[38] = step2[38];
2915 step1[39] = step2[39];
2916 step1[40] = step2[40];
2917 step1[41] = step2[41];
2918 step1[42] = step2[41];
2919 step1[43] = step2[40];
2920 step1[44] = step2[47];
2921 step1[45] = step2[46];
2922 step1[46] = step2[46];
2923 step1[47] = step2[47];
2924 step1[48] = step2[48];
2925 step1[49] = step2[49];
2926 step1[50] = step2[49];
2927 step1[51] = step2[48];
2928 step1[52] = step2[55];
2929 step1[53] = step2[54];
2930 step1[54] = step2[54];
2931 step1[55] = step2[55];
2932 step1[56] = step2[56];
2933 step1[57] = step2[57];
2934 step1[58] = step2[57];
2935 step1[59] = step2[56];
2936 step1[60] = step2[63];
2937 step1[61] = step2[62];
2938 step1[62] = step2[62];
2939 step1[63] = step2[63];
2940
2941 // stage 6
2942
2943 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
2944 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
2945 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
2946 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
2947 btf_16_lane_1_0_neon(vnegq_s16(step1[36]), vnegq_s16(step1[59]), c2,
2948 &step2[36], &step2[59]);
2949 btf_16_lane_1_0_neon(vnegq_s16(step1[37]), vnegq_s16(step1[58]), c2,
2950 &step2[37], &step2[58]);
2951 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
2952 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
2953 btf_16_lane_3_2_neon(vnegq_s16(step1[44]), vnegq_s16(step1[51]), c2,
2954 &step2[44], &step2[51]);
2955 btf_16_lane_3_2_neon(vnegq_s16(step1[45]), vnegq_s16(step1[50]), c2,
2956 &step2[45], &step2[50]);
2957
2958 step2[8] = step1[8];
2959 step2[15] = step1[15];
2960 step2[16] = step1[16];
2961 step2[17] = step1[17];
2962 step2[18] = step1[17];
2963 step2[19] = step1[16];
2964 step2[20] = step1[23];
2965 step2[21] = step1[22];
2966 step2[22] = step1[22];
2967 step2[23] = step1[23];
2968 step2[24] = step1[24];
2969 step2[25] = step1[25];
2970 step2[26] = step1[25];
2971 step2[27] = step1[24];
2972 step2[28] = step1[31];
2973 step2[29] = step1[30];
2974 step2[30] = step1[30];
2975 step2[31] = step1[31];
2976 step2[32] = step1[32];
2977 step2[33] = step1[33];
2978 step2[38] = step1[38];
2979 step2[39] = step1[39];
2980 step2[40] = step1[40];
2981 step2[41] = step1[41];
2982 step2[46] = step1[46];
2983 step2[47] = step1[47];
2984 step2[48] = step1[48];
2985 step2[49] = step1[49];
2986 step2[54] = step1[54];
2987 step2[55] = step1[55];
2988 step2[56] = step1[56];
2989 step2[57] = step1[57];
2990 step2[62] = step1[62];
2991 step2[63] = step1[63];
2992
2993 // stage 7
2994
2995 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
2996 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
2997 btf_16_lane_3_2_neon(vnegq_s16(step2[20]), vnegq_s16(step2[27]), c3,
2998 &step1[20], &step1[27]);
2999 btf_16_lane_3_2_neon(vnegq_s16(step2[21]), vnegq_s16(step2[26]), c3,
3000 &step1[21], &step1[26]);
3001
3002 step1[0] = step2[0];
3003 step1[1] = step2[1];
3004 step1[2] = step2[1];
3005 step1[3] = step2[0];
3006 step1[8] = step2[8];
3007 step1[9] = step2[9];
3008 step1[10] = step2[9];
3009 step1[11] = step2[8];
3010 step1[12] = step2[15];
3011 step1[13] = step2[14];
3012 step1[14] = step2[14];
3013 step1[15] = step2[15];
3014 step1[16] = step2[16];
3015 step1[17] = step2[17];
3016 step1[22] = step2[22];
3017 step1[23] = step2[23];
3018 step1[24] = step2[24];
3019 step1[25] = step2[25];
3020 step1[30] = step2[30];
3021 step1[31] = step2[31];
3022 step1[32] = vqaddq_s16(step2[32], step2[39]);
3023 step1[33] = vqaddq_s16(step2[33], step2[38]);
3024 step1[34] = vqaddq_s16(step2[34], step2[37]);
3025 step1[35] = vqaddq_s16(step2[35], step2[36]);
3026 step1[36] = vqsubq_s16(step2[35], step2[36]);
3027 step1[37] = vqsubq_s16(step2[34], step2[37]);
3028 step1[38] = vqsubq_s16(step2[33], step2[38]);
3029 step1[39] = vqsubq_s16(step2[32], step2[39]);
3030 step1[40] = vqsubq_s16(step2[47], step2[40]);
3031 step1[41] = vqsubq_s16(step2[46], step2[41]);
3032 step1[42] = vqsubq_s16(step2[45], step2[42]);
3033 step1[43] = vqsubq_s16(step2[44], step2[43]);
3034 step1[44] = vqaddq_s16(step2[43], step2[44]);
3035 step1[45] = vqaddq_s16(step2[42], step2[45]);
3036 step1[46] = vqaddq_s16(step2[41], step2[46]);
3037 step1[47] = vqaddq_s16(step2[40], step2[47]);
3038 step1[48] = vqaddq_s16(step2[48], step2[55]);
3039 step1[49] = vqaddq_s16(step2[49], step2[54]);
3040 step1[50] = vqaddq_s16(step2[50], step2[53]);
3041 step1[51] = vqaddq_s16(step2[51], step2[52]);
3042 step1[52] = vqsubq_s16(step2[51], step2[52]);
3043 step1[53] = vqsubq_s16(step2[50], step2[53]);
3044 step1[54] = vqsubq_s16(step2[49], step2[54]);
3045 step1[55] = vqsubq_s16(step2[48], step2[55]);
3046 step1[56] = vqsubq_s16(step2[63], step2[56]);
3047 step1[57] = vqsubq_s16(step2[62], step2[57]);
3048 step1[58] = vqsubq_s16(step2[61], step2[58]);
3049 step1[59] = vqsubq_s16(step2[60], step2[59]);
3050 step1[60] = vqaddq_s16(step2[59], step2[60]);
3051 step1[61] = vqaddq_s16(step2[58], step2[61]);
3052 step1[62] = vqaddq_s16(step2[57], step2[62]);
3053 step1[63] = vqaddq_s16(step2[56], step2[63]);
3054
3055 // stage 8
3056
3057 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
3058 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
3059 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
3060 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
3061 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
3062 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
3063 btf_16_lane_3_2_neon(vnegq_s16(step1[40]), vnegq_s16(step1[55]), c3,
3064 &step2[40], &step2[55]);
3065 btf_16_lane_3_2_neon(vnegq_s16(step1[41]), vnegq_s16(step1[54]), c3,
3066 &step2[41], &step2[54]);
3067 btf_16_lane_3_2_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c3,
3068 &step2[42], &step2[53]);
3069 btf_16_lane_3_2_neon(vnegq_s16(step1[43]), vnegq_s16(step1[52]), c3,
3070 &step2[43], &step2[52]);
3071
3072 step2[0] = step1[0];
3073 step2[1] = step1[1];
3074 step2[2] = step1[2];
3075 step2[3] = step1[3];
3076 step2[4] = step1[3];
3077 step2[5] = step1[2];
3078 step2[6] = step1[1];
3079 step2[7] = step1[0];
3080 step2[8] = step1[8];
3081 step2[9] = step1[9];
3082 step2[14] = step1[14];
3083 step2[15] = step1[15];
3084 step2[16] = vqaddq_s16(step1[16], step1[23]);
3085 step2[17] = vqaddq_s16(step1[17], step1[22]);
3086 step2[18] = vqaddq_s16(step1[18], step1[21]);
3087 step2[19] = vqaddq_s16(step1[19], step1[20]);
3088 step2[20] = vqsubq_s16(step1[19], step1[20]);
3089 step2[21] = vqsubq_s16(step1[18], step1[21]);
3090 step2[22] = vqsubq_s16(step1[17], step1[22]);
3091 step2[23] = vqsubq_s16(step1[16], step1[23]);
3092 step2[24] = vqsubq_s16(step1[31], step1[24]);
3093 step2[25] = vqsubq_s16(step1[30], step1[25]);
3094 step2[26] = vqsubq_s16(step1[29], step1[26]);
3095 step2[27] = vqsubq_s16(step1[28], step1[27]);
3096 step2[28] = vqaddq_s16(step1[28], step1[27]);
3097 step2[29] = vqaddq_s16(step1[29], step1[26]);
3098 step2[30] = vqaddq_s16(step1[30], step1[25]);
3099 step2[31] = vqaddq_s16(step1[31], step1[24]);
3100 step2[32] = step1[32];
3101 step2[33] = step1[33];
3102 step2[34] = step1[34];
3103 step2[35] = step1[35];
3104 step2[44] = step1[44];
3105 step2[45] = step1[45];
3106 step2[46] = step1[46];
3107 step2[47] = step1[47];
3108 step2[48] = step1[48];
3109 step2[49] = step1[49];
3110 step2[50] = step1[50];
3111 step2[51] = step1[51];
3112 step2[60] = step1[60];
3113 step2[61] = step1[61];
3114 step2[62] = step1[62];
3115 step2[63] = step1[63];
3116
3117 // stage 9
3118 idct64_stage9_neon(step2, step1, cos_bit);
3119
3120 // stage 10
3121 idct64_stage10_neon(step1, step2, cos_bit);
3122
3123 // stage 11
3124
3125 out[0] = vqaddq_s16(step2[0], step2[63]);
3126 out[1] = vqaddq_s16(step2[1], step2[62]);
3127 out[2] = vqaddq_s16(step2[2], step2[61]);
3128 out[3] = vqaddq_s16(step2[3], step2[60]);
3129 out[4] = vqaddq_s16(step2[4], step2[59]);
3130 out[5] = vqaddq_s16(step2[5], step2[58]);
3131 out[6] = vqaddq_s16(step2[6], step2[57]);
3132 out[7] = vqaddq_s16(step2[7], step2[56]);
3133 out[8] = vqaddq_s16(step2[8], step2[55]);
3134 out[9] = vqaddq_s16(step2[9], step2[54]);
3135 out[10] = vqaddq_s16(step2[10], step2[53]);
3136 out[11] = vqaddq_s16(step2[11], step2[52]);
3137 out[12] = vqaddq_s16(step2[12], step2[51]);
3138 out[13] = vqaddq_s16(step2[13], step2[50]);
3139 out[14] = vqaddq_s16(step2[14], step2[49]);
3140 out[15] = vqaddq_s16(step2[15], step2[48]);
3141 out[16] = vqaddq_s16(step2[16], step2[47]);
3142 out[17] = vqaddq_s16(step2[17], step2[46]);
3143 out[18] = vqaddq_s16(step2[18], step2[45]);
3144 out[19] = vqaddq_s16(step2[19], step2[44]);
3145 out[20] = vqaddq_s16(step2[20], step2[43]);
3146 out[21] = vqaddq_s16(step2[21], step2[42]);
3147 out[22] = vqaddq_s16(step2[22], step2[41]);
3148 out[23] = vqaddq_s16(step2[23], step2[40]);
3149 out[24] = vqaddq_s16(step2[24], step2[39]);
3150 out[25] = vqaddq_s16(step2[25], step2[38]);
3151 out[26] = vqaddq_s16(step2[26], step2[37]);
3152 out[27] = vqaddq_s16(step2[27], step2[36]);
3153 out[28] = vqaddq_s16(step2[28], step2[35]);
3154 out[29] = vqaddq_s16(step2[29], step2[34]);
3155 out[30] = vqaddq_s16(step2[30], step2[33]);
3156 out[31] = vqaddq_s16(step2[31], step2[32]);
3157 out[32] = vqsubq_s16(step2[31], step2[32]);
3158 out[33] = vqsubq_s16(step2[30], step2[33]);
3159 out[34] = vqsubq_s16(step2[29], step2[34]);
3160 out[35] = vqsubq_s16(step2[28], step2[35]);
3161 out[36] = vqsubq_s16(step2[27], step2[36]);
3162 out[37] = vqsubq_s16(step2[26], step2[37]);
3163 out[38] = vqsubq_s16(step2[25], step2[38]);
3164 out[39] = vqsubq_s16(step2[24], step2[39]);
3165 out[40] = vqsubq_s16(step2[23], step2[40]);
3166 out[41] = vqsubq_s16(step2[22], step2[41]);
3167 out[42] = vqsubq_s16(step2[21], step2[42]);
3168 out[43] = vqsubq_s16(step2[20], step2[43]);
3169 out[44] = vqsubq_s16(step2[19], step2[44]);
3170 out[45] = vqsubq_s16(step2[18], step2[45]);
3171 out[46] = vqsubq_s16(step2[17], step2[46]);
3172 out[47] = vqsubq_s16(step2[16], step2[47]);
3173 out[48] = vqsubq_s16(step2[15], step2[48]);
3174 out[49] = vqsubq_s16(step2[14], step2[49]);
3175 out[50] = vqsubq_s16(step2[13], step2[50]);
3176 out[51] = vqsubq_s16(step2[12], step2[51]);
3177 out[52] = vqsubq_s16(step2[11], step2[52]);
3178 out[53] = vqsubq_s16(step2[10], step2[53]);
3179 out[54] = vqsubq_s16(step2[9], step2[54]);
3180 out[55] = vqsubq_s16(step2[8], step2[55]);
3181 out[56] = vqsubq_s16(step2[7], step2[56]);
3182 out[57] = vqsubq_s16(step2[6], step2[57]);
3183 out[58] = vqsubq_s16(step2[5], step2[58]);
3184 out[59] = vqsubq_s16(step2[4], step2[59]);
3185 out[60] = vqsubq_s16(step2[3], step2[60]);
3186 out[61] = vqsubq_s16(step2[2], step2[61]);
3187 out[62] = vqsubq_s16(step2[1], step2[62]);
3188 out[63] = vqsubq_s16(step2[0], step2[63]);
3189}
3190
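// idct64_low16_new_neon: 64-point inverse DCT specialized for the case where
// only in[0..15] can be nonzero (eob confined to the first 16 rows of the
// column being transformed). Butterflies with a known-zero operand collapse
// into plain copies or single multiplies, which is what the long runs of
// assignments such as step1[33] = step2[32] below encode.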
3191static INLINE void idct64_low16_new_neon(int16x8_t *in, int16x8_t *out,
3192 int8_t cos_bit, int bit) {
3193 (void)bit;
3194 const int32_t *cospi = cospi_arr(cos_bit);
3195 int16x8_t step2[64], step1[64];
3196
3197 const int16x4_t c0 =
3198 create_s16x4_neon((int16_t *)(cospi + 4), (int16_t *)(cospi + 60),
3199 (int16_t *)(cospi + 36), (int16_t *)(cospi + 28));
3200 const int16x4_t c1 =
3201 create_s16x4_neon((int16_t *)(cospi + 20), (int16_t *)(cospi + 44),
3202 (int16_t *)(cospi + 52), (int16_t *)(cospi + 12));
3203 const int16x4_t c2 =
3204 create_s16x4_neon((int16_t *)(cospi + 8), (int16_t *)(cospi + 56),
3205 (int16_t *)(cospi + 40), (int16_t *)(cospi + 24));
3206 const int16x4_t c3 =
3207 create_s16x4_neon((int16_t *)(cospi + 32), (int16_t *)(cospi + 32),
3208 (int16_t *)(cospi + 16), (int16_t *)(cospi + 48));
3209
3210 // stage 1
3211 // stage 2
3212
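  // Stage 1 is a pure input permutation and is folded into stage 2: each
  // nonzero input lands at its idct64 butterfly position (in[0] -> step2[0],
  // in[8] -> step2[4], ...); the remaining step2 entries are implicitly zero
  // and every use of them is elided in the stages below.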
3213 step2[0] = in[0];
3214 step2[4] = in[8];
3215 step2[8] = in[4];
3216 step2[12] = in[12];
3217 step2[16] = in[2];
3218 step2[20] = in[10];
3219 step2[24] = in[6];
3220 step2[28] = in[14];
3221
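  // With one operand known to be zero, each two-input rotation reduces to a
  // pair of scalings; btf_16_neon(x, w0, w1, &o0, &o1) in effect computes
  //   o0 = round_shift(w0 * x, cos_bit);
  //   o1 = round_shift(w1 * x, cos_bit);
  // so the first call below yields step2[32] = cospi[63] * in[1] and
  // step2[63] = cospi[1] * in[1], i.e. stage 2 of the reference
  // av1_idct64_new with the zero inputs dropped.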
3222 btf_16_neon(in[1], cospi[63], cospi[1], &step2[32], &step2[63]);
3223 btf_16_neon(in[15], -cospi[49], cospi[15], &step2[35], &step2[60]);
3224 btf_16_neon(in[9], cospi[55], cospi[9], &step2[36], &step2[59]);
3225 btf_16_neon(in[7], -cospi[57], cospi[7], &step2[39], &step2[56]);
3226 btf_16_neon(in[5], cospi[59], cospi[5], &step2[40], &step2[55]);
3227 btf_16_neon(in[11], -cospi[53], cospi[11], &step2[43], &step2[52]);
3228 btf_16_neon(in[13], cospi[51], cospi[13], &step2[44], &step2[51]);
3229 btf_16_neon(in[3], -cospi[61], cospi[3], &step2[47], &step2[48]);
3230
3231 // stage 3
3232
3233 step1[0] = step2[0];
3234 step1[4] = step2[4];
3235 step1[8] = step2[8];
3236 step1[12] = step2[12];
3237
3238 btf_16_neon(step2[16], cospi[62], cospi[2], &step1[16], &step1[31]);
3239 btf_16_neon(step2[20], cospi[54], cospi[10], &step1[20], &step1[27]);
3240 btf_16_neon(step2[24], -cospi[58], cospi[6], &step1[23], &step1[24]);
3241 btf_16_neon(step2[28], -cospi[50], cospi[14], &step1[19], &step1[28]);
3242
3243 step1[32] = step2[32];
3244 step1[33] = step2[32];
3245 step1[34] = step2[35];
3246 step1[35] = step2[35];
3247 step1[36] = step2[36];
3248 step1[37] = step2[36];
3249 step1[38] = step2[39];
3250 step1[39] = step2[39];
3251 step1[40] = step2[40];
3252 step1[41] = step2[40];
3253 step1[42] = step2[43];
3254 step1[43] = step2[43];
3255 step1[44] = step2[44];
3256 step1[45] = step2[44];
3257 step1[46] = step2[47];
3258 step1[47] = step2[47];
3259 step1[48] = step2[48];
3260 step1[49] = step2[48];
3261 step1[50] = step2[51];
3262 step1[51] = step2[51];
3263 step1[52] = step2[52];
3264 step1[53] = step2[52];
3265 step1[54] = step2[55];
3266 step1[55] = step2[55];
3267 step1[56] = step2[56];
3268 step1[57] = step2[56];
3269 step1[58] = step2[59];
3270 step1[59] = step2[59];
3271 step1[60] = step2[60];
3272 step1[61] = step2[60];
3273 step1[62] = step2[63];
3274 step1[63] = step2[63];
3275
3276 // stage 4
3277
3278 step2[0] = step1[0];
3279 step2[4] = step1[4];
3280
3281 btf_16_neon(step1[8], cospi[60], cospi[4], &step2[8], &step2[15]);
3282 btf_16_neon(step1[12], -cospi[52], cospi[12], &step2[11], &step2[12]);
3283 btf_16_lane_0_1_neon(step1[62], step1[33], c0, &step2[62], &step2[33]);
3284 btf_16_lane_1_0_neon(vnegq_s16(step1[34]), vnegq_s16(step1[61]), c0,
3285 &step2[34], &step2[61]);
3286 btf_16_lane_2_3_neon(step1[58], step1[37], c0, &step2[58], &step2[37]);
3287 btf_16_lane_3_2_neon(vnegq_s16(step1[38]), vnegq_s16(step1[57]), c0,
3288 &step2[38], &step2[57]);
3289 btf_16_lane_0_1_neon(step1[54], step1[41], c1, &step2[54], &step2[41]);
3290 btf_16_lane_1_0_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c1,
3291 &step2[42], &step2[53]);
3292 btf_16_lane_2_3_neon(step1[50], step1[45], c1, &step2[50], &step2[45]);
3293 btf_16_lane_3_2_neon(vnegq_s16(step1[46]), vnegq_s16(step1[49]), c1,
3294 &step2[46], &step2[49]);
3295
3296 step2[16] = step1[16];
3297 step2[17] = step1[16];
3298 step2[18] = step1[19];
3299 step2[19] = step1[19];
3300 step2[20] = step1[20];
3301 step2[21] = step1[20];
3302 step2[22] = step1[23];
3303 step2[23] = step1[23];
3304 step2[24] = step1[24];
3305 step2[25] = step1[24];
3306 step2[26] = step1[27];
3307 step2[27] = step1[27];
3308 step2[28] = step1[28];
3309 step2[29] = step1[28];
3310 step2[30] = step1[31];
3311 step2[31] = step1[31];
3312 step2[32] = step1[32];
3313 step2[35] = step1[35];
3314 step2[36] = step1[36];
3315 step2[39] = step1[39];
3316 step2[40] = step1[40];
3317 step2[43] = step1[43];
3318 step2[44] = step1[44];
3319 step2[47] = step1[47];
3320 step2[48] = step1[48];
3321 step2[51] = step1[51];
3322 step2[52] = step1[52];
3323 step2[55] = step1[55];
3324 step2[56] = step1[56];
3325 step2[59] = step1[59];
3326 step2[60] = step1[60];
3327 step2[63] = step1[63];
3328
3329 // stage 5
3330
3331 step1[0] = step2[0];
3332
3333 btf_16_neon(step2[4], cospi[56], cospi[8], &step1[4], &step1[7]);
3334 btf_16_lane_0_1_neon(step2[30], step2[17], c2, &step1[30], &step1[17]);
3335 btf_16_lane_1_0_neon(vnegq_s16(step2[18]), vnegq_s16(step2[29]), c2,
3336 &step1[18], &step1[29]);
3337 btf_16_lane_2_3_neon(step2[26], step2[21], c2, &step1[26], &step1[21]);
3338 btf_16_lane_3_2_neon(vnegq_s16(step2[22]), vnegq_s16(step2[25]), c2,
3339 &step1[22], &step1[25]);
3340
3341 step1[8] = step2[8];
3342 step1[9] = step2[8];
3343 step1[10] = step2[11];
3344 step1[11] = step2[11];
3345 step1[12] = step2[12];
3346 step1[13] = step2[12];
3347 step1[14] = step2[15];
3348 step1[15] = step2[15];
3349 step1[16] = step2[16];
3350 step1[19] = step2[19];
3351 step1[20] = step2[20];
3352 step1[23] = step2[23];
3353 step1[24] = step2[24];
3354 step1[27] = step2[27];
3355 step1[28] = step2[28];
3356 step1[31] = step2[31];
3357 step1[32] = vqaddq_s16(step2[32], step2[35]);
3358 step1[33] = vqaddq_s16(step2[33], step2[34]);
3359 step1[34] = vqsubq_s16(step2[33], step2[34]);
3360 step1[35] = vqsubq_s16(step2[32], step2[35]);
3361 step1[36] = vqsubq_s16(step2[39], step2[36]);
3362 step1[37] = vqsubq_s16(step2[38], step2[37]);
3363 step1[38] = vqaddq_s16(step2[38], step2[37]);
3364 step1[39] = vqaddq_s16(step2[39], step2[36]);
3365 step1[40] = vqaddq_s16(step2[40], step2[43]);
3366 step1[41] = vqaddq_s16(step2[41], step2[42]);
3367 step1[42] = vqsubq_s16(step2[41], step2[42]);
3368 step1[43] = vqsubq_s16(step2[40], step2[43]);
3369 step1[44] = vqsubq_s16(step2[47], step2[44]);
3370 step1[45] = vqsubq_s16(step2[46], step2[45]);
3371 step1[46] = vqaddq_s16(step2[46], step2[45]);
3372 step1[47] = vqaddq_s16(step2[47], step2[44]);
3373 step1[48] = vqaddq_s16(step2[48], step2[51]);
3374 step1[49] = vqaddq_s16(step2[49], step2[50]);
3375 step1[50] = vqsubq_s16(step2[49], step2[50]);
3376 step1[51] = vqsubq_s16(step2[48], step2[51]);
3377 step1[52] = vqsubq_s16(step2[55], step2[52]);
3378 step1[53] = vqsubq_s16(step2[54], step2[53]);
3379 step1[54] = vqaddq_s16(step2[54], step2[53]);
3380 step1[55] = vqaddq_s16(step2[55], step2[52]);
3381 step1[56] = vqaddq_s16(step2[56], step2[59]);
3382 step1[57] = vqaddq_s16(step2[57], step2[58]);
3383 step1[58] = vqsubq_s16(step2[57], step2[58]);
3384 step1[59] = vqsubq_s16(step2[56], step2[59]);
3385 step1[60] = vqsubq_s16(step2[63], step2[60]);
3386 step1[61] = vqsubq_s16(step2[62], step2[61]);
3387 step1[62] = vqaddq_s16(step2[62], step2[61]);
3388 step1[63] = vqaddq_s16(step2[63], step2[60]);
3389
3390 // stage 6
3391
3392 btf_16_neon(step1[0], cospi[32], cospi[32], &step2[0], &step2[1]);
3393 btf_16_lane_2_3_neon(step1[14], step1[9], c3, &step2[14], &step2[9]);
3394 btf_16_lane_3_2_neon(vnegq_s16(step1[10]), vnegq_s16(step1[13]), c3,
3395 &step2[10], &step2[13]);
3396 btf_16_lane_0_1_neon(step1[61], step1[34], c2, &step2[61], &step2[34]);
3397 btf_16_lane_0_1_neon(step1[60], step1[35], c2, &step2[60], &step2[35]);
3398 btf_16_lane_1_0_neon(vnegq_s16(step1[36]), vnegq_s16(step1[59]), c2,
3399 &step2[36], &step2[59]);
3400 btf_16_lane_1_0_neon(vnegq_s16(step1[37]), vnegq_s16(step1[58]), c2,
3401 &step2[37], &step2[58]);
3402 btf_16_lane_2_3_neon(step1[53], step1[42], c2, &step2[53], &step2[42]);
3403 btf_16_lane_2_3_neon(step1[52], step1[43], c2, &step2[52], &step2[43]);
3404 btf_16_lane_3_2_neon(vnegq_s16(step1[44]), vnegq_s16(step1[51]), c2,
3405 &step2[44], &step2[51]);
3406 btf_16_lane_3_2_neon(vnegq_s16(step1[45]), vnegq_s16(step1[50]), c2,
3407 &step2[45], &step2[50]);
3408
3409 step2[4] = step1[4];
3410 step2[5] = step1[4];
3411 step2[6] = step1[7];
3412 step2[7] = step1[7];
3413 step2[8] = step1[8];
3414 step2[11] = step1[11];
3415 step2[12] = step1[12];
3416 step2[15] = step1[15];
3417 step2[16] = vqaddq_s16(step1[16], step1[19]);
3418 step2[17] = vqaddq_s16(step1[17], step1[18]);
3419 step2[18] = vqsubq_s16(step1[17], step1[18]);
3420 step2[19] = vqsubq_s16(step1[16], step1[19]);
3421 step2[20] = vqsubq_s16(step1[23], step1[20]);
3422 step2[21] = vqsubq_s16(step1[22], step1[21]);
3423 step2[22] = vqaddq_s16(step1[22], step1[21]);
3424 step2[23] = vqaddq_s16(step1[23], step1[20]);
3425 step2[24] = vqaddq_s16(step1[24], step1[27]);
3426 step2[25] = vqaddq_s16(step1[25], step1[26]);
3427 step2[26] = vqsubq_s16(step1[25], step1[26]);
3428 step2[27] = vqsubq_s16(step1[24], step1[27]);
3429 step2[28] = vqsubq_s16(step1[31], step1[28]);
3430 step2[29] = vqsubq_s16(step1[30], step1[29]);
3431 step2[30] = vqaddq_s16(step1[30], step1[29]);
3432 step2[31] = vqaddq_s16(step1[31], step1[28]);
3433 step2[32] = step1[32];
3434 step2[33] = step1[33];
3435 step2[38] = step1[38];
3436 step2[39] = step1[39];
3437 step2[40] = step1[40];
3438 step2[41] = step1[41];
3439 step2[46] = step1[46];
3440 step2[47] = step1[47];
3441 step2[48] = step1[48];
3442 step2[49] = step1[49];
3443 step2[54] = step1[54];
3444 step2[55] = step1[55];
3445 step2[56] = step1[56];
3446 step2[57] = step1[57];
3447 step2[62] = step1[62];
3448 step2[63] = step1[63];
3449
3450 // stage 7
3451
3452 btf_16_lane_0_1_neon(step2[6], step2[5], c3, &step1[6], &step1[5]);
3453 btf_16_lane_2_3_neon(step2[29], step2[18], c3, &step1[29], &step1[18]);
3454 btf_16_lane_2_3_neon(step2[28], step2[19], c3, &step1[28], &step1[19]);
3455 btf_16_lane_3_2_neon(vnegq_s16(step2[20]), vnegq_s16(step2[27]), c3,
3456 &step1[20], &step1[27]);
3457 btf_16_lane_3_2_neon(vnegq_s16(step2[21]), vnegq_s16(step2[26]), c3,
3458 &step1[21], &step1[26]);
3459
3460 step1[0] = step2[0];
3461 step1[1] = step2[1];
3462 step1[2] = step2[1];
3463 step1[3] = step2[0];
3464 step1[4] = step2[4];
3465 step1[7] = step2[7];
3466 step1[8] = vqaddq_s16(step2[8], step2[11]);
3467 step1[9] = vqaddq_s16(step2[9], step2[10]);
3468 step1[10] = vqsubq_s16(step2[9], step2[10]);
3469 step1[11] = vqsubq_s16(step2[8], step2[11]);
3470 step1[12] = vqsubq_s16(step2[15], step2[12]);
3471 step1[13] = vqsubq_s16(step2[14], step2[13]);
3472 step1[14] = vqaddq_s16(step2[14], step2[13]);
3473 step1[15] = vqaddq_s16(step2[15], step2[12]);
3474 step1[16] = step2[16];
3475 step1[17] = step2[17];
3476 step1[22] = step2[22];
3477 step1[23] = step2[23];
3478 step1[24] = step2[24];
3479 step1[25] = step2[25];
3480 step1[30] = step2[30];
3481 step1[31] = step2[31];
3482 step1[32] = vqaddq_s16(step2[32], step2[39]);
3483 step1[33] = vqaddq_s16(step2[33], step2[38]);
3484 step1[34] = vqaddq_s16(step2[34], step2[37]);
3485 step1[35] = vqaddq_s16(step2[35], step2[36]);
3486 step1[36] = vqsubq_s16(step2[35], step2[36]);
3487 step1[37] = vqsubq_s16(step2[34], step2[37]);
3488 step1[38] = vqsubq_s16(step2[33], step2[38]);
3489 step1[39] = vqsubq_s16(step2[32], step2[39]);
3490 step1[40] = vqsubq_s16(step2[47], step2[40]);
3491 step1[41] = vqsubq_s16(step2[46], step2[41]);
3492 step1[42] = vqsubq_s16(step2[45], step2[42]);
3493 step1[43] = vqsubq_s16(step2[44], step2[43]);
3494 step1[44] = vqaddq_s16(step2[43], step2[44]);
3495 step1[45] = vqaddq_s16(step2[42], step2[45]);
3496 step1[46] = vqaddq_s16(step2[41], step2[46]);
3497 step1[47] = vqaddq_s16(step2[40], step2[47]);
3498 step1[48] = vqaddq_s16(step2[48], step2[55]);
3499 step1[49] = vqaddq_s16(step2[49], step2[54]);
3500 step1[50] = vqaddq_s16(step2[50], step2[53]);
3501 step1[51] = vqaddq_s16(step2[51], step2[52]);
3502 step1[52] = vqsubq_s16(step2[51], step2[52]);
3503 step1[53] = vqsubq_s16(step2[50], step2[53]);
3504 step1[54] = vqsubq_s16(step2[49], step2[54]);
3505 step1[55] = vqsubq_s16(step2[48], step2[55]);
3506 step1[56] = vqsubq_s16(step2[63], step2[56]);
3507 step1[57] = vqsubq_s16(step2[62], step2[57]);
3508 step1[58] = vqsubq_s16(step2[61], step2[58]);
3509 step1[59] = vqsubq_s16(step2[60], step2[59]);
3510 step1[60] = vqaddq_s16(step2[59], step2[60]);
3511 step1[61] = vqaddq_s16(step2[58], step2[61]);
3512 step1[62] = vqaddq_s16(step2[57], step2[62]);
3513 step1[63] = vqaddq_s16(step2[56], step2[63]);
3514
3515 // stage 8
3516
3517 btf_16_lane_0_1_neon(step1[13], step1[10], c3, &step2[13], &step2[10]);
3518 btf_16_lane_0_1_neon(step1[12], step1[11], c3, &step2[12], &step2[11]);
3519 btf_16_lane_2_3_neon(step1[59], step1[36], c3, &step2[59], &step2[36]);
3520 btf_16_lane_2_3_neon(step1[58], step1[37], c3, &step2[58], &step2[37]);
3521 btf_16_lane_2_3_neon(step1[57], step1[38], c3, &step2[57], &step2[38]);
3522 btf_16_lane_2_3_neon(step1[56], step1[39], c3, &step2[56], &step2[39]);
3523 btf_16_lane_3_2_neon(vnegq_s16(step1[40]), vnegq_s16(step1[55]), c3,
3524 &step2[40], &step2[55]);
3525 btf_16_lane_3_2_neon(vnegq_s16(step1[41]), vnegq_s16(step1[54]), c3,
3526 &step2[41], &step2[54]);
3527 btf_16_lane_3_2_neon(vnegq_s16(step1[42]), vnegq_s16(step1[53]), c3,
3528 &step2[42], &step2[53]);
3529 btf_16_lane_3_2_neon(vnegq_s16(step1[43]), vnegq_s16(step1[52]), c3,
3530 &step2[43], &step2[52]);
3531
3532 step2[0] = vqaddq_s16(step1[0], step1[7]);
3533 step2[1] = vqaddq_s16(step1[1], step1[6]);
3534 step2[2] = vqaddq_s16(step1[2], step1[5]);
3535 step2[3] = vqaddq_s16(step1[3], step1[4]);
3536 step2[4] = vqsubq_s16(step1[3], step1[4]);
3537 step2[5] = vqsubq_s16(step1[2], step1[5]);
3538 step2[6] = vqsubq_s16(step1[1], step1[6]);
3539 step2[7] = vqsubq_s16(step1[0], step1[7]);
3540 step2[8] = step1[8];
3541 step2[9] = step1[9];
3542 step2[14] = step1[14];
3543 step2[15] = step1[15];
3544 step2[16] = vqaddq_s16(step1[16], step1[23]);
3545 step2[17] = vqaddq_s16(step1[17], step1[22]);
3546 step2[18] = vqaddq_s16(step1[18], step1[21]);
3547 step2[19] = vqaddq_s16(step1[19], step1[20]);
3548 step2[20] = vqsubq_s16(step1[19], step1[20]);
3549 step2[21] = vqsubq_s16(step1[18], step1[21]);
3550 step2[22] = vqsubq_s16(step1[17], step1[22]);
3551 step2[23] = vqsubq_s16(step1[16], step1[23]);
3552 step2[24] = vqsubq_s16(step1[31], step1[24]);
3553 step2[25] = vqsubq_s16(step1[30], step1[25]);
3554 step2[26] = vqsubq_s16(step1[29], step1[26]);
3555 step2[27] = vqsubq_s16(step1[28], step1[27]);
3556 step2[28] = vqaddq_s16(step1[28], step1[27]);
3557 step2[29] = vqaddq_s16(step1[29], step1[26]);
3558 step2[30] = vqaddq_s16(step1[30], step1[25]);
3559 step2[31] = vqaddq_s16(step1[31], step1[24]);
3560 step2[32] = step1[32];
3561 step2[33] = step1[33];
3562 step2[34] = step1[34];
3563 step2[35] = step1[35];
3564 step2[44] = step1[44];
3565 step2[45] = step1[45];
3566 step2[46] = step1[46];
3567 step2[47] = step1[47];
3568 step2[48] = step1[48];
3569 step2[49] = step1[49];
3570 step2[50] = step1[50];
3571 step2[51] = step1[51];
3572 step2[60] = step1[60];
3573 step2[61] = step1[61];
3574 step2[62] = step1[62];
3575 step2[63] = step1[63];
3576
3577 // stage 9
3578 idct64_stage9_neon(step2, step1, cos_bit);
3579
3580 // stage 10
3581 idct64_stage10_neon(step1, step2, cos_bit);
3582
3583 // stage 11
3584
3585 out[0] = vqaddq_s16(step2[0], step2[63]);
3586 out[1] = vqaddq_s16(step2[1], step2[62]);
3587 out[2] = vqaddq_s16(step2[2], step2[61]);
3588 out[3] = vqaddq_s16(step2[3], step2[60]);
3589 out[4] = vqaddq_s16(step2[4], step2[59]);
3590 out[5] = vqaddq_s16(step2[5], step2[58]);
3591 out[6] = vqaddq_s16(step2[6], step2[57]);
3592 out[7] = vqaddq_s16(step2[7], step2[56]);
3593 out[8] = vqaddq_s16(step2[8], step2[55]);
3594 out[9] = vqaddq_s16(step2[9], step2[54]);
3595 out[10] = vqaddq_s16(step2[10], step2[53]);
3596 out[11] = vqaddq_s16(step2[11], step2[52]);
3597 out[12] = vqaddq_s16(step2[12], step2[51]);
3598 out[13] = vqaddq_s16(step2[13], step2[50]);
3599 out[14] = vqaddq_s16(step2[14], step2[49]);
3600 out[15] = vqaddq_s16(step2[15], step2[48]);
3601 out[16] = vqaddq_s16(step2[16], step2[47]);
3602 out[17] = vqaddq_s16(step2[17], step2[46]);
3603 out[18] = vqaddq_s16(step2[18], step2[45]);
3604 out[19] = vqaddq_s16(step2[19], step2[44]);
3605 out[20] = vqaddq_s16(step2[20], step2[43]);
3606 out[21] = vqaddq_s16(step2[21], step2[42]);
3607 out[22] = vqaddq_s16(step2[22], step2[41]);
3608 out[23] = vqaddq_s16(step2[23], step2[40]);
3609 out[24] = vqaddq_s16(step2[24], step2[39]);
3610 out[25] = vqaddq_s16(step2[25], step2[38]);
3611 out[26] = vqaddq_s16(step2[26], step2[37]);
3612 out[27] = vqaddq_s16(step2[27], step2[36]);
3613 out[28] = vqaddq_s16(step2[28], step2[35]);
3614 out[29] = vqaddq_s16(step2[29], step2[34]);
3615 out[30] = vqaddq_s16(step2[30], step2[33]);
3616 out[31] = vqaddq_s16(step2[31], step2[32]);
3617 out[32] = vqsubq_s16(step2[31], step2[32]);
3618 out[33] = vqsubq_s16(step2[30], step2[33]);
3619 out[34] = vqsubq_s16(step2[29], step2[34]);
3620 out[35] = vqsubq_s16(step2[28], step2[35]);
3621 out[36] = vqsubq_s16(step2[27], step2[36]);
3622 out[37] = vqsubq_s16(step2[26], step2[37]);
3623 out[38] = vqsubq_s16(step2[25], step2[38]);
3624 out[39] = vqsubq_s16(step2[24], step2[39]);
3625 out[40] = vqsubq_s16(step2[23], step2[40]);
3626 out[41] = vqsubq_s16(step2[22], step2[41]);
3627 out[42] = vqsubq_s16(step2[21], step2[42]);
3628 out[43] = vqsubq_s16(step2[20], step2[43]);
3629 out[44] = vqsubq_s16(step2[19], step2[44]);
3630 out[45] = vqsubq_s16(step2[18], step2[45]);
3631 out[46] = vqsubq_s16(step2[17], step2[46]);
3632 out[47] = vqsubq_s16(step2[16], step2[47]);
3633 out[48] = vqsubq_s16(step2[15], step2[48]);
3634 out[49] = vqsubq_s16(step2[14], step2[49]);
3635 out[50] = vqsubq_s16(step2[13], step2[50]);
3636 out[51] = vqsubq_s16(step2[12], step2[51]);
3637 out[52] = vqsubq_s16(step2[11], step2[52]);
3638 out[53] = vqsubq_s16(step2[10], step2[53]);
3639 out[54] = vqsubq_s16(step2[9], step2[54]);
3640 out[55] = vqsubq_s16(step2[8], step2[55]);
3641 out[56] = vqsubq_s16(step2[7], step2[56]);
3642 out[57] = vqsubq_s16(step2[6], step2[57]);
3643 out[58] = vqsubq_s16(step2[5], step2[58]);
3644 out[59] = vqsubq_s16(step2[4], step2[59]);
3645 out[60] = vqsubq_s16(step2[3], step2[60]);
3646 out[61] = vqsubq_s16(step2[2], step2[61]);
3647 out[62] = vqsubq_s16(step2[1], step2[62]);
3648 out[63] = vqsubq_s16(step2[0], step2[63]);
3649}
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303650
Venkat000f2f62018-07-05 12:03:05 +05303651// 1D transform function tables for blocks whose eob is at DC or confined
3652// to the top-left 8x8, 16x16, or 32x32 corner
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303653static const transform_neon
3654 lowbd_txfm_all_1d_zeros_w_arr[TX_SIZES][ITX_TYPES_1D][4] = {
3655 {
3656 { NULL, NULL, NULL, NULL },
3657 { NULL, NULL, NULL, NULL },
3658 { NULL, NULL, NULL, NULL },
3659 },
3660 { { idct8_low1_new_neon, idct8_new_neon, NULL, NULL },
3661 { iadst8_low1_new_neon, iadst8_new_neon, NULL, NULL },
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303662 { identity8_new_neon, identity8_new_neon, NULL, NULL } },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303663 {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303664 { idct16_low1_new_neon, idct16_low8_new_neon, idct16_new_neon, NULL },
3665 { iadst16_low1_new_neon, iadst16_low8_new_neon, iadst16_new_neon,
3666 NULL },
3667 { identity16_new_neon, identity16_new_neon, identity16_new_neon,
3668 NULL },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303669 },
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303670 { { idct32_low1_new_neon, idct32_low8_new_neon, idct32_low16_new_neon,
3671 idct32_new_neon },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303672 { NULL, NULL, NULL, NULL },
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303673 { identity32_new_neon, identity32_new_neon, identity32_new_neon,
3674 identity32_new_neon } },
sachin garg56f10202018-09-24 14:05:25 +00003675 { { idct64_low1_new_neon, idct64_low8_new_neon, idct64_low16_new_neon,
3676 idct64_low32_new_neon },
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303677 { NULL, NULL, NULL, NULL },
3678 { NULL, NULL, NULL, NULL } }
3679 };
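// The table above is indexed as [tx size][1D tx type][reduction]: the last
// index, derived from the per-dimension eob via lowbd_txfm_all_1d_zeros_idx,
// selects a low1/low8/low16/full variant so that (roughly) a 32-point column
// whose last nonzero coefficient sits below index 8 runs
// idct32_low8_new_neon instead of the full idct32_new_neon. NULL entries are
// unreachable: 4-point blocks use the scalar helpers further down, and AV1
// has no 32- or 64-point ADST and no 64-point identity transform.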
3680
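// The 2D functions below share one pipeline: 32-bit coefficients are loaded
// into 16-bit vectors one 8x8 tile at a time and transposed so rows become
// lanes, the row transform (plus the rectangular-block rescale) runs in
// place, the tiles are transposed into b[] so columns become rows, the
// column transform runs, and the rounded result is added to the prediction
// in `output`, flipped when the tx type demands it. Only the
// ((eobx + 8) >> 3) x ((eoby + 8) >> 3) tiles that can hold nonzero
// coefficients are processed. lowbd_inv_txfm2d_add_idtx_neon is the IDTX
// case, where both 1D passes are identities and only the scalings remain.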
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303681static INLINE void lowbd_inv_txfm2d_add_idtx_neon(const int32_t *input,
3682 uint8_t *output, int stride,
3683 TX_TYPE tx_type,
3684 TX_SIZE tx_size, int eob) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303685 int16x8_t a[32 * 4];
3686 int16x8_t b[32 * 4];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303687 int eobx, eoby;
3688 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07003689 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303690 const int txw_idx = get_txw_idx(tx_size);
3691 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07003692 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
3693 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303694 const int txfm_size_col = tx_size_wide[tx_size];
3695 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303696 lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
3697 0);
3698 lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
3699 0);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303700 const int buf_size_w_div8 = txfm_size_col >> 3;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303701 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303702 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
3703 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
3704 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
3705 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
3706 const int32_t *input_1;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303707 int temp_b = 0;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303708 const transform_neon row_txfm =
3709 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
3710 const transform_neon col_txfm =
3711 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
3712
3713 assert(col_txfm != NULL);
3714 assert(row_txfm != NULL);
3715
3716 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
3717 input_1 = input;
3718 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
3719 int k = j * 8 + i * txfm_size_col;
3720 load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303721 transpose_s16_8x8q(&a[k], &a[k]);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303722 input_1 += 8;
3723 }
3724 input += (txfm_size_col * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303725 if (abs(rect_type) == 1) {
3726 int y = i * txfm_size_col;
3727 round_shift_for_rect(&a[y], &a[y], txfm_size_col);
3728 }
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303729 row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
3730 av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
3731 -shift[0]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303732 for (int j = 0; j < buf_size_w_div8; ++j) {
3733 int k = j * 8 + i * txfm_size_col;
3734 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
3735 }
3736 temp_b += 8;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303737 }
3738 for (int j = 0; j < buf_size_w_div8; ++j) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303739 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
3740 av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303741 -shift[1]);
3742 }
3743 if (txfm_size_col >= 16) {
3744 for (int i = 0; i < (txfm_size_col >> 4); i++) {
3745 lowbd_add_flip_buffer_16xn_neon(
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303746 &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303747 }
3748 } else if (txfm_size_col == 8) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303749 lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303750 }
3751}
3752
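// H_DCT / H_ADST / H_FLIPADST: the horizontal pass is a real transform while
// the vertical pass is an identity (vitx_1d_tab maps these types to
// IIDENTITY_1D). Only lr_flip can be set for these types, so the output
// stage always adds without a vertical flip.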
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303753static INLINE void lowbd_inv_txfm2d_add_v_identity_neon(
3754 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
3755 TX_SIZE tx_size, int eob) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303756 int16x8_t a[16 * 2];
3757 int16x8_t b[16 * 2];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303758 int eobx, eoby, ud_flip, lr_flip;
3759 get_eobx_eoby_scan_v_identity(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07003760 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303761 const int txw_idx = get_txw_idx(tx_size);
3762 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07003763 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
3764 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303765 const int txfm_size_col = tx_size_wide[tx_size];
3766 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303767 lowbd_inv_txfm2d_memset_neon(&b[0], (txfm_size_col * (txfm_size_row) >> 3),
3768 0);
3769 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303770 const int buf_size_w_div8 = txfm_size_col >> 3;
3771 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
3772 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
3773 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
3774 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
3775 const int32_t *input_1;
3776 int temp_b = 0;
3777 const transform_neon row_txfm =
3778 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
3779 const transform_neon col_txfm =
3780 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
3781
3782 assert(col_txfm != NULL);
3783 assert(row_txfm != NULL);
3784
3785 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3786
3787 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
3788 input_1 = input;
3789 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
3790 int k = j * 8 + i * txfm_size_col;
3791 load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
3792 transpose_s16_8x8q(&a[k], &a[k]);
3793 input_1 += 8;
3794 }
3795 input += (txfm_size_col * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303796 if (abs(rect_type) == 1) {
3797 int y = i * txfm_size_col;
3798 round_shift_for_rect(&a[y], &a[y], txfm_size_col);
3799 }
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303800 row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
3801 av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
3802 -shift[0]);
3803 if (lr_flip == 1) {
3804 for (int j = 0; j < buf_size_w_div8; ++j) {
3805 int k = j * 8 + i * txfm_size_col;
3806 flip_buf_ud_neon(&a[k], 8);
3807 transpose_s16_8x8q(
3808 &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
3809 }
3810 temp_b += 8;
3811 } else {
3812 for (int j = 0; j < buf_size_w_div8; ++j) {
3813 int k = j * 8 + i * txfm_size_col;
3814 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
3815 }
3816 temp_b += 8;
3817 }
3818 }
3819 for (int j = 0; j < buf_size_w_div8; ++j) {
3820 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
3821 av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
3822 -shift[1]);
3823 }
3824 if (txfm_size_col >= 16) {
3825 for (int i = 0; i < (txfm_size_col >> 4); i++) {
3826 lowbd_add_flip_buffer_16xn_neon(
3827 &b[i * txfm_size_row * 2], output + 16 * i, stride, 0, txfm_size_row);
3828 }
3829 } else if (txfm_size_col == 8) {
3830 lowbd_add_flip_buffer_8xn_neon(b, output, stride, 0, txfm_size_row);
3831 }
3832}
3833
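// V_DCT / V_ADST / V_FLIPADST: the horizontal pass is an identity and the
// vertical pass is the real transform, so lr_flip is never set here and
// ud_flip is applied when the residual is added to the prediction.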
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303834static INLINE void lowbd_inv_txfm2d_add_h_identity_neon(
3835 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
3836 TX_SIZE tx_size, int eob) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303837 int16x8_t a[16 * 2];
3838 int16x8_t b[16 * 2];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303839 int eobx, eoby, ud_flip, lr_flip;
3840 get_eobx_eoby_scan_h_identity(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07003841 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303842 const int txw_idx = get_txw_idx(tx_size);
3843 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07003844 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
3845 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303846 const int txfm_size_col = tx_size_wide[tx_size];
3847 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303848 lowbd_inv_txfm2d_memset_neon(&a[0], (txfm_size_col * (txfm_size_row) >> 3),
3849 0);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303850 const int buf_size_w_div8 = txfm_size_col >> 3;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303851 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303852 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
3853 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
3854 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
3855 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
3856 const int32_t *input_1;
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303857 int temp_b = 0;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303858 const transform_neon row_txfm =
3859 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
3860 const transform_neon col_txfm =
3861 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
3862
3863 assert(col_txfm != NULL);
3864 assert(row_txfm != NULL);
3865
3866 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3867
3868 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
3869 input_1 = input;
3870 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
3871 int k = j * 8 + i * txfm_size_col;
3872 load_buffer_32bit_to_16bit_neon(input_1, &a[k], txfm_size_col);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303873 transpose_s16_8x8q(&a[k], &a[k]);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303874 input_1 += 8;
3875 }
3876 input += (txfm_size_col * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303877 if (abs(rect_type) == 1) {
3878 int y = i * txfm_size_col;
3879 round_shift_for_rect(&a[y], &a[y], txfm_size_col);
3880 }
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303881 row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
3882 av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
3883 -shift[0]);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303884 for (int j = 0; j < buf_size_w_div8; ++j) {
3885 int k = j * 8 + i * txfm_size_col;
3886 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
3887 }
3888 temp_b += 8;
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303889 }
3890 for (int j = 0; j < buf_size_w_div8; ++j) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303891 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
3892 av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303893 -shift[1]);
3894 }
3895 if (txfm_size_col >= 16) {
3896 for (int i = 0; i < (txfm_size_col >> 4); i++) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303897 lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303898 output + 16 * i, stride, ud_flip,
3899 txfm_size_row);
3900 }
3901 } else if (txfm_size_col == 8) {
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05303902 lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05303903 }
3904}
3905
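// Blocks with a 4-point dimension take a scalar path built on the C 1D
// transforms in lowbd_txfm_all_1d_arr, presumably because a 4-wide column
// fills only half of an int16x8_t. Each helper runs the row transform into
// buf, then pulls one column at a time through temp_in/temp_out and clips
// the sum into the 8-bit prediction with highbd_clip_pixel_add.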
Venkat000f2f62018-07-05 12:03:05 +05303906static INLINE void lowbd_inv_txfm2d_add_4x4_neon(const int32_t *input,
3907 uint8_t *output, int stride,
sachin garg56f10202018-09-24 14:05:25 +00003908 TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303909 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003910 TX_SIZE tx_size = TX_4X4;
Venkat000f2f62018-07-05 12:03:05 +05303911 DECLARE_ALIGNED(32, int, txfm_buf[4 * 4 + 8 + 8]);
3912 int32_t *temp_in = txfm_buf;
3913
Yaowu Xua19e7622019-04-29 14:12:44 -07003914 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303915 const int txw_idx = get_txw_idx(tx_size);
3916 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07003917 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
3918 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
Venkat000f2f62018-07-05 12:03:05 +05303919 const int txfm_size_col = tx_size_wide[tx_size];
3920 const int txfm_size_row = tx_size_high[tx_size];
3921 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3922 int32_t *temp_out = temp_in + buf_offset;
3923 int32_t *buf = temp_out + buf_offset;
3924 int32_t *buf_ptr = buf;
3925 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
3926 int r, bd = 8;
3927 const transform_1d_neon row_txfm =
3928 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3929 const transform_1d_neon col_txfm =
3930 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3931
3932 int ud_flip, lr_flip;
3933 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3934
3935 for (int i = 0; i < txfm_size_row; i++) {
3936 row_txfm(input, buf_ptr, cos_bit_row, stage_range);
3937
3938 input += txfm_size_col;
3939 buf_ptr += txfm_size_col;
3940 }
3941
3942 for (int c = 0; c < txfm_size_col; ++c) {
3943 if (lr_flip == 0) {
3944 for (r = 0; r < txfm_size_row; ++r)
3945 temp_in[r] = buf[r * txfm_size_col + c];
3946 } else {
3947 // flip left right
3948 for (r = 0; r < txfm_size_row; ++r)
3949 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
3950 }
3951 col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
3952 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
3953
3954 if (ud_flip == 0) {
3955 for (r = 0; r < txfm_size_row; ++r) {
3956 output[r * stride + c] =
3957 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
3958 }
3959 } else {
3960 // flip upside down
3961 for (r = 0; r < txfm_size_row; ++r) {
3962 output[r * stride + c] = highbd_clip_pixel_add(
3963 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
3964 }
3965 }
3966 }
3967}
3968
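// 2:1 rectangular blocks (4x8 and 8x4) pre-scale every input by 1/sqrt(2) to
// preserve the overall scaling of the 2D transform:
//   round_shift((int64_t)x * NewInvSqrt2, NewSqrt2Bits)
// computes x * 2896 >> 12, i.e. x / sqrt(2) at 12 bits of precision.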
3969void lowbd_inv_txfm2d_add_4x8_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00003970 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05303971 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00003972 TX_SIZE tx_size = TX_4X8;
Venkat000f2f62018-07-05 12:03:05 +05303973 DECLARE_ALIGNED(32, int, txfm_buf[4 * 8 + 8 + 8]);
3974 int32_t *temp_in = txfm_buf;
3975
Yaowu Xua19e7622019-04-29 14:12:44 -07003976 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05303977 const int txw_idx = get_txw_idx(tx_size);
3978 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07003979 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
3980 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
Venkat000f2f62018-07-05 12:03:05 +05303981 const int txfm_size_col = tx_size_wide[tx_size];
3982 const int txfm_size_row = tx_size_high[tx_size];
3983 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
3984 int32_t *temp_out = temp_in + buf_offset;
3985 int32_t *buf = temp_out + buf_offset;
3986 int32_t *buf_ptr = buf;
3987 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
3988 int r, bd = 8;
3989 const transform_1d_neon row_txfm =
3990 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
3991 const transform_1d_neon col_txfm =
3992 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
3993
3994 int ud_flip, lr_flip;
3995 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
3996
3997 for (int i = 0; i < txfm_size_row; i++) {
3998 for (int j = 0; j < txfm_size_col; j++)
3999 temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
4000
4001 row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
4002 input += txfm_size_col;
4003 buf_ptr += txfm_size_col;
4004 }
4005
4006 for (int c = 0; c < txfm_size_col; ++c) {
4007 if (lr_flip == 0) {
4008 for (r = 0; r < txfm_size_row; ++r)
4009 temp_in[r] = buf[r * txfm_size_col + c];
4010 } else {
4011 // flip left right
4012 for (r = 0; r < txfm_size_row; ++r)
4013 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
4014 }
4015 col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
4016 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
4017
4018 if (ud_flip == 0) {
4019 for (r = 0; r < txfm_size_row; ++r) {
4020 output[r * stride + c] =
4021 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
4022 }
4023 } else {
4024 // flip upside down
4025 for (r = 0; r < txfm_size_row; ++r) {
4026 output[r * stride + c] = highbd_clip_pixel_add(
4027 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
4028 }
4029 }
4030 }
4031}
4032
4033void lowbd_inv_txfm2d_add_8x4_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00004034 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05304035 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00004036 TX_SIZE tx_size = TX_8X4;
Venkat000f2f62018-07-05 12:03:05 +05304037 DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
4038 int32_t *temp_in = txfm_buf;
4039
Yaowu Xua19e7622019-04-29 14:12:44 -07004040 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05304041 const int txw_idx = get_txw_idx(tx_size);
4042 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07004043 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
4044 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
Venkat000f2f62018-07-05 12:03:05 +05304045 const int txfm_size_col = tx_size_wide[tx_size];
4046 const int txfm_size_row = tx_size_high[tx_size];
4047 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
4048 int32_t *temp_out = temp_in + buf_offset;
4049 int32_t *buf = temp_out + buf_offset;
4050 int32_t *buf_ptr = buf;
4051 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
4052 int r, bd = 8;
4053 const transform_1d_neon row_txfm =
4054 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
4055 const transform_1d_neon col_txfm =
4056 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
4057
4058 int ud_flip, lr_flip;
4059 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4060
4061 for (int i = 0; i < txfm_size_row; i++) {
4062 for (int j = 0; j < txfm_size_col; j++)
4063 temp_in[j] = round_shift((int64_t)input[j] * NewInvSqrt2, NewSqrt2Bits);
4064
4065 row_txfm(temp_in, buf_ptr, cos_bit_row, stage_range);
4066 input += txfm_size_col;
4067 buf_ptr += txfm_size_col;
4068 }
4069
4070 for (int c = 0; c < txfm_size_col; ++c) {
4071 if (lr_flip == 0) {
4072 for (r = 0; r < txfm_size_row; ++r)
4073 temp_in[r] = buf[r * txfm_size_col + c];
4074 } else {
4075 // flip left right
4076 for (r = 0; r < txfm_size_row; ++r)
4077 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
4078 }
4079 col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
4080 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
4081
4082 if (ud_flip == 0) {
4083 for (r = 0; r < txfm_size_row; ++r) {
4084 output[r * stride + c] =
4085 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
4086 }
4087 } else {
4088 // flip upside down
4089 for (r = 0; r < txfm_size_row; ++r) {
4090 output[r * stride + c] = highbd_clip_pixel_add(
4091 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
4092 }
4093 }
4094 }
4095}
4096
4097void lowbd_inv_txfm2d_add_4x16_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00004098 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05304099 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00004100 TX_SIZE tx_size = TX_4X16;
Venkat000f2f62018-07-05 12:03:05 +05304101 DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
4102 int32_t *temp_in = txfm_buf;
4103
Yaowu Xua19e7622019-04-29 14:12:44 -07004104 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05304105 const int txw_idx = get_txw_idx(tx_size);
4106 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07004107 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
4108 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
Venkat000f2f62018-07-05 12:03:05 +05304109 const int txfm_size_col = tx_size_wide[tx_size];
4110 const int txfm_size_row = tx_size_high[tx_size];
4111 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
4112 int32_t *temp_out = temp_in + buf_offset;
4113 int32_t *buf = temp_out + buf_offset;
4114 int32_t *buf_ptr = buf;
4115 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
4116 int r, bd = 8;
4117 const transform_1d_neon row_txfm =
4118 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
4119 const transform_1d_neon col_txfm =
4120 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
4121
4122 int ud_flip, lr_flip;
4123 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4124
4125 for (int i = 0; i < txfm_size_row; i++) {
4126 row_txfm(input, buf_ptr, cos_bit_row, stage_range);
4127 av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
4128 input += txfm_size_col;
4129 buf_ptr += txfm_size_col;
4130 }
4131
4132 for (int c = 0; c < txfm_size_col; ++c) {
4133 if (lr_flip == 0) {
4134 for (r = 0; r < txfm_size_row; ++r)
4135 temp_in[r] = buf[r * txfm_size_col + c];
4136 } else {
4137 // flip left right
4138 for (r = 0; r < txfm_size_row; ++r)
4139 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
4140 }
4141 col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
4142 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
4143
4144 if (ud_flip == 0) {
4145 for (r = 0; r < txfm_size_row; ++r) {
4146 output[r * stride + c] =
4147 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
4148 }
4149 } else {
4150 // flip upside down
4151 for (r = 0; r < txfm_size_row; ++r) {
4152 output[r * stride + c] = highbd_clip_pixel_add(
4153 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
4154 }
4155 }
4156 }
4157}
4158
4159void lowbd_inv_txfm2d_add_16x4_neon(const int32_t *input, uint8_t *output,
sachin garg56f10202018-09-24 14:05:25 +00004160 int stride, TX_TYPE tx_type, int eob) {
Venkat000f2f62018-07-05 12:03:05 +05304161 (void)eob;
sachin garg56f10202018-09-24 14:05:25 +00004162 TX_SIZE tx_size = TX_16X4;
Venkat000f2f62018-07-05 12:03:05 +05304163 DECLARE_ALIGNED(32, int, txfm_buf[16 * 4 + 16 + 16]);
4164 int32_t *temp_in = txfm_buf;
4165
Yaowu Xua19e7622019-04-29 14:12:44 -07004166 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Venkat000f2f62018-07-05 12:03:05 +05304167 const int txw_idx = get_txw_idx(tx_size);
4168 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07004169 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
4170 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
Venkat000f2f62018-07-05 12:03:05 +05304171 const int txfm_size_col = tx_size_wide[tx_size];
4172 const int txfm_size_row = tx_size_high[tx_size];
4173 const int buf_offset = AOMMAX(txfm_size_row, txfm_size_col);
4174 int32_t *temp_out = temp_in + buf_offset;
4175 int32_t *buf = temp_out + buf_offset;
4176 int32_t *buf_ptr = buf;
4177 const int8_t stage_range[MAX_TXFM_STAGE_NUM] = { 16 };
4178 int r, bd = 8;
4179 const transform_1d_neon row_txfm =
4180 lowbd_txfm_all_1d_arr[txw_idx][hitx_1d_tab[tx_type]];
4181 const transform_1d_neon col_txfm =
4182 lowbd_txfm_all_1d_arr[txh_idx][vitx_1d_tab[tx_type]];
4183
4184 int ud_flip, lr_flip;
4185 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4186
4187 for (int i = 0; i < txfm_size_row; i++) {
4188 row_txfm(input, buf_ptr, cos_bit_row, stage_range);
4189 av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
4190 input += txfm_size_col;
4191 buf_ptr += txfm_size_col;
4192 }
4193
4194 for (int c = 0; c < txfm_size_col; ++c) {
4195 if (lr_flip == 0) {
4196 for (r = 0; r < txfm_size_row; ++r)
4197 temp_in[r] = buf[r * txfm_size_col + c];
4198 } else {
4199 // flip left right
4200 for (r = 0; r < txfm_size_row; ++r)
4201 temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
4202 }
4203 col_txfm(temp_in, temp_out, cos_bit_col, stage_range);
4204 av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
4205
4206 if (ud_flip == 0) {
4207 for (r = 0; r < txfm_size_row; ++r) {
4208 output[r * stride + c] =
4209 highbd_clip_pixel_add(output[r * stride + c], temp_out[r], bd);
4210 }
4211 } else {
4212 // flip upside down
4213 for (r = 0; r < txfm_size_row; ++r) {
4214 output[r * stride + c] = highbd_clip_pixel_add(
4215 output[r * stride + c], temp_out[txfm_size_row - r - 1], bd);
4216 }
4217 }
4218 }
4219}
4220
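// All remaining type combinations (2D DCT/ADST/FLIPADST) land here. The eob
// again selects reduced variants independently per dimension, so, for
// example, a 32x32 DCT_DCT block whose eob lies inside the top-left 8x8
// runs idct32_low8_new_neon for both rows and columns instead of the full
// idct32_new_neon.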
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304221static INLINE void lowbd_inv_txfm2d_add_no_identity_neon(
4222 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
4223 TX_SIZE tx_size, int eob) {
4224 int16x8_t a[64 * 8];
4225 int16x8_t b[64 * 8];
4226 int eobx, eoby, ud_flip, lr_flip;
4227 get_eobx_eoby_scan_default(&eobx, &eoby, tx_size, eob);
Yaowu Xua19e7622019-04-29 14:12:44 -07004228 const int8_t *shift = av1_inv_txfm_shift_ls[tx_size];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304229 const int txw_idx = get_txw_idx(tx_size);
4230 const int txh_idx = get_txh_idx(tx_size);
Yaowu Xua19e7622019-04-29 14:12:44 -07004231 const int cos_bit_col = av1_inv_cos_bit_col[txw_idx][txh_idx];
4232 const int cos_bit_row = av1_inv_cos_bit_row[txw_idx][txh_idx];
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304233 const int txfm_size_col = tx_size_wide[tx_size];
4234 const int txfm_size_row = tx_size_high[tx_size];
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05304235 const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304236 const int buf_size_w_div8 = txfm_size_col >> 3;
4237 const int buf_size_nonzero_h_div8 = (eoby + 8) >> 3;
4238 const int buf_size_nonzero_w_div8 = (eobx + 8) >> 3;
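  // 64-point dimensions are stored at most 32 coefficients wide: AV1 forces
  // everything outside the top-left 32x32 of a 64-wide/high transform to
  // zero, so the coefficient buffer is packed accordingly.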
sachin garg56f10202018-09-24 14:05:25 +00004239 const int input_stride = AOMMIN(32, txfm_size_col);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304240 const int fun_idx_x = lowbd_txfm_all_1d_zeros_idx[eobx];
4241 const int fun_idx_y = lowbd_txfm_all_1d_zeros_idx[eoby];
4242 const int32_t *input_1;
4243 int temp_b = 0;
4244
4245 const transform_neon row_txfm =
4246 lowbd_txfm_all_1d_zeros_w_arr[txw_idx][hitx_1d_tab[tx_type]][fun_idx_x];
4247 const transform_neon col_txfm =
4248 lowbd_txfm_all_1d_zeros_w_arr[txh_idx][vitx_1d_tab[tx_type]][fun_idx_y];
4249
4250 assert(col_txfm != NULL);
4251 assert(row_txfm != NULL);
4252
4253 get_flip_cfg(tx_type, &ud_flip, &lr_flip);
4254
4255 for (int i = 0; i < buf_size_nonzero_h_div8; i++) {
4256 input_1 = input;
4257 for (int j = 0; j < buf_size_nonzero_w_div8; ++j) {
4258 int k = j * 8 + i * txfm_size_col;
sachin garg56f10202018-09-24 14:05:25 +00004259 load_buffer_32bit_to_16bit_neon(input_1, &a[k], input_stride);
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304260 transpose_s16_8x8q(&a[k], &a[k]);
4261 input_1 += 8;
4262 }
sachin garg56f10202018-09-24 14:05:25 +00004263 input += (input_stride * 8);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05304264 if (abs(rect_type) == 1) {
4265 int y = i * txfm_size_col;
sachin garg56f10202018-09-24 14:05:25 +00004266 round_shift_for_rect(&a[y], &a[y], input_stride);
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05304267 }
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304268 row_txfm(&a[i * txfm_size_col], &a[i * txfm_size_col], cos_bit_row, 0);
4269 av1_round_shift_array_16_neon(&a[i * txfm_size_col], txfm_size_col,
4270 -shift[0]);
4271 if (lr_flip == 1) {
4272 for (int j = 0; j < buf_size_w_div8; ++j) {
4273 int k = j * 8 + i * txfm_size_col;
4274 flip_buf_ud_neon(&a[k], 8);
4275 transpose_s16_8x8q(
4276 &a[k], &b[temp_b + txfm_size_row * (buf_size_w_div8 - 1 - j)]);
4277 }
4278 temp_b += 8;
4279 } else {
4280 for (int j = 0; j < buf_size_w_div8; ++j) {
4281 int k = j * 8 + i * txfm_size_col;
4282 transpose_s16_8x8q(&a[k], &b[temp_b + txfm_size_row * j]);
4283 }
4284 temp_b += 8;
4285 }
4286 }
4287 for (int j = 0; j < buf_size_w_div8; ++j) {
4288 col_txfm(&b[j * txfm_size_row], &b[j * txfm_size_row], cos_bit_col, 0);
4289 av1_round_shift_array_16_neon(&b[j * txfm_size_row], txfm_size_row,
4290 -shift[1]);
4291 }
4292
4293 if (txfm_size_col >= 16) {
4294 for (int i = 0; i < (txfm_size_col >> 4); i++) {
4295 lowbd_add_flip_buffer_16xn_neon(&b[i * txfm_size_row * 2],
4296 output + 16 * i, stride, ud_flip,
4297 txfm_size_row);
4298 }
4299 } else if (txfm_size_col == 8) {
4300 lowbd_add_flip_buffer_8xn_neon(b, output, stride, ud_flip, txfm_size_row);
4301 }
4302}
4303
Venkat000f2f62018-07-05 12:03:05 +05304304static INLINE void lowbd_inv_txfm2d_add_universe_neon(
4305 const int32_t *input, uint8_t *output, int stride, TX_TYPE tx_type,
4306 TX_SIZE tx_size, int eob) {
4307 switch (tx_type) {
4308 case IDTX:
4309 lowbd_inv_txfm2d_add_idtx_neon(input, output, stride, tx_type, tx_size,
4310 eob);
4311 break;
4312
4313 case H_DCT:
4314 case H_ADST:
4315 case H_FLIPADST:
4316 lowbd_inv_txfm2d_add_v_identity_neon(input, output, stride, tx_type,
4317 tx_size, eob);
4318 break;
4319
4320 case V_DCT:
4321 case V_ADST:
4322 case V_FLIPADST:
4323 lowbd_inv_txfm2d_add_h_identity_neon(input, output, stride, tx_type,
4324 tx_size, eob);
4325 break;
4326
4327 default:
4328 lowbd_inv_txfm2d_add_no_identity_neon(input, output, stride, tx_type,
4329 tx_size, eob);
4330 break;
4331 }
4332}
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304333
Venkat000f2f62018-07-05 12:03:05 +05304334void av1_lowbd_inv_txfm2d_add_neon(const int32_t *input, uint8_t *output,
4335 int stride, TX_TYPE tx_type, TX_SIZE tx_size,
4336 int eob) {
Venkat000f2f62018-07-05 12:03:05 +05304337 switch (tx_size) {
4338 case TX_4X4:
sachin garg56f10202018-09-24 14:05:25 +00004339 lowbd_inv_txfm2d_add_4x4_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304340 break;
4341
4342 case TX_4X8:
sachin garg56f10202018-09-24 14:05:25 +00004343 lowbd_inv_txfm2d_add_4x8_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304344 break;
4345
4346 case TX_8X4:
sachin garg56f10202018-09-24 14:05:25 +00004347 lowbd_inv_txfm2d_add_8x4_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304348 break;
4349
4350 case TX_4X16:
sachin garg56f10202018-09-24 14:05:25 +00004351 lowbd_inv_txfm2d_add_4x16_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304352 break;
4353
4354 case TX_16X4:
sachin garg56f10202018-09-24 14:05:25 +00004355 lowbd_inv_txfm2d_add_16x4_neon(input, output, stride, tx_type, eob);
Venkat000f2f62018-07-05 12:03:05 +05304356 break;
4357
Sachin Kumar Gargd4f25b62018-07-27 17:04:58 +05304358 default:
Sachin Kumar Garg11e09372018-07-17 18:02:10 +05304359 lowbd_inv_txfm2d_add_universe_neon(input, output, stride, tx_type,
Venkat000f2f62018-07-05 12:03:05 +05304360 tx_size, eob);
Venkat000f2f62018-07-05 12:03:05 +05304361 break;
4362 }
4363}
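
// Decoder entry point. Lossless blocks use the 4x4 Walsh-Hadamard transform,
// which has no NEON implementation here, so they fall back to the C path.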
4364void av1_inv_txfm_add_neon(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
4365 const TxfmParam *txfm_param) {
4366 const TX_TYPE tx_type = txfm_param->tx_type;
4367 if (!txfm_param->lossless) {
4368 av1_lowbd_inv_txfm2d_add_neon(dqcoeff, dst, stride, tx_type,
4369 txfm_param->tx_size, txfm_param->eob);
4370 } else {
4371 av1_inv_txfm_add_c(dqcoeff, dst, stride, txfm_param);
4372 }
4373}
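
// Usage sketch (hypothetical caller, for illustration only): a decoder
// fills in a TxfmParam and hands the dequantized coefficients to the entry
// point above.
//
//   TxfmParam param;
//   memset(&param, 0, sizeof(param));
//   param.tx_type = DCT_DCT;
//   param.tx_size = TX_16X16;
//   param.eob = eob;  // index of the last nonzero coefficient, plus one
//   param.lossless = 0;
//   av1_inv_txfm_add_neon(dqcoeff, dst, dst_stride, &param);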