/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <math.h>
#include <string.h>

#include "./aom_dsp_rtcd.h"
#include "aom_dsp/inv_txfm.h"

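// Plain-C reference implementations of the inverse transforms (Walsh-Hadamard,
// DCT and ADST) for block sizes from 4x4 up to 32x32. The "_c" suffix marks
// these as the generic versions; platform-specific SIMD implementations are
// presumably dispatched over them via ./aom_dsp_rtcd.h. Helpers such as
// WRAPLOW, HIGHBD_WRAPLOW, dct_const_round_shift, clip_pixel_add and
// ROUND_POWER_OF_TWO come from aom_dsp/inv_txfm.h and the headers it includes.
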
void aom_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
  int i;
  tran_low_t output[16];
  tran_high_t a1, b1, c1, d1, e1;
  const tran_low_t *ip = input;
  tran_low_t *op = output;

  for (i = 0; i < 4; i++) {
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    c1 = ip[1] >> UNIT_QUANT_SHIFT;
    d1 = ip[2] >> UNIT_QUANT_SHIFT;
    b1 = ip[3] >> UNIT_QUANT_SHIFT;
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    op[0] = WRAPLOW(a1);
    op[1] = WRAPLOW(b1);
    op[2] = WRAPLOW(c1);
    op[3] = WRAPLOW(d1);
    ip += 4;
    op += 4;
  }

  ip = output;
  for (i = 0; i < 4; i++) {
    a1 = ip[4 * 0];
    c1 = ip[4 * 1];
    d1 = ip[4 * 2];
    b1 = ip[4 * 3];
    a1 += c1;
    d1 -= b1;
    e1 = (a1 - d1) >> 1;
    b1 = e1 - b1;
    c1 = e1 - c1;
    a1 -= b1;
    d1 += c1;
    dest[stride * 0] = clip_pixel_add(dest[stride * 0], WRAPLOW(a1));
    dest[stride * 1] = clip_pixel_add(dest[stride * 1], WRAPLOW(b1));
    dest[stride * 2] = clip_pixel_add(dest[stride * 2], WRAPLOW(c1));
    dest[stride * 3] = clip_pixel_add(dest[stride * 3], WRAPLOW(d1));

    ip++;
    dest++;
  }
}

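// DC-only ("_1") variant of the inverse 4x4 Walsh-Hadamard transform: only
// in[0] is read, so a single 1-D pass is computed and spread across the block
// before being added to dest.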
void aom_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
  int i;
  tran_high_t a1, e1;
  tran_low_t tmp[4];
  const tran_low_t *ip = in;
  tran_low_t *op = tmp;

  a1 = ip[0] >> UNIT_QUANT_SHIFT;
  e1 = a1 >> 1;
  a1 -= e1;
  op[0] = WRAPLOW(a1);
  op[1] = op[2] = op[3] = WRAPLOW(e1);

  ip = tmp;
  for (i = 0; i < 4; i++) {
    e1 = ip[0] >> 1;
    a1 = ip[0] - e1;
    dest[dest_stride * 0] = clip_pixel_add(dest[dest_stride * 0], a1);
    dest[dest_stride * 1] = clip_pixel_add(dest[dest_stride * 1], e1);
    dest[dest_stride * 2] = clip_pixel_add(dest[dest_stride * 2], e1);
    dest[dest_stride * 3] = clip_pixel_add(dest[dest_stride * 3], e1);
    ip++;
    dest++;
  }
}

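// 4-point inverse DCT butterfly (one 1-D pass). Stage 1 rotates the even
// (input[0], input[2]) and odd (input[1], input[3]) pairs by the cospi
// constants; stage 2 combines them into the four outputs. Each product is
// rounded back by dct_const_round_shift() to remove the fixed-point scaling
// of the cospi constants.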
void aom_idct4_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step[4];
  tran_high_t temp1, temp2;
  // stage 1
  temp1 = (input[0] + input[2]) * cospi_16_64;
  temp2 = (input[0] - input[2]) * cospi_16_64;
  step[0] = WRAPLOW(dct_const_round_shift(temp1));
  step[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
  step[2] = WRAPLOW(dct_const_round_shift(temp1));
  step[3] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  output[0] = WRAPLOW(step[0] + step[3]);
  output[1] = WRAPLOW(step[1] + step[2]);
  output[2] = WRAPLOW(step[1] - step[2]);
  output[3] = WRAPLOW(step[0] - step[3]);
}

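// Full 4x4 inverse DCT: the 2-D transform is computed separably by running
// aom_idct4_c over the rows and then over the columns, after which the result
// is rounded and shifted down by 4 bits and added to the prediction in dest
// with pixel clamping.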
void aom_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[4 * 4];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[4], temp_out[4];

  // Rows
  for (i = 0; i < 4; ++i) {
    aom_idct4_c(input, outptr);
    input += 4;
    outptr += 4;
  }

  // Columns
  for (i = 0; i < 4; ++i) {
    for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
    aom_idct4_c(temp_in, temp_out);
    for (j = 0; j < 4; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
    }
  }
}

void aom_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
                         int dest_stride) {
  int i;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 4);

  for (i = 0; i < 4; i++) {
    dest[0] = clip_pixel_add(dest[0], a1);
    dest[1] = clip_pixel_add(dest[1], a1);
    dest[2] = clip_pixel_add(dest[2], a1);
    dest[3] = clip_pixel_add(dest[3], a1);
    dest += dest_stride;
  }
}

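// 8-point inverse DCT (one 1-D pass). The even inputs (0, 2, 4, 6) go through
// an embedded 4-point IDCT while the odd inputs are rotated by the cospi
// constants; the final stage butterflies the two halves into the eight
// outputs.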
void aom_idct8_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[8], step2[8];
  tran_high_t temp1, temp2;
  // stage 1
  step1[0] = input[0];
  step1[2] = input[4];
  step1[1] = input[2];
  step1[3] = input[6];
  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  temp1 = (step1[0] + step1[2]) * cospi_16_64;
  temp2 = (step1[0] - step1[2]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  // stage 3
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  // stage 4
  output[0] = WRAPLOW(step1[0] + step1[7]);
  output[1] = WRAPLOW(step1[1] + step1[6]);
  output[2] = WRAPLOW(step1[2] + step1[5]);
  output[3] = WRAPLOW(step1[3] + step1[4]);
  output[4] = WRAPLOW(step1[3] - step1[4]);
  output[5] = WRAPLOW(step1[2] - step1[5]);
  output[6] = WRAPLOW(step1[1] - step1[6]);
  output[7] = WRAPLOW(step1[0] - step1[7]);
}

void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows
  for (i = 0; i < 8; ++i) {
    aom_idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    aom_idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 5);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

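// 4-point inverse ADST. An all-zero input is special-cased to an all-zero
// output; otherwise the result is assembled from the sinpi_k_9 constants
// (scaled sines of k*pi/9) and rounded back with dct_const_round_shift().
// The 8- and 16-point ADSTs further below use the cospi constants instead.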
void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;

  tran_low_t x0 = input[0];
  tran_low_t x1 = input[1];
  tran_low_t x2 = input[2];
  tran_low_t x3 = input[3];

  if (!(x0 | x1 | x2 | x3)) {
    output[0] = output[1] = output[2] = output[3] = 0;
    return;
  }

  s0 = sinpi_1_9 * x0;
  s1 = sinpi_2_9 * x0;
  s2 = sinpi_3_9 * x1;
  s3 = sinpi_4_9 * x2;
  s4 = sinpi_1_9 * x2;
  s5 = sinpi_2_9 * x3;
  s6 = sinpi_4_9 * x3;
  s7 = WRAPLOW(x0 - x2 + x3);

  s0 = s0 + s3 + s5;
  s1 = s1 - s4 - s6;
  s3 = s2;
  s2 = sinpi_3_9 * s7;

  // 1-D transform scaling factor is sqrt(2).
  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
  // + 1b (addition) = 29b.
  // Hence the output bit depth is 15b.
  output[0] = WRAPLOW(dct_const_round_shift(s0 + s3));
  output[1] = WRAPLOW(dct_const_round_shift(s1 + s3));
  output[2] = WRAPLOW(dct_const_round_shift(s2));
  output[3] = WRAPLOW(dct_const_round_shift(s0 + s1 - s3));
}

void aom_iadst8_c(const tran_low_t *input, tran_low_t *output) {
  int s0, s1, s2, s3, s4, s5, s6, s7;

  tran_high_t x0 = input[7];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[5];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[3];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[1];
  tran_high_t x7 = input[6];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = 0;
    return;
  }

  // stage 1
  s0 = (int)(cospi_2_64 * x0 + cospi_30_64 * x1);
  s1 = (int)(cospi_30_64 * x0 - cospi_2_64 * x1);
  s2 = (int)(cospi_10_64 * x2 + cospi_22_64 * x3);
  s3 = (int)(cospi_22_64 * x2 - cospi_10_64 * x3);
  s4 = (int)(cospi_18_64 * x4 + cospi_14_64 * x5);
  s5 = (int)(cospi_14_64 * x4 - cospi_18_64 * x5);
  s6 = (int)(cospi_26_64 * x6 + cospi_6_64 * x7);
  s7 = (int)(cospi_6_64 * x6 - cospi_26_64 * x7);

  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));

  // stage 2
  s0 = (int)x0;
  s1 = (int)x1;
  s2 = (int)x2;
  s3 = (int)x3;
  s4 = (int)(cospi_8_64 * x4 + cospi_24_64 * x5);
  s5 = (int)(cospi_24_64 * x4 - cospi_8_64 * x5);
  s6 = (int)(-cospi_24_64 * x6 + cospi_8_64 * x7);
  s7 = (int)(cospi_8_64 * x6 + cospi_24_64 * x7);

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));

  // stage 3
  s2 = (int)(cospi_16_64 * (x2 + x3));
  s3 = (int)(cospi_16_64 * (x2 - x3));
  s6 = (int)(cospi_16_64 * (x6 + x7));
  s7 = (int)(cospi_16_64 * (x6 - x7));

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));

  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x4);
  output[2] = WRAPLOW(x6);
  output[3] = WRAPLOW(-x2);
  output[4] = WRAPLOW(x3);
  output[5] = WRAPLOW(-x7);
  output[6] = WRAPLOW(x5);
  output[7] = WRAPLOW(-x1);
}

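// Reduced 8x8 inverse DCT for sparse blocks. The numeric suffix appears to
// give the maximum number of non-zero coefficients the variant handles (here
// 12, all within the top-left 4x4), so only the first four rows need a row
// transform before the full column pass.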
void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  tran_low_t out[8 * 8] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[8], temp_out[8];

  // First transform rows.
  // Only the first 4 rows have non-zero coefficients.
  for (i = 0; i < 4; ++i) {
    aom_idct8_c(input, outptr);
    input += 8;
    outptr += 8;
  }

  // Then transform columns
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
    aom_idct8_c(temp_in, temp_out);
    for (j = 0; j < 8; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
    }
  }
}

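// 16-point inverse DCT (one 1-D pass), a seven-stage butterfly network: stage
// 1 loads the inputs in bit-reversed order, stages 2-6 apply the cospi
// rotations and butterflies, and stage 7 folds the halves into the 16 outputs.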
void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[16], step2[16];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0 / 2];
  step1[1] = input[16 / 2];
  step1[2] = input[8 / 2];
  step1[3] = input[24 / 2];
  step1[4] = input[4 / 2];
  step1[5] = input[20 / 2];
  step1[6] = input[12 / 2];
  step1[7] = input[28 / 2];
  step1[8] = input[2 / 2];
  step1[9] = input[18 / 2];
  step1[10] = input[10 / 2];
  step1[11] = input[26 / 2];
  step1[12] = input[6 / 2];
  step1[13] = input[22 / 2];
  step1[14] = input[14 / 2];
  step1[15] = input[30 / 2];

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  // stage 7
  output[0] = WRAPLOW(step2[0] + step2[15]);
  output[1] = WRAPLOW(step2[1] + step2[14]);
  output[2] = WRAPLOW(step2[2] + step2[13]);
  output[3] = WRAPLOW(step2[3] + step2[12]);
  output[4] = WRAPLOW(step2[4] + step2[11]);
  output[5] = WRAPLOW(step2[5] + step2[10]);
  output[6] = WRAPLOW(step2[6] + step2[9]);
  output[7] = WRAPLOW(step2[7] + step2[8]);
  output[8] = WRAPLOW(step2[7] - step2[8]);
  output[9] = WRAPLOW(step2[6] - step2[9]);
  output[10] = WRAPLOW(step2[5] - step2[10]);
  output[11] = WRAPLOW(step2[4] - step2[11]);
  output[12] = WRAPLOW(step2[3] - step2[12]);
  output[13] = WRAPLOW(step2[2] - step2[13]);
  output[14] = WRAPLOW(step2[1] - step2[14]);
  output[15] = WRAPLOW(step2[0] - step2[15]);
}

void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[16 * 16];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows
  for (i = 0; i < 16; ++i) {
    aom_idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    aom_idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

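// 16-point inverse ADST. The inputs are read in a permuted order (x0..x15),
// an all-zero input is short-circuited to an all-zero output, and four stages
// of cospi rotations and butterflies produce the outputs, with sign flips
// applied in the final write-out.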
void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
  tran_high_t s9, s10, s11, s12, s13, s14, s15;

  tran_high_t x0 = input[15];
  tran_high_t x1 = input[0];
  tran_high_t x2 = input[13];
  tran_high_t x3 = input[2];
  tran_high_t x4 = input[11];
  tran_high_t x5 = input[4];
  tran_high_t x6 = input[9];
  tran_high_t x7 = input[6];
  tran_high_t x8 = input[7];
  tran_high_t x9 = input[8];
  tran_high_t x10 = input[5];
  tran_high_t x11 = input[10];
  tran_high_t x12 = input[3];
  tran_high_t x13 = input[12];
  tran_high_t x14 = input[1];
  tran_high_t x15 = input[14];

  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
        x13 | x14 | x15)) {
    output[0] = output[1] = output[2] = output[3] = output[4] = output[5] =
        output[6] = output[7] = output[8] = output[9] = output[10] =
            output[11] = output[12] = output[13] = output[14] = output[15] = 0;
    return;
  }

  // stage 1
  s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
  s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
  s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
  s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
  s15 = x14 * cospi_3_64 - x15 * cospi_29_64;

  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
  x8 = WRAPLOW(dct_const_round_shift(s0 - s8));
  x9 = WRAPLOW(dct_const_round_shift(s1 - s9));
  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));

  // stage 2
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4;
  s5 = x5;
  s6 = x6;
  s7 = x7;
  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;

  x0 = WRAPLOW(s0 + s4);
  x1 = WRAPLOW(s1 + s5);
  x2 = WRAPLOW(s2 + s6);
  x3 = WRAPLOW(s3 + s7);
  x4 = WRAPLOW(s0 - s4);
  x5 = WRAPLOW(s1 - s5);
  x6 = WRAPLOW(s2 - s6);
  x7 = WRAPLOW(s3 - s7);
  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));

  // stage 3
  s0 = x0;
  s1 = x1;
  s2 = x2;
  s3 = x3;
  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
  s8 = x8;
  s9 = x9;
  s10 = x10;
  s11 = x11;
  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;

  x0 = WRAPLOW(s0 + s2);
  x1 = WRAPLOW(s1 + s3);
  x2 = WRAPLOW(s0 - s2);
  x3 = WRAPLOW(s1 - s3);
  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
  x8 = WRAPLOW(s8 + s10);
  x9 = WRAPLOW(s9 + s11);
  x10 = WRAPLOW(s8 - s10);
  x11 = WRAPLOW(s9 - s11);
  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));

  // stage 4
  s2 = (-cospi_16_64) * (x2 + x3);
  s3 = cospi_16_64 * (x2 - x3);
  s6 = cospi_16_64 * (x6 + x7);
  s7 = cospi_16_64 * (-x6 + x7);
  s10 = cospi_16_64 * (x10 + x11);
  s11 = cospi_16_64 * (-x10 + x11);
  s14 = (-cospi_16_64) * (x14 + x15);
  s15 = cospi_16_64 * (x14 - x15);

  x2 = WRAPLOW(dct_const_round_shift(s2));
  x3 = WRAPLOW(dct_const_round_shift(s3));
  x6 = WRAPLOW(dct_const_round_shift(s6));
  x7 = WRAPLOW(dct_const_round_shift(s7));
  x10 = WRAPLOW(dct_const_round_shift(s10));
  x11 = WRAPLOW(dct_const_round_shift(s11));
  x14 = WRAPLOW(dct_const_round_shift(s14));
  x15 = WRAPLOW(dct_const_round_shift(s15));

  output[0] = WRAPLOW(x0);
  output[1] = WRAPLOW(-x8);
  output[2] = WRAPLOW(x12);
  output[3] = WRAPLOW(-x4);
  output[4] = WRAPLOW(x6);
  output[5] = WRAPLOW(x14);
  output[6] = WRAPLOW(x10);
  output[7] = WRAPLOW(x2);
  output[8] = WRAPLOW(x3);
  output[9] = WRAPLOW(x11);
  output[10] = WRAPLOW(x15);
  output[11] = WRAPLOW(x7);
  output[12] = WRAPLOW(x5);
  output[13] = WRAPLOW(-x13);
  output[14] = WRAPLOW(x9);
  output[15] = WRAPLOW(-x1);
}

void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[16 * 16] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[16], temp_out[16];

  // First transform rows. Since all non-zero DCT coefficients are in the
  // upper-left 4x4 area, we only need to calculate the first 4 rows here.
  for (i = 0; i < 4; ++i) {
    aom_idct16_c(input, outptr);
    input += 16;
    outptr += 16;
  }

  // Then transform columns
  for (i = 0; i < 16; ++i) {
    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
    aom_idct16_c(temp_in, temp_out);
    for (j = 0; j < 16; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;
  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

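// 32-point inverse DCT (one 1-D pass), following the same construction as
// aom_idct16_c but with eight labeled stages: the even-indexed inputs are
// reordered, the odd-indexed inputs are paired into cospi rotations, and the
// remaining stages butterfly the halves together, each multiply being rounded
// by dct_const_round_shift().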
void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
  tran_low_t step1[32], step2[32];
  tran_high_t temp1, temp2;

  // stage 1
  step1[0] = input[0];
  step1[1] = input[16];
  step1[2] = input[8];
  step1[3] = input[24];
  step1[4] = input[4];
  step1[5] = input[20];
  step1[6] = input[12];
  step1[7] = input[28];
  step1[8] = input[2];
  step1[9] = input[18];
  step1[10] = input[10];
  step1[11] = input[26];
  step1[12] = input[6];
  step1[13] = input[22];
  step1[14] = input[14];
  step1[15] = input[30];

  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
  step1[31] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));

  // stage 2
  step2[0] = step1[0];
  step2[1] = step1[1];
  step2[2] = step1[2];
  step2[3] = step1[3];
  step2[4] = step1[4];
  step2[5] = step1[5];
  step2[6] = step1[6];
  step2[7] = step1[7];

  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
  step2[15] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));

  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));

  step2[16] = WRAPLOW(step1[16] + step1[17]);
  step2[17] = WRAPLOW(step1[16] - step1[17]);
  step2[18] = WRAPLOW(-step1[18] + step1[19]);
  step2[19] = WRAPLOW(step1[18] + step1[19]);
  step2[20] = WRAPLOW(step1[20] + step1[21]);
  step2[21] = WRAPLOW(step1[20] - step1[21]);
  step2[22] = WRAPLOW(-step1[22] + step1[23]);
  step2[23] = WRAPLOW(step1[22] + step1[23]);
  step2[24] = WRAPLOW(step1[24] + step1[25]);
  step2[25] = WRAPLOW(step1[24] - step1[25]);
  step2[26] = WRAPLOW(-step1[26] + step1[27]);
  step2[27] = WRAPLOW(step1[26] + step1[27]);
  step2[28] = WRAPLOW(step1[28] + step1[29]);
  step2[29] = WRAPLOW(step1[28] - step1[29]);
  step2[30] = WRAPLOW(-step1[30] + step1[31]);
  step2[31] = WRAPLOW(step1[30] + step1[31]);

  // stage 3
  step1[0] = step2[0];
  step1[1] = step2[1];
  step1[2] = step2[2];
  step1[3] = step2[3];

  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));

  step1[8] = WRAPLOW(step2[8] + step2[9]);
  step1[9] = WRAPLOW(step2[8] - step2[9]);
  step1[10] = WRAPLOW(-step2[10] + step2[11]);
  step1[11] = WRAPLOW(step2[10] + step2[11]);
  step1[12] = WRAPLOW(step2[12] + step2[13]);
  step1[13] = WRAPLOW(step2[12] - step2[13]);
  step1[14] = WRAPLOW(-step2[14] + step2[15]);
  step1[15] = WRAPLOW(step2[14] + step2[15]);

  step1[16] = step2[16];
  step1[31] = step2[31];
  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  step1[19] = step2[19];
  step1[20] = step2[20];
  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[27] = step2[27];
  step1[28] = step2[28];

  // stage 4
  temp1 = (step1[0] + step1[1]) * cospi_16_64;
  temp2 = (step1[0] - step1[1]) * cospi_16_64;
  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
  step2[4] = WRAPLOW(step1[4] + step1[5]);
  step2[5] = WRAPLOW(step1[4] - step1[5]);
  step2[6] = WRAPLOW(-step1[6] + step1[7]);
  step2[7] = WRAPLOW(step1[6] + step1[7]);

  step2[8] = step1[8];
  step2[15] = step1[15];
  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  step2[11] = step1[11];
  step2[12] = step1[12];

  step2[16] = WRAPLOW(step1[16] + step1[19]);
  step2[17] = WRAPLOW(step1[17] + step1[18]);
  step2[18] = WRAPLOW(step1[17] - step1[18]);
  step2[19] = WRAPLOW(step1[16] - step1[19]);
  step2[20] = WRAPLOW(-step1[20] + step1[23]);
  step2[21] = WRAPLOW(-step1[21] + step1[22]);
  step2[22] = WRAPLOW(step1[21] + step1[22]);
  step2[23] = WRAPLOW(step1[20] + step1[23]);

  step2[24] = WRAPLOW(step1[24] + step1[27]);
  step2[25] = WRAPLOW(step1[25] + step1[26]);
  step2[26] = WRAPLOW(step1[25] - step1[26]);
  step2[27] = WRAPLOW(step1[24] - step1[27]);
  step2[28] = WRAPLOW(-step1[28] + step1[31]);
  step2[29] = WRAPLOW(-step1[29] + step1[30]);
  step2[30] = WRAPLOW(step1[29] + step1[30]);
  step2[31] = WRAPLOW(step1[28] + step1[31]);

  // stage 5
  step1[0] = WRAPLOW(step2[0] + step2[3]);
  step1[1] = WRAPLOW(step2[1] + step2[2]);
  step1[2] = WRAPLOW(step2[1] - step2[2]);
  step1[3] = WRAPLOW(step2[0] - step2[3]);
  step1[4] = step2[4];
  temp1 = (step2[6] - step2[5]) * cospi_16_64;
  temp2 = (step2[5] + step2[6]) * cospi_16_64;
  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
  step1[7] = step2[7];

  step1[8] = WRAPLOW(step2[8] + step2[11]);
  step1[9] = WRAPLOW(step2[9] + step2[10]);
  step1[10] = WRAPLOW(step2[9] - step2[10]);
  step1[11] = WRAPLOW(step2[8] - step2[11]);
  step1[12] = WRAPLOW(-step2[12] + step2[15]);
  step1[13] = WRAPLOW(-step2[13] + step2[14]);
  step1[14] = WRAPLOW(step2[13] + step2[14]);
  step1[15] = WRAPLOW(step2[12] + step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  step1[22] = step2[22];
  step1[23] = step2[23];
  step1[24] = step2[24];
  step1[25] = step2[25];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // stage 6
  step2[0] = WRAPLOW(step1[0] + step1[7]);
  step2[1] = WRAPLOW(step1[1] + step1[6]);
  step2[2] = WRAPLOW(step1[2] + step1[5]);
  step2[3] = WRAPLOW(step1[3] + step1[4]);
  step2[4] = WRAPLOW(step1[3] - step1[4]);
  step2[5] = WRAPLOW(step1[2] - step1[5]);
  step2[6] = WRAPLOW(step1[1] - step1[6]);
  step2[7] = WRAPLOW(step1[0] - step1[7]);
  step2[8] = step1[8];
  step2[9] = step1[9];
  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
  temp2 = (step1[10] + step1[13]) * cospi_16_64;
  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
  temp2 = (step1[11] + step1[12]) * cospi_16_64;
  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
  step2[14] = step1[14];
  step2[15] = step1[15];

  step2[16] = WRAPLOW(step1[16] + step1[23]);
  step2[17] = WRAPLOW(step1[17] + step1[22]);
  step2[18] = WRAPLOW(step1[18] + step1[21]);
  step2[19] = WRAPLOW(step1[19] + step1[20]);
  step2[20] = WRAPLOW(step1[19] - step1[20]);
  step2[21] = WRAPLOW(step1[18] - step1[21]);
  step2[22] = WRAPLOW(step1[17] - step1[22]);
  step2[23] = WRAPLOW(step1[16] - step1[23]);

  step2[24] = WRAPLOW(-step1[24] + step1[31]);
  step2[25] = WRAPLOW(-step1[25] + step1[30]);
  step2[26] = WRAPLOW(-step1[26] + step1[29]);
  step2[27] = WRAPLOW(-step1[27] + step1[28]);
  step2[28] = WRAPLOW(step1[27] + step1[28]);
  step2[29] = WRAPLOW(step1[26] + step1[29]);
  step2[30] = WRAPLOW(step1[25] + step1[30]);
  step2[31] = WRAPLOW(step1[24] + step1[31]);

  // stage 7
  step1[0] = WRAPLOW(step2[0] + step2[15]);
  step1[1] = WRAPLOW(step2[1] + step2[14]);
  step1[2] = WRAPLOW(step2[2] + step2[13]);
  step1[3] = WRAPLOW(step2[3] + step2[12]);
  step1[4] = WRAPLOW(step2[4] + step2[11]);
  step1[5] = WRAPLOW(step2[5] + step2[10]);
  step1[6] = WRAPLOW(step2[6] + step2[9]);
  step1[7] = WRAPLOW(step2[7] + step2[8]);
  step1[8] = WRAPLOW(step2[7] - step2[8]);
  step1[9] = WRAPLOW(step2[6] - step2[9]);
  step1[10] = WRAPLOW(step2[5] - step2[10]);
  step1[11] = WRAPLOW(step2[4] - step2[11]);
  step1[12] = WRAPLOW(step2[3] - step2[12]);
  step1[13] = WRAPLOW(step2[2] - step2[13]);
  step1[14] = WRAPLOW(step2[1] - step2[14]);
  step1[15] = WRAPLOW(step2[0] - step2[15]);

  step1[16] = step2[16];
  step1[17] = step2[17];
  step1[18] = step2[18];
  step1[19] = step2[19];
  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
  temp2 = (step2[20] + step2[27]) * cospi_16_64;
  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
  temp2 = (step2[21] + step2[26]) * cospi_16_64;
  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
  temp2 = (step2[22] + step2[25]) * cospi_16_64;
  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
  temp2 = (step2[23] + step2[24]) * cospi_16_64;
  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
  step1[28] = step2[28];
  step1[29] = step2[29];
  step1[30] = step2[30];
  step1[31] = step2[31];

  // final stage
  output[0] = WRAPLOW(step1[0] + step1[31]);
  output[1] = WRAPLOW(step1[1] + step1[30]);
  output[2] = WRAPLOW(step1[2] + step1[29]);
  output[3] = WRAPLOW(step1[3] + step1[28]);
  output[4] = WRAPLOW(step1[4] + step1[27]);
  output[5] = WRAPLOW(step1[5] + step1[26]);
  output[6] = WRAPLOW(step1[6] + step1[25]);
  output[7] = WRAPLOW(step1[7] + step1[24]);
  output[8] = WRAPLOW(step1[8] + step1[23]);
  output[9] = WRAPLOW(step1[9] + step1[22]);
  output[10] = WRAPLOW(step1[10] + step1[21]);
  output[11] = WRAPLOW(step1[11] + step1[20]);
  output[12] = WRAPLOW(step1[12] + step1[19]);
  output[13] = WRAPLOW(step1[13] + step1[18]);
  output[14] = WRAPLOW(step1[14] + step1[17]);
  output[15] = WRAPLOW(step1[15] + step1[16]);
  output[16] = WRAPLOW(step1[15] - step1[16]);
  output[17] = WRAPLOW(step1[14] - step1[17]);
  output[18] = WRAPLOW(step1[13] - step1[18]);
  output[19] = WRAPLOW(step1[12] - step1[19]);
  output[20] = WRAPLOW(step1[11] - step1[20]);
  output[21] = WRAPLOW(step1[10] - step1[21]);
  output[22] = WRAPLOW(step1[9] - step1[22]);
  output[23] = WRAPLOW(step1[8] - step1[23]);
  output[24] = WRAPLOW(step1[7] - step1[24]);
  output[25] = WRAPLOW(step1[6] - step1[25]);
  output[26] = WRAPLOW(step1[5] - step1[26]);
  output[27] = WRAPLOW(step1[4] - step1[27]);
  output[28] = WRAPLOW(step1[3] - step1[28]);
  output[29] = WRAPLOW(step1[2] - step1[29]);
  output[30] = WRAPLOW(step1[1] - step1[30]);
  output[31] = WRAPLOW(step1[0] - step1[31]);
}

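// Full 32x32 inverse DCT and add. In the row pass each row's coefficients are
// OR-reduced through the small zero_coeff[] tree so that all-zero rows can be
// skipped with a memset instead of running aom_idct32_c; the column pass then
// transforms every column and adds the rounded result to dest.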
void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
                              int stride) {
  tran_low_t out[32 * 32];
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  for (i = 0; i < 32; ++i) {
    int16_t zero_coeff[16];
    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
    for (j = 0; j < 8; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 4; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
    for (j = 0; j < 2; ++j)
      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];

    if (zero_coeff[0] | zero_coeff[1])
      aom_idct32_c(input, outptr);
    else
      memset(outptr, 0, sizeof(tran_low_t) * 32);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
                             int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 16x16 block has non-zero coefficients.
  for (i = 0; i < 16; ++i) {
    aom_idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
                            int stride) {
  tran_low_t out[32 * 32] = { 0 };
  tran_low_t *outptr = out;
  int i, j;
  tran_low_t temp_in[32], temp_out[32];

  // Rows
  // Only the upper-left 8x8 block has non-zero coefficients.
  for (i = 0; i < 8; ++i) {
    aom_idct32_c(input, outptr);
    input += 32;
    outptr += 32;
  }

  // Columns
  for (i = 0; i < 32; ++i) {
    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
    aom_idct32_c(temp_in, temp_out);
    for (j = 0; j < 32; ++j) {
      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
    }
  }
}

void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  int i, j;
  tran_high_t a1;

  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
  a1 = ROUND_POWER_OF_TWO(out, 6);

  for (j = 0; j < 32; ++j) {
    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
    dest += stride;
  }
}

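// High-bitdepth variants. These mirror the 8-bit functions above but operate
// on 16-bit pixels (dest8 is converted with CONVERT_TO_SHORTPTR) and clamp
// intermediate values and reconstructed pixels to the range implied by the
// bit depth bd via HIGHBD_WRAPLOW and highbd_clip_pixel_add.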
Yaowu Xuf883b422016-08-30 14:01:10 -07001255#if CONFIG_AOM_HIGHBITDEPTH
1256void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001257 int stride, int bd) {
1258 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1259 0.5 shifts per pixel. */
1260 int i;
1261 tran_low_t output[16];
1262 tran_high_t a1, b1, c1, d1, e1;
1263 const tran_low_t *ip = input;
1264 tran_low_t *op = output;
1265 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1266
1267 for (i = 0; i < 4; i++) {
1268 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1269 c1 = ip[1] >> UNIT_QUANT_SHIFT;
1270 d1 = ip[2] >> UNIT_QUANT_SHIFT;
1271 b1 = ip[3] >> UNIT_QUANT_SHIFT;
1272 a1 += c1;
1273 d1 -= b1;
1274 e1 = (a1 - d1) >> 1;
1275 b1 = e1 - b1;
1276 c1 = e1 - c1;
1277 a1 -= b1;
1278 d1 += c1;
1279 op[0] = HIGHBD_WRAPLOW(a1, bd);
1280 op[1] = HIGHBD_WRAPLOW(b1, bd);
1281 op[2] = HIGHBD_WRAPLOW(c1, bd);
1282 op[3] = HIGHBD_WRAPLOW(d1, bd);
1283 ip += 4;
1284 op += 4;
1285 }
1286
1287 ip = output;
1288 for (i = 0; i < 4; i++) {
1289 a1 = ip[4 * 0];
1290 c1 = ip[4 * 1];
1291 d1 = ip[4 * 2];
1292 b1 = ip[4 * 3];
1293 a1 += c1;
1294 d1 -= b1;
1295 e1 = (a1 - d1) >> 1;
1296 b1 = e1 - b1;
1297 c1 = e1 - c1;
1298 a1 -= b1;
1299 d1 += c1;
1300 dest[stride * 0] =
1301 highbd_clip_pixel_add(dest[stride * 0], HIGHBD_WRAPLOW(a1, bd), bd);
1302 dest[stride * 1] =
1303 highbd_clip_pixel_add(dest[stride * 1], HIGHBD_WRAPLOW(b1, bd), bd);
1304 dest[stride * 2] =
1305 highbd_clip_pixel_add(dest[stride * 2], HIGHBD_WRAPLOW(c1, bd), bd);
1306 dest[stride * 3] =
1307 highbd_clip_pixel_add(dest[stride * 3], HIGHBD_WRAPLOW(d1, bd), bd);
1308
1309 ip++;
1310 dest++;
1311 }
1312}
1313
Yaowu Xuf883b422016-08-30 14:01:10 -07001314void aom_highbd_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001315 int dest_stride, int bd) {
1316 int i;
1317 tran_high_t a1, e1;
1318 tran_low_t tmp[4];
1319 const tran_low_t *ip = in;
1320 tran_low_t *op = tmp;
1321 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1322 (void)bd;
1323
1324 a1 = ip[0] >> UNIT_QUANT_SHIFT;
1325 e1 = a1 >> 1;
1326 a1 -= e1;
1327 op[0] = HIGHBD_WRAPLOW(a1, bd);
1328 op[1] = op[2] = op[3] = HIGHBD_WRAPLOW(e1, bd);
1329
1330 ip = tmp;
1331 for (i = 0; i < 4; i++) {
1332 e1 = ip[0] >> 1;
1333 a1 = ip[0] - e1;
1334 dest[dest_stride * 0] =
1335 highbd_clip_pixel_add(dest[dest_stride * 0], a1, bd);
1336 dest[dest_stride * 1] =
1337 highbd_clip_pixel_add(dest[dest_stride * 1], e1, bd);
1338 dest[dest_stride * 2] =
1339 highbd_clip_pixel_add(dest[dest_stride * 2], e1, bd);
1340 dest[dest_stride * 3] =
1341 highbd_clip_pixel_add(dest[dest_stride * 3], e1, bd);
1342 ip++;
1343 dest++;
1344 }
1345}
1346
Yaowu Xuf883b422016-08-30 14:01:10 -07001347void aom_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001348 tran_low_t step[4];
1349 tran_high_t temp1, temp2;
1350 (void)bd;
1351 // stage 1
1352 temp1 = (input[0] + input[2]) * cospi_16_64;
1353 temp2 = (input[0] - input[2]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001354 step[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1355 step[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001356 temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
1357 temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001358 step[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1359 step[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001360
1361 // stage 2
1362 output[0] = HIGHBD_WRAPLOW(step[0] + step[3], bd);
1363 output[1] = HIGHBD_WRAPLOW(step[1] + step[2], bd);
1364 output[2] = HIGHBD_WRAPLOW(step[1] - step[2], bd);
1365 output[3] = HIGHBD_WRAPLOW(step[0] - step[3], bd);
1366}
1367
Yaowu Xuf883b422016-08-30 14:01:10 -07001368void aom_highbd_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001369 int stride, int bd) {
1370 tran_low_t out[4 * 4];
1371 tran_low_t *outptr = out;
1372 int i, j;
1373 tran_low_t temp_in[4], temp_out[4];
1374 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1375
1376 // Rows
1377 for (i = 0; i < 4; ++i) {
Yaowu Xuf883b422016-08-30 14:01:10 -07001378 aom_highbd_idct4_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001379 input += 4;
1380 outptr += 4;
1381 }
1382
1383 // Columns
1384 for (i = 0; i < 4; ++i) {
1385 for (j = 0; j < 4; ++j) temp_in[j] = out[j * 4 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07001386 aom_highbd_idct4_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001387 for (j = 0; j < 4; ++j) {
1388 dest[j * stride + i] = highbd_clip_pixel_add(
1389 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
1390 }
1391 }
1392}
1393
Yaowu Xuf883b422016-08-30 14:01:10 -07001394void aom_highbd_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001395 int dest_stride, int bd) {
1396 int i;
1397 tran_high_t a1;
1398 tran_low_t out =
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001399 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001400 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1401
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001402 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001403 a1 = ROUND_POWER_OF_TWO(out, 4);
1404
1405 for (i = 0; i < 4; i++) {
1406 dest[0] = highbd_clip_pixel_add(dest[0], a1, bd);
1407 dest[1] = highbd_clip_pixel_add(dest[1], a1, bd);
1408 dest[2] = highbd_clip_pixel_add(dest[2], a1, bd);
1409 dest[3] = highbd_clip_pixel_add(dest[3], a1, bd);
1410 dest += dest_stride;
1411 }
1412}
1413
Yaowu Xuf883b422016-08-30 14:01:10 -07001414void aom_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001415 tran_low_t step1[8], step2[8];
1416 tran_high_t temp1, temp2;
1417 // stage 1
1418 step1[0] = input[0];
1419 step1[2] = input[4];
1420 step1[1] = input[2];
1421 step1[3] = input[6];
1422 temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
1423 temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001424 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1425 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001426 temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
1427 temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001428 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1429 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001430
1431 // stage 2 & stage 3 - even half
Yaowu Xuf883b422016-08-30 14:01:10 -07001432 aom_highbd_idct4_c(step1, step1, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001433
1434 // stage 2 - odd half
1435 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1436 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1437 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1438 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1439
1440 // stage 3 - odd half
1441 step1[4] = step2[4];
1442 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1443 temp2 = (step2[5] + step2[6]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001444 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1445 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001446 step1[7] = step2[7];
1447
1448 // stage 4
1449 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1450 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1451 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1452 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1453 output[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1454 output[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1455 output[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1456 output[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1457}
1458
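// Full 8x8 inverse transform (all 64 coefficients assumed present). The 2-D
// transform is separable: each of the 8 rows is passed through the 1-D
// 8-point IDCT, then each column of the intermediate result is transformed,
// and ROUND_POWER_OF_TWO(., 5) removes the remaining fixed-point scaling
// before the residual is clipped and added to the prediction in dest.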
Yaowu Xuf883b422016-08-30 14:01:10 -07001459void aom_highbd_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001460 int stride, int bd) {
1461 tran_low_t out[8 * 8];
1462 tran_low_t *outptr = out;
1463 int i, j;
1464 tran_low_t temp_in[8], temp_out[8];
1465 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1466
1467 // First transform rows.
1468 for (i = 0; i < 8; ++i) {
Yaowu Xuf883b422016-08-30 14:01:10 -07001469 aom_highbd_idct8_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001470 input += 8;
1471 outptr += 8;
1472 }
1473
1474 // Then transform columns.
1475 for (i = 0; i < 8; ++i) {
1476 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07001477 aom_highbd_idct8_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001478 for (j = 0; j < 8; ++j) {
1479 dest[j * stride + i] = highbd_clip_pixel_add(
1480 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1481 }
1482 }
1483}
1484
Yaowu Xuf883b422016-08-30 14:01:10 -07001485void aom_highbd_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001486 int stride, int bd) {
1487 int i, j;
1488 tran_high_t a1;
1489 tran_low_t out =
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001490 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001491 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001492 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001493 a1 = ROUND_POWER_OF_TWO(out, 5);
1494 for (j = 0; j < 8; ++j) {
1495 for (i = 0; i < 8; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
1496 dest += stride;
1497 }
1498}
1499
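// 4-point inverse ADST (asymmetric discrete sine transform), built from the
// sinpi_x_9 constants. It is selected instead of the DCT for some hybrid
// transform types; the early-out below simply zeroes the output when all
// four inputs are zero.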
Yaowu Xuf883b422016-08-30 14:01:10 -07001500void aom_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001501 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1502
1503 tran_low_t x0 = input[0];
1504 tran_low_t x1 = input[1];
1505 tran_low_t x2 = input[2];
1506 tran_low_t x3 = input[3];
1507 (void)bd;
1508
1509 if (!(x0 | x1 | x2 | x3)) {
1510 memset(output, 0, 4 * sizeof(*output));
1511 return;
1512 }
1513
1514 s0 = sinpi_1_9 * x0;
1515 s1 = sinpi_2_9 * x0;
1516 s2 = sinpi_3_9 * x1;
1517 s3 = sinpi_4_9 * x2;
1518 s4 = sinpi_1_9 * x2;
1519 s5 = sinpi_2_9 * x3;
1520 s6 = sinpi_4_9 * x3;
1521 s7 = (tran_high_t)HIGHBD_WRAPLOW(x0 - x2 + x3, bd);
1522
1523 s0 = s0 + s3 + s5;
1524 s1 = s1 - s4 - s6;
1525 s3 = s2;
1526 s2 = sinpi_3_9 * s7;
1527
1528 // 1-D transform scaling factor is sqrt(2).
1529 // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1530 // + 1b (addition) = 29b.
1531 // Hence the output bit depth is 15b.
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001532 output[0] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s3), bd);
1533 output[1] = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s3), bd);
1534 output[2] = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1535 output[3] = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s1 - s3), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001536}
1537
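// 8-point inverse ADST. Note the permuted input order (x0 = input[7],
// x1 = input[0], ...) and the sign flips on some outputs; both are part of
// the transform definition rather than an error.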
Yaowu Xuf883b422016-08-30 14:01:10 -07001538void aom_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001539 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
1540
1541 tran_low_t x0 = input[7];
1542 tran_low_t x1 = input[0];
1543 tran_low_t x2 = input[5];
1544 tran_low_t x3 = input[2];
1545 tran_low_t x4 = input[3];
1546 tran_low_t x5 = input[4];
1547 tran_low_t x6 = input[1];
1548 tran_low_t x7 = input[6];
1549 (void)bd;
1550
1551 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
1552 memset(output, 0, 8 * sizeof(*output));
1553 return;
1554 }
1555
1556 // stage 1
1557 s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
1558 s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
1559 s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
1560 s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
1561 s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
1562 s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
1563 s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
1564 s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
1565
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001566 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s4), bd);
1567 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s5), bd);
1568 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s6), bd);
1569 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s7), bd);
1570 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s4), bd);
1571 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s5), bd);
1572 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s6), bd);
1573 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s7), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001574
1575 // stage 2
1576 s0 = x0;
1577 s1 = x1;
1578 s2 = x2;
1579 s3 = x3;
1580 s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
1581 s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
1582 s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
1583 s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
1584
1585 x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1586 x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1587 x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1588 x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001589 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1590 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1591 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1592 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001593
1594 // stage 3
1595 s2 = cospi_16_64 * (x2 + x3);
1596 s3 = cospi_16_64 * (x2 - x3);
1597 s6 = cospi_16_64 * (x6 + x7);
1598 s7 = cospi_16_64 * (x6 - x7);
1599
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001600 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1601 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1602 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1603 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001604
1605 output[0] = HIGHBD_WRAPLOW(x0, bd);
1606 output[1] = HIGHBD_WRAPLOW(-x4, bd);
1607 output[2] = HIGHBD_WRAPLOW(x6, bd);
1608 output[3] = HIGHBD_WRAPLOW(-x2, bd);
1609 output[4] = HIGHBD_WRAPLOW(x3, bd);
1610 output[5] = HIGHBD_WRAPLOW(-x7, bd);
1611 output[6] = HIGHBD_WRAPLOW(x5, bd);
1612 output[7] = HIGHBD_WRAPLOW(-x1, bd);
1613}
1614
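// Reduced 8x8 inverse transform. The "_10" suffix reflects the assumption
// that only the first few (nominally at most 10) coefficients are non-zero,
// all of which fall in the first four rows, so only 4 of the 8 row transforms
// are computed; the column pass is still done in full.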
Yaowu Xuf883b422016-08-30 14:01:10 -07001615void aom_highbd_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001616 int stride, int bd) {
1617 tran_low_t out[8 * 8] = { 0 };
1618 tran_low_t *outptr = out;
1619 int i, j;
1620 tran_low_t temp_in[8], temp_out[8];
1621 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1622
1623 // First transform rows.
1624 // Only the first 4 rows have non-zero coefficients.
1625 for (i = 0; i < 4; ++i) {
Yaowu Xuf883b422016-08-30 14:01:10 -07001626 aom_highbd_idct8_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001627 input += 8;
1628 outptr += 8;
1629 }
1630 // Then transform columns.
1631 for (i = 0; i < 8; ++i) {
1632 for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07001633 aom_highbd_idct8_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001634 for (j = 0; j < 8; ++j) {
1635 dest[j * stride + i] = highbd_clip_pixel_add(
1636 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
1637 }
1638 }
1639}
1640
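// 16-point 1-D inverse DCT. Stage 1 reorders the inputs into butterfly
// order; the "input[x / 2]" indexing appears to be kept in terms of the
// 32-point ordering (every index halved) for easier comparison with
// aom_highbd_idct32_c below.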
Yaowu Xuf883b422016-08-30 14:01:10 -07001641void aom_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001642 tran_low_t step1[16], step2[16];
1643 tran_high_t temp1, temp2;
1644 (void)bd;
1645
1646 // stage 1
1647 step1[0] = input[0 / 2];
1648 step1[1] = input[16 / 2];
1649 step1[2] = input[8 / 2];
1650 step1[3] = input[24 / 2];
1651 step1[4] = input[4 / 2];
1652 step1[5] = input[20 / 2];
1653 step1[6] = input[12 / 2];
1654 step1[7] = input[28 / 2];
1655 step1[8] = input[2 / 2];
1656 step1[9] = input[18 / 2];
1657 step1[10] = input[10 / 2];
1658 step1[11] = input[26 / 2];
1659 step1[12] = input[6 / 2];
1660 step1[13] = input[22 / 2];
1661 step1[14] = input[14 / 2];
1662 step1[15] = input[30 / 2];
1663
1664 // stage 2
1665 step2[0] = step1[0];
1666 step2[1] = step1[1];
1667 step2[2] = step1[2];
1668 step2[3] = step1[3];
1669 step2[4] = step1[4];
1670 step2[5] = step1[5];
1671 step2[6] = step1[6];
1672 step2[7] = step1[7];
1673
1674 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
1675 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001676 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1677 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001678
1679 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
1680 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001681 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1682 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001683
1684 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
1685 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001686 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1687 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001688
1689 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
1690 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001691 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1692 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001693
1694 // stage 3
1695 step1[0] = step2[0];
1696 step1[1] = step2[1];
1697 step1[2] = step2[2];
1698 step1[3] = step2[3];
1699
1700 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
1701 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001702 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1703 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001704 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
1705 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001706 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1707 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001708
1709 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
1710 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
1711 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
1712 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
1713 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
1714 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
1715 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
1716 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
1717
1718 // stage 4
1719 temp1 = (step1[0] + step1[1]) * cospi_16_64;
1720 temp2 = (step1[0] - step1[1]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001721 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1722 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001723 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
1724 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001725 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1726 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001727 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
1728 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
1729 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
1730 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
1731
1732 step2[8] = step1[8];
1733 step2[15] = step1[15];
1734 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
1735 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001736 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1737 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001738 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
1739 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001740 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1741 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001742 step2[11] = step1[11];
1743 step2[12] = step1[12];
1744
1745 // stage 5
1746 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
1747 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
1748 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
1749 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
1750 step1[4] = step2[4];
1751 temp1 = (step2[6] - step2[5]) * cospi_16_64;
1752 temp2 = (step2[5] + step2[6]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001753 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1754 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001755 step1[7] = step2[7];
1756
1757 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
1758 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
1759 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
1760 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
1761 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
1762 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
1763 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
1764 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
1765
1766 // stage 6
1767 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
1768 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
1769 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
1770 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
1771 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
1772 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
1773 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
1774 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
1775 step2[8] = step1[8];
1776 step2[9] = step1[9];
1777 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
1778 temp2 = (step1[10] + step1[13]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001779 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1780 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001781 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
1782 temp2 = (step1[11] + step1[12]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001783 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
1784 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001785 step2[14] = step1[14];
1786 step2[15] = step1[15];
1787
1788 // stage 7
1789 output[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
1790 output[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
1791 output[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
1792 output[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
1793 output[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
1794 output[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
1795 output[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
1796 output[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
1797 output[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
1798 output[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
1799 output[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
1800 output[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
1801 output[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
1802 output[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
1803 output[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
1804 output[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
1805}
1806
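// Full 16x16 inverse transform (all 256 coefficients). Same row/column
// separable structure as the 8x8 version, with the final scaling shift
// increased to 6 for the larger block size.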
Yaowu Xuf883b422016-08-30 14:01:10 -07001807void aom_highbd_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07001808 int stride, int bd) {
1809 tran_low_t out[16 * 16];
1810 tran_low_t *outptr = out;
1811 int i, j;
1812 tran_low_t temp_in[16], temp_out[16];
1813 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
1814
1815 // First transform rows.
1816 for (i = 0; i < 16; ++i) {
Yaowu Xuf883b422016-08-30 14:01:10 -07001817 aom_highbd_idct16_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001818 input += 16;
1819 outptr += 16;
1820 }
1821
1822 // Then transform columns.
1823 for (i = 0; i < 16; ++i) {
1824 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07001825 aom_highbd_idct16_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001826 for (j = 0; j < 16; ++j) {
1827 dest[j * stride + i] = highbd_clip_pixel_add(
1828 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
1829 }
1830 }
1831}
1832
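// 16-point inverse ADST, following the same pattern as the 8-point version:
// permuted inputs, four stages of butterflies, and sign flips on several
// outputs.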
Yaowu Xuf883b422016-08-30 14:01:10 -07001833void aom_highbd_iadst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07001834 tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
1835 tran_high_t s9, s10, s11, s12, s13, s14, s15;
1836
1837 tran_low_t x0 = input[15];
1838 tran_low_t x1 = input[0];
1839 tran_low_t x2 = input[13];
1840 tran_low_t x3 = input[2];
1841 tran_low_t x4 = input[11];
1842 tran_low_t x5 = input[4];
1843 tran_low_t x6 = input[9];
1844 tran_low_t x7 = input[6];
1845 tran_low_t x8 = input[7];
1846 tran_low_t x9 = input[8];
1847 tran_low_t x10 = input[5];
1848 tran_low_t x11 = input[10];
1849 tran_low_t x12 = input[3];
1850 tran_low_t x13 = input[12];
1851 tran_low_t x14 = input[1];
1852 tran_low_t x15 = input[14];
1853 (void)bd;
1854
1855 if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 |
1856 x13 | x14 | x15)) {
1857 memset(output, 0, 16 * sizeof(*output));
1858 return;
1859 }
1860
1861 // stage 1
1862 s0 = x0 * cospi_1_64 + x1 * cospi_31_64;
1863 s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
1864 s2 = x2 * cospi_5_64 + x3 * cospi_27_64;
1865 s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
1866 s4 = x4 * cospi_9_64 + x5 * cospi_23_64;
1867 s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
1868 s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
1869 s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
1870 s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
1871 s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
1872 s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
1873 s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
1874 s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
1875 s13 = x12 * cospi_7_64 - x13 * cospi_25_64;
1876 s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
1877 s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
1878
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001879 x0 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 + s8), bd);
1880 x1 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 + s9), bd);
1881 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 + s10), bd);
1882 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 + s11), bd);
1883 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s12), bd);
1884 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s13), bd);
1885 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 + s14), bd);
1886 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 + s15), bd);
1887 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s0 - s8), bd);
1888 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s1 - s9), bd);
1889 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s2 - s10), bd);
1890 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s3 - s11), bd);
1891 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s12), bd);
1892 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s13), bd);
1893 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s6 - s14), bd);
1894 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s7 - s15), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001895
1896 // stage 2
1897 s0 = x0;
1898 s1 = x1;
1899 s2 = x2;
1900 s3 = x3;
1901 s4 = x4;
1902 s5 = x5;
1903 s6 = x6;
1904 s7 = x7;
1905 s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
1906 s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
1907 s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
1908 s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
1909 s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
1910 s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
1911 s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
1912 s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
1913
1914 x0 = HIGHBD_WRAPLOW(s0 + s4, bd);
1915 x1 = HIGHBD_WRAPLOW(s1 + s5, bd);
1916 x2 = HIGHBD_WRAPLOW(s2 + s6, bd);
1917 x3 = HIGHBD_WRAPLOW(s3 + s7, bd);
1918 x4 = HIGHBD_WRAPLOW(s0 - s4, bd);
1919 x5 = HIGHBD_WRAPLOW(s1 - s5, bd);
1920 x6 = HIGHBD_WRAPLOW(s2 - s6, bd);
1921 x7 = HIGHBD_WRAPLOW(s3 - s7, bd);
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001922 x8 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 + s12), bd);
1923 x9 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 + s13), bd);
1924 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 + s14), bd);
1925 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 + s15), bd);
1926 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s8 - s12), bd);
1927 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s9 - s13), bd);
1928 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s10 - s14), bd);
1929 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s11 - s15), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001930
1931 // stage 3
1932 s0 = x0;
1933 s1 = x1;
1934 s2 = x2;
1935 s3 = x3;
1936 s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
1937 s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
1938 s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
1939 s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
1940 s8 = x8;
1941 s9 = x9;
1942 s10 = x10;
1943 s11 = x11;
1944 s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
1945 s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
1946 s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
1947 s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
1948
1949 x0 = HIGHBD_WRAPLOW(s0 + s2, bd);
1950 x1 = HIGHBD_WRAPLOW(s1 + s3, bd);
1951 x2 = HIGHBD_WRAPLOW(s0 - s2, bd);
1952 x3 = HIGHBD_WRAPLOW(s1 - s3, bd);
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001953 x4 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 + s6), bd);
1954 x5 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 + s7), bd);
1955 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s4 - s6), bd);
1956 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s5 - s7), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001957 x8 = HIGHBD_WRAPLOW(s8 + s10, bd);
1958 x9 = HIGHBD_WRAPLOW(s9 + s11, bd);
1959 x10 = HIGHBD_WRAPLOW(s8 - s10, bd);
1960 x11 = HIGHBD_WRAPLOW(s9 - s11, bd);
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001961 x12 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 + s14), bd);
1962 x13 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 + s15), bd);
1963 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s12 - s14), bd);
1964 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s13 - s15), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001965
1966 // stage 4
1967 s2 = (-cospi_16_64) * (x2 + x3);
1968 s3 = cospi_16_64 * (x2 - x3);
1969 s6 = cospi_16_64 * (x6 + x7);
1970 s7 = cospi_16_64 * (-x6 + x7);
1971 s10 = cospi_16_64 * (x10 + x11);
1972 s11 = cospi_16_64 * (-x10 + x11);
1973 s14 = (-cospi_16_64) * (x14 + x15);
1974 s15 = cospi_16_64 * (x14 - x15);
1975
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01001976 x2 = HIGHBD_WRAPLOW(dct_const_round_shift(s2), bd);
1977 x3 = HIGHBD_WRAPLOW(dct_const_round_shift(s3), bd);
1978 x6 = HIGHBD_WRAPLOW(dct_const_round_shift(s6), bd);
1979 x7 = HIGHBD_WRAPLOW(dct_const_round_shift(s7), bd);
1980 x10 = HIGHBD_WRAPLOW(dct_const_round_shift(s10), bd);
1981 x11 = HIGHBD_WRAPLOW(dct_const_round_shift(s11), bd);
1982 x14 = HIGHBD_WRAPLOW(dct_const_round_shift(s14), bd);
1983 x15 = HIGHBD_WRAPLOW(dct_const_round_shift(s15), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07001984
1985 output[0] = HIGHBD_WRAPLOW(x0, bd);
1986 output[1] = HIGHBD_WRAPLOW(-x8, bd);
1987 output[2] = HIGHBD_WRAPLOW(x12, bd);
1988 output[3] = HIGHBD_WRAPLOW(-x4, bd);
1989 output[4] = HIGHBD_WRAPLOW(x6, bd);
1990 output[5] = HIGHBD_WRAPLOW(x14, bd);
1991 output[6] = HIGHBD_WRAPLOW(x10, bd);
1992 output[7] = HIGHBD_WRAPLOW(x2, bd);
1993 output[8] = HIGHBD_WRAPLOW(x3, bd);
1994 output[9] = HIGHBD_WRAPLOW(x11, bd);
1995 output[10] = HIGHBD_WRAPLOW(x15, bd);
1996 output[11] = HIGHBD_WRAPLOW(x7, bd);
1997 output[12] = HIGHBD_WRAPLOW(x5, bd);
1998 output[13] = HIGHBD_WRAPLOW(-x13, bd);
1999 output[14] = HIGHBD_WRAPLOW(x9, bd);
2000 output[15] = HIGHBD_WRAPLOW(-x1, bd);
2001}
2002
Yaowu Xuf883b422016-08-30 14:01:10 -07002003void aom_highbd_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07002004 int stride, int bd) {
2005 tran_low_t out[16 * 16] = { 0 };
2006 tran_low_t *outptr = out;
2007 int i, j;
2008 tran_low_t temp_in[16], temp_out[16];
2009 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2010
2011 // First transform rows. Since all non-zero DCT coefficients are in the
2012 // upper-left 4x4 area, we only need to calculate the first 4 rows here.
2013 for (i = 0; i < 4; ++i) {
Yaowu Xuf883b422016-08-30 14:01:10 -07002014 aom_highbd_idct16_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002015 input += 16;
2016 outptr += 16;
2017 }
2018
2019 // Then transform columns.
2020 for (i = 0; i < 16; ++i) {
2021 for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07002022 aom_highbd_idct16_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002023 for (j = 0; j < 16; ++j) {
2024 dest[j * stride + i] = highbd_clip_pixel_add(
2025 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2026 }
2027 }
2028}
2029
Yaowu Xuf883b422016-08-30 14:01:10 -07002030void aom_highbd_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07002031 int stride, int bd) {
2032 int i, j;
2033 tran_high_t a1;
2034 tran_low_t out =
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002035 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002036 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2037
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002038 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002039 a1 = ROUND_POWER_OF_TWO(out, 6);
2040 for (j = 0; j < 16; ++j) {
2041 for (i = 0; i < 16; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2042 dest += stride;
2043 }
2044}
2045
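// 32-point 1-D inverse DCT. The even-indexed inputs go through the same
// butterfly network as the 16-point transform (step1[0..15]), the
// odd-indexed inputs are processed in an additional set of stages
// (step1[16..31]), and the final stage combines the two halves into the
// 32 outputs.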
Yaowu Xuf883b422016-08-30 14:01:10 -07002046void aom_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd) {
Yaowu Xuc27fc142016-08-22 16:08:15 -07002047 tran_low_t step1[32], step2[32];
2048 tran_high_t temp1, temp2;
2049 (void)bd;
2050
2051 // stage 1
2052 step1[0] = input[0];
2053 step1[1] = input[16];
2054 step1[2] = input[8];
2055 step1[3] = input[24];
2056 step1[4] = input[4];
2057 step1[5] = input[20];
2058 step1[6] = input[12];
2059 step1[7] = input[28];
2060 step1[8] = input[2];
2061 step1[9] = input[18];
2062 step1[10] = input[10];
2063 step1[11] = input[26];
2064 step1[12] = input[6];
2065 step1[13] = input[22];
2066 step1[14] = input[14];
2067 step1[15] = input[30];
2068
2069 temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
2070 temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002071 step1[16] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2072 step1[31] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002073
2074 temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
2075 temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002076 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2077 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002078
2079 temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
2080 temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002081 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2082 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002083
2084 temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
2085 temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002086 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2087 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002088
2089 temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
2090 temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002091 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2092 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002093
2094 temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
2095 temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002096 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2097 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002098
2099 temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
2100 temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002101 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2102 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002103
2104 temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
2105 temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002106 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2107 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002108
2109 // stage 2
2110 step2[0] = step1[0];
2111 step2[1] = step1[1];
2112 step2[2] = step1[2];
2113 step2[3] = step1[3];
2114 step2[4] = step1[4];
2115 step2[5] = step1[5];
2116 step2[6] = step1[6];
2117 step2[7] = step1[7];
2118
2119 temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
2120 temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002121 step2[8] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2122 step2[15] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002123
2124 temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
2125 temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002126 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2127 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002128
2129 temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
2130 temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002131 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2132 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002133
2134 temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
2135 temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002136 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2137 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002138
2139 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[17], bd);
2140 step2[17] = HIGHBD_WRAPLOW(step1[16] - step1[17], bd);
2141 step2[18] = HIGHBD_WRAPLOW(-step1[18] + step1[19], bd);
2142 step2[19] = HIGHBD_WRAPLOW(step1[18] + step1[19], bd);
2143 step2[20] = HIGHBD_WRAPLOW(step1[20] + step1[21], bd);
2144 step2[21] = HIGHBD_WRAPLOW(step1[20] - step1[21], bd);
2145 step2[22] = HIGHBD_WRAPLOW(-step1[22] + step1[23], bd);
2146 step2[23] = HIGHBD_WRAPLOW(step1[22] + step1[23], bd);
2147 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[25], bd);
2148 step2[25] = HIGHBD_WRAPLOW(step1[24] - step1[25], bd);
2149 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[27], bd);
2150 step2[27] = HIGHBD_WRAPLOW(step1[26] + step1[27], bd);
2151 step2[28] = HIGHBD_WRAPLOW(step1[28] + step1[29], bd);
2152 step2[29] = HIGHBD_WRAPLOW(step1[28] - step1[29], bd);
2153 step2[30] = HIGHBD_WRAPLOW(-step1[30] + step1[31], bd);
2154 step2[31] = HIGHBD_WRAPLOW(step1[30] + step1[31], bd);
2155
2156 // stage 3
2157 step1[0] = step2[0];
2158 step1[1] = step2[1];
2159 step1[2] = step2[2];
2160 step1[3] = step2[3];
2161
2162 temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
2163 temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002164 step1[4] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2165 step1[7] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002166 temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
2167 temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002168 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2169 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002170
2171 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[9], bd);
2172 step1[9] = HIGHBD_WRAPLOW(step2[8] - step2[9], bd);
2173 step1[10] = HIGHBD_WRAPLOW(-step2[10] + step2[11], bd);
2174 step1[11] = HIGHBD_WRAPLOW(step2[10] + step2[11], bd);
2175 step1[12] = HIGHBD_WRAPLOW(step2[12] + step2[13], bd);
2176 step1[13] = HIGHBD_WRAPLOW(step2[12] - step2[13], bd);
2177 step1[14] = HIGHBD_WRAPLOW(-step2[14] + step2[15], bd);
2178 step1[15] = HIGHBD_WRAPLOW(step2[14] + step2[15], bd);
2179
2180 step1[16] = step2[16];
2181 step1[31] = step2[31];
2182 temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
2183 temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002184 step1[17] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2185 step1[30] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002186 temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
2187 temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002188 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2189 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002190 step1[19] = step2[19];
2191 step1[20] = step2[20];
2192 temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
2193 temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002194 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2195 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002196 temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
2197 temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002198 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2199 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002200 step1[23] = step2[23];
2201 step1[24] = step2[24];
2202 step1[27] = step2[27];
2203 step1[28] = step2[28];
2204
2205 // stage 4
2206 temp1 = (step1[0] + step1[1]) * cospi_16_64;
2207 temp2 = (step1[0] - step1[1]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002208 step2[0] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2209 step2[1] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002210 temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
2211 temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002212 step2[2] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2213 step2[3] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002214 step2[4] = HIGHBD_WRAPLOW(step1[4] + step1[5], bd);
2215 step2[5] = HIGHBD_WRAPLOW(step1[4] - step1[5], bd);
2216 step2[6] = HIGHBD_WRAPLOW(-step1[6] + step1[7], bd);
2217 step2[7] = HIGHBD_WRAPLOW(step1[6] + step1[7], bd);
2218
2219 step2[8] = step1[8];
2220 step2[15] = step1[15];
2221 temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
2222 temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002223 step2[9] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2224 step2[14] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002225 temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
2226 temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002227 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2228 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002229 step2[11] = step1[11];
2230 step2[12] = step1[12];
2231
2232 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[19], bd);
2233 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[18], bd);
2234 step2[18] = HIGHBD_WRAPLOW(step1[17] - step1[18], bd);
2235 step2[19] = HIGHBD_WRAPLOW(step1[16] - step1[19], bd);
2236 step2[20] = HIGHBD_WRAPLOW(-step1[20] + step1[23], bd);
2237 step2[21] = HIGHBD_WRAPLOW(-step1[21] + step1[22], bd);
2238 step2[22] = HIGHBD_WRAPLOW(step1[21] + step1[22], bd);
2239 step2[23] = HIGHBD_WRAPLOW(step1[20] + step1[23], bd);
2240
2241 step2[24] = HIGHBD_WRAPLOW(step1[24] + step1[27], bd);
2242 step2[25] = HIGHBD_WRAPLOW(step1[25] + step1[26], bd);
2243 step2[26] = HIGHBD_WRAPLOW(step1[25] - step1[26], bd);
2244 step2[27] = HIGHBD_WRAPLOW(step1[24] - step1[27], bd);
2245 step2[28] = HIGHBD_WRAPLOW(-step1[28] + step1[31], bd);
2246 step2[29] = HIGHBD_WRAPLOW(-step1[29] + step1[30], bd);
2247 step2[30] = HIGHBD_WRAPLOW(step1[29] + step1[30], bd);
2248 step2[31] = HIGHBD_WRAPLOW(step1[28] + step1[31], bd);
2249
2250 // stage 5
2251 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[3], bd);
2252 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[2], bd);
2253 step1[2] = HIGHBD_WRAPLOW(step2[1] - step2[2], bd);
2254 step1[3] = HIGHBD_WRAPLOW(step2[0] - step2[3], bd);
2255 step1[4] = step2[4];
2256 temp1 = (step2[6] - step2[5]) * cospi_16_64;
2257 temp2 = (step2[5] + step2[6]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002258 step1[5] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2259 step1[6] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002260 step1[7] = step2[7];
2261
2262 step1[8] = HIGHBD_WRAPLOW(step2[8] + step2[11], bd);
2263 step1[9] = HIGHBD_WRAPLOW(step2[9] + step2[10], bd);
2264 step1[10] = HIGHBD_WRAPLOW(step2[9] - step2[10], bd);
2265 step1[11] = HIGHBD_WRAPLOW(step2[8] - step2[11], bd);
2266 step1[12] = HIGHBD_WRAPLOW(-step2[12] + step2[15], bd);
2267 step1[13] = HIGHBD_WRAPLOW(-step2[13] + step2[14], bd);
2268 step1[14] = HIGHBD_WRAPLOW(step2[13] + step2[14], bd);
2269 step1[15] = HIGHBD_WRAPLOW(step2[12] + step2[15], bd);
2270
2271 step1[16] = step2[16];
2272 step1[17] = step2[17];
2273 temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
2274 temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002275 step1[18] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2276 step1[29] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002277 temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
2278 temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002279 step1[19] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2280 step1[28] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002281 temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
2282 temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002283 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2284 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002285 temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
2286 temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002287 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2288 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002289 step1[22] = step2[22];
2290 step1[23] = step2[23];
2291 step1[24] = step2[24];
2292 step1[25] = step2[25];
2293 step1[30] = step2[30];
2294 step1[31] = step2[31];
2295
2296 // stage 6
2297 step2[0] = HIGHBD_WRAPLOW(step1[0] + step1[7], bd);
2298 step2[1] = HIGHBD_WRAPLOW(step1[1] + step1[6], bd);
2299 step2[2] = HIGHBD_WRAPLOW(step1[2] + step1[5], bd);
2300 step2[3] = HIGHBD_WRAPLOW(step1[3] + step1[4], bd);
2301 step2[4] = HIGHBD_WRAPLOW(step1[3] - step1[4], bd);
2302 step2[5] = HIGHBD_WRAPLOW(step1[2] - step1[5], bd);
2303 step2[6] = HIGHBD_WRAPLOW(step1[1] - step1[6], bd);
2304 step2[7] = HIGHBD_WRAPLOW(step1[0] - step1[7], bd);
2305 step2[8] = step1[8];
2306 step2[9] = step1[9];
2307 temp1 = (-step1[10] + step1[13]) * cospi_16_64;
2308 temp2 = (step1[10] + step1[13]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002309 step2[10] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2310 step2[13] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002311 temp1 = (-step1[11] + step1[12]) * cospi_16_64;
2312 temp2 = (step1[11] + step1[12]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002313 step2[11] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2314 step2[12] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002315 step2[14] = step1[14];
2316 step2[15] = step1[15];
2317
2318 step2[16] = HIGHBD_WRAPLOW(step1[16] + step1[23], bd);
2319 step2[17] = HIGHBD_WRAPLOW(step1[17] + step1[22], bd);
2320 step2[18] = HIGHBD_WRAPLOW(step1[18] + step1[21], bd);
2321 step2[19] = HIGHBD_WRAPLOW(step1[19] + step1[20], bd);
2322 step2[20] = HIGHBD_WRAPLOW(step1[19] - step1[20], bd);
2323 step2[21] = HIGHBD_WRAPLOW(step1[18] - step1[21], bd);
2324 step2[22] = HIGHBD_WRAPLOW(step1[17] - step1[22], bd);
2325 step2[23] = HIGHBD_WRAPLOW(step1[16] - step1[23], bd);
2326
2327 step2[24] = HIGHBD_WRAPLOW(-step1[24] + step1[31], bd);
2328 step2[25] = HIGHBD_WRAPLOW(-step1[25] + step1[30], bd);
2329 step2[26] = HIGHBD_WRAPLOW(-step1[26] + step1[29], bd);
2330 step2[27] = HIGHBD_WRAPLOW(-step1[27] + step1[28], bd);
2331 step2[28] = HIGHBD_WRAPLOW(step1[27] + step1[28], bd);
2332 step2[29] = HIGHBD_WRAPLOW(step1[26] + step1[29], bd);
2333 step2[30] = HIGHBD_WRAPLOW(step1[25] + step1[30], bd);
2334 step2[31] = HIGHBD_WRAPLOW(step1[24] + step1[31], bd);
2335
2336 // stage 7
2337 step1[0] = HIGHBD_WRAPLOW(step2[0] + step2[15], bd);
2338 step1[1] = HIGHBD_WRAPLOW(step2[1] + step2[14], bd);
2339 step1[2] = HIGHBD_WRAPLOW(step2[2] + step2[13], bd);
2340 step1[3] = HIGHBD_WRAPLOW(step2[3] + step2[12], bd);
2341 step1[4] = HIGHBD_WRAPLOW(step2[4] + step2[11], bd);
2342 step1[5] = HIGHBD_WRAPLOW(step2[5] + step2[10], bd);
2343 step1[6] = HIGHBD_WRAPLOW(step2[6] + step2[9], bd);
2344 step1[7] = HIGHBD_WRAPLOW(step2[7] + step2[8], bd);
2345 step1[8] = HIGHBD_WRAPLOW(step2[7] - step2[8], bd);
2346 step1[9] = HIGHBD_WRAPLOW(step2[6] - step2[9], bd);
2347 step1[10] = HIGHBD_WRAPLOW(step2[5] - step2[10], bd);
2348 step1[11] = HIGHBD_WRAPLOW(step2[4] - step2[11], bd);
2349 step1[12] = HIGHBD_WRAPLOW(step2[3] - step2[12], bd);
2350 step1[13] = HIGHBD_WRAPLOW(step2[2] - step2[13], bd);
2351 step1[14] = HIGHBD_WRAPLOW(step2[1] - step2[14], bd);
2352 step1[15] = HIGHBD_WRAPLOW(step2[0] - step2[15], bd);
2353
2354 step1[16] = step2[16];
2355 step1[17] = step2[17];
2356 step1[18] = step2[18];
2357 step1[19] = step2[19];
2358 temp1 = (-step2[20] + step2[27]) * cospi_16_64;
2359 temp2 = (step2[20] + step2[27]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002360 step1[20] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2361 step1[27] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002362 temp1 = (-step2[21] + step2[26]) * cospi_16_64;
2363 temp2 = (step2[21] + step2[26]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002364 step1[21] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2365 step1[26] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002366 temp1 = (-step2[22] + step2[25]) * cospi_16_64;
2367 temp2 = (step2[22] + step2[25]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002368 step1[22] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2369 step1[25] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002370 temp1 = (-step2[23] + step2[24]) * cospi_16_64;
2371 temp2 = (step2[23] + step2[24]) * cospi_16_64;
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002372 step1[23] = HIGHBD_WRAPLOW(dct_const_round_shift(temp1), bd);
2373 step1[24] = HIGHBD_WRAPLOW(dct_const_round_shift(temp2), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002374 step1[28] = step2[28];
2375 step1[29] = step2[29];
2376 step1[30] = step2[30];
2377 step1[31] = step2[31];
2378
2379 // final stage
2380 output[0] = HIGHBD_WRAPLOW(step1[0] + step1[31], bd);
2381 output[1] = HIGHBD_WRAPLOW(step1[1] + step1[30], bd);
2382 output[2] = HIGHBD_WRAPLOW(step1[2] + step1[29], bd);
2383 output[3] = HIGHBD_WRAPLOW(step1[3] + step1[28], bd);
2384 output[4] = HIGHBD_WRAPLOW(step1[4] + step1[27], bd);
2385 output[5] = HIGHBD_WRAPLOW(step1[5] + step1[26], bd);
2386 output[6] = HIGHBD_WRAPLOW(step1[6] + step1[25], bd);
2387 output[7] = HIGHBD_WRAPLOW(step1[7] + step1[24], bd);
2388 output[8] = HIGHBD_WRAPLOW(step1[8] + step1[23], bd);
2389 output[9] = HIGHBD_WRAPLOW(step1[9] + step1[22], bd);
2390 output[10] = HIGHBD_WRAPLOW(step1[10] + step1[21], bd);
2391 output[11] = HIGHBD_WRAPLOW(step1[11] + step1[20], bd);
2392 output[12] = HIGHBD_WRAPLOW(step1[12] + step1[19], bd);
2393 output[13] = HIGHBD_WRAPLOW(step1[13] + step1[18], bd);
2394 output[14] = HIGHBD_WRAPLOW(step1[14] + step1[17], bd);
2395 output[15] = HIGHBD_WRAPLOW(step1[15] + step1[16], bd);
2396 output[16] = HIGHBD_WRAPLOW(step1[15] - step1[16], bd);
2397 output[17] = HIGHBD_WRAPLOW(step1[14] - step1[17], bd);
2398 output[18] = HIGHBD_WRAPLOW(step1[13] - step1[18], bd);
2399 output[19] = HIGHBD_WRAPLOW(step1[12] - step1[19], bd);
2400 output[20] = HIGHBD_WRAPLOW(step1[11] - step1[20], bd);
2401 output[21] = HIGHBD_WRAPLOW(step1[10] - step1[21], bd);
2402 output[22] = HIGHBD_WRAPLOW(step1[9] - step1[22], bd);
2403 output[23] = HIGHBD_WRAPLOW(step1[8] - step1[23], bd);
2404 output[24] = HIGHBD_WRAPLOW(step1[7] - step1[24], bd);
2405 output[25] = HIGHBD_WRAPLOW(step1[6] - step1[25], bd);
2406 output[26] = HIGHBD_WRAPLOW(step1[5] - step1[26], bd);
2407 output[27] = HIGHBD_WRAPLOW(step1[4] - step1[27], bd);
2408 output[28] = HIGHBD_WRAPLOW(step1[3] - step1[28], bd);
2409 output[29] = HIGHBD_WRAPLOW(step1[2] - step1[29], bd);
2410 output[30] = HIGHBD_WRAPLOW(step1[1] - step1[30], bd);
2411 output[31] = HIGHBD_WRAPLOW(step1[0] - step1[31], bd);
2412}
2413
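// Full 32x32 inverse transform (all 1024 coefficients may be present). As an
// optimization, each row is first OR-reduced; rows whose 32 coefficients are
// all zero skip the 1-D transform and are written out as zeros directly. The
// column pass and the ROUND_POWER_OF_TWO(., 6) scaling then proceed as in
// the smaller block sizes.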
Yaowu Xuf883b422016-08-30 14:01:10 -07002414void aom_highbd_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07002415 int stride, int bd) {
2416 tran_low_t out[32 * 32];
2417 tran_low_t *outptr = out;
2418 int i, j;
2419 tran_low_t temp_in[32], temp_out[32];
2420 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2421
2422 // Rows
2423 for (i = 0; i < 32; ++i) {
2424 tran_low_t zero_coeff[16];
2425 for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
2426 for (j = 0; j < 8; ++j)
2427 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2428 for (j = 0; j < 4; ++j)
2429 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2430 for (j = 0; j < 2; ++j)
2431 zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
2432
2433 if (zero_coeff[0] | zero_coeff[1])
Yaowu Xuf883b422016-08-30 14:01:10 -07002434 aom_highbd_idct32_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002435 else
2436 memset(outptr, 0, sizeof(tran_low_t) * 32);
2437 input += 32;
2438 outptr += 32;
2439 }
2440
2441 // Columns
2442 for (i = 0; i < 32; ++i) {
2443 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07002444 aom_highbd_idct32_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002445 for (j = 0; j < 32; ++j) {
2446 dest[j * stride + i] = highbd_clip_pixel_add(
2447 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2448 }
2449 }
2450}
2451
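// Reduced 32x32 inverse transform for sparse blocks: the "_34" variant
// assumes the non-zero coefficients are confined to the upper-left 8x8
// region, so only the first 8 row transforms are computed before the full
// column pass.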
Yaowu Xuf883b422016-08-30 14:01:10 -07002452void aom_highbd_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07002453 int stride, int bd) {
2454 tran_low_t out[32 * 32] = { 0 };
2455 tran_low_t *outptr = out;
2456 int i, j;
2457 tran_low_t temp_in[32], temp_out[32];
2458 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2459
2460 // Rows
2461 // Only the upper-left 8x8 block has non-zero coefficients.
2462 for (i = 0; i < 8; ++i) {
Yaowu Xuf883b422016-08-30 14:01:10 -07002463 aom_highbd_idct32_c(input, outptr, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002464 input += 32;
2465 outptr += 32;
2466 }
2467 // Columns
2468 for (i = 0; i < 32; ++i) {
2469 for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
Yaowu Xuf883b422016-08-30 14:01:10 -07002470 aom_highbd_idct32_c(temp_in, temp_out, bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002471 for (j = 0; j < 32; ++j) {
2472 dest[j * stride + i] = highbd_clip_pixel_add(
2473 dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
2474 }
2475 }
2476}
2477
Yaowu Xuf883b422016-08-30 14:01:10 -07002478void aom_highbd_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
Yaowu Xuc27fc142016-08-22 16:08:15 -07002479 int stride, int bd) {
2480 int i, j;
2481 int a1;
2482 uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
2483
2484 tran_low_t out =
Sebastien Alaiwand0e23b42016-12-23 17:55:36 +01002485 HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd);
2486 out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd);
Yaowu Xuc27fc142016-08-22 16:08:15 -07002487 a1 = ROUND_POWER_OF_TWO(out, 6);
2488
2489 for (j = 0; j < 32; ++j) {
2490 for (i = 0; i < 32; ++i) dest[i] = highbd_clip_pixel_add(dest[i], a1, bd);
2491 dest += stride;
2492 }
2493}
Yaowu Xuf883b422016-08-30 14:01:10 -07002494#endif // CONFIG_AOM_HIGHBITDEPTH