blob: 094e113c1285f8fbabfb7faf1cf149a896cbbb9d [file] [log] [blame]
Jingning Han52ae97b2014-05-01 18:34:46 -07001;
Yaowu Xu9c01aa12016-09-01 14:32:49 -07002; Copyright (c) 2016, Alliance for Open Media. All rights reserved
Jingning Han52ae97b2014-05-01 18:34:46 -07003;
Yaowu Xu9c01aa12016-09-01 14:32:49 -07004; This source code is subject to the terms of the BSD 2 Clause License and
5; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
6; was not distributed with this source code in the LICENSE file, you can
7; obtain it at www.aomedia.org/license/software. If the Alliance for Open
8; Media Patent License 1.0 was not distributed with this source code in the
9; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
10;
11
Jingning Han52ae97b2014-05-01 18:34:46 -070012;
Jingning Han08a453b2015-08-03 14:51:10 -070013
Jingning Han52ae97b2014-05-01 18:34:46 -070014%include "third_party/x86inc/x86inc.asm"
15
16; This file provides SSSE3 version of the inverse transformation. Part
17; of the functions are originally derived from the ffmpeg project.
18; Note that the current version applies to x86 64-bit only.
19
20SECTION_RODATA
21
; pw_<c>x2 tables: eight 16-bit copies of 2*c ("m" in the name marks a negated
; value, extra underscores only pad the label width). Where they are used below
; they are pmulhrsw multipliers; pmulhrsw computes (a*b + 0x4000) >> 15 per
; word, so a multiplier of 2*c gives a*c rounded and shifted right by 14.
22pw_11585x2: times 8 dw 23170
Scott LaVarnwayed833042015-11-17 17:42:24 -080023
Scott LaVarnwayf0b0b1f2015-12-02 04:50:46 -080024pw_m2404x2: times 8 dw -2404*2
25pw_m4756x2: times 8 dw -4756*2
26pw_m5520x2: times 8 dw -5520*2
27pw_m8423x2: times 8 dw -8423*2
28pw_m9102x2: times 8 dw -9102*2
29pw_m10394x2: times 8 dw -10394*2
30pw_m11003x2: times 8 dw -11003*2
Scott LaVarnwayed833042015-11-17 17:42:24 -080031
32pw_16364x2: times 8 dw 16364*2
33pw_16305x2: times 8 dw 16305*2
34pw_16207x2: times 8 dw 16207*2
35pw_16069x2: times 8 dw 16069*2
36pw_15893x2: times 8 dw 15893*2
37pw_15679x2: times 8 dw 15679*2
38pw_15426x2: times 8 dw 15426*2
Scott LaVarnwayf0b0b1f2015-12-02 04:50:46 -080039pw_15137x2: times 8 dw 15137*2
40pw_14811x2: times 8 dw 14811*2
41pw_14449x2: times 8 dw 14449*2
42pw_14053x2: times 8 dw 14053*2
43pw_13623x2: times 8 dw 13623*2
44pw_13160x2: times 8 dw 13160*2
45pw_12665x2: times 8 dw 12665*2
46pw_12140x2: times 8 dw 12140*2
47pw__9760x2: times 8 dw 9760*2
48pw__7723x2: times 8 dw 7723*2
49pw__7005x2: times 8 dw 7005*2
50pw__6270x2: times 8 dw 6270*2
Scott LaVarnwayed833042015-11-17 17:42:24 -080051pw__3981x2: times 8 dw 3981*2
52pw__3196x2: times 8 dw 3196*2
53pw__1606x2: times 8 dw 1606*2
54pw___804x2: times 8 dw 804*2
55
; Rounding terms:
;   pd_8192 (1 << 13) is loaded into m8 and added before the ">> 14" in MUL_ADD_2X
;   pw_32   (1 << 5)  is added before the ">> 6" in RECON_AND_STORE
;   pw_16   (1 << 4)  is loaded into m11 and added before the ">> 5" in ADD_STORE_8P_2X
Jingning Han52ae97b2014-05-01 18:34:46 -070056pd_8192: times 4 dd 8192
Scott LaVarnwayed833042015-11-17 17:42:24 -080057pw_32: times 8 dw 32
Jingning Han52ae97b2014-05-01 18:34:46 -070058pw_16: times 8 dw 16
59
; TRANSFORM_COEFFS a, b
;   Emits three 8-word tables of interleaved coefficient pairs, used as the
;   memory operand of pmaddwd in BUTTERFLY_4X / BUTTERFLY_4Xmm:
;     pw_<a>_<b>   = { a,  b} x 4
;     pw_m<b>_<a>  = {-b,  a} x 4
;     pw_m<a>_m<b> = {-a, -b} x 4
60%macro TRANSFORM_COEFFS 2
61pw_%1_%2: dw %1, %2, %1, %2, %1, %2, %1, %2
62pw_m%2_%1: dw -%2, %1, -%2, %1, -%2, %1, -%2, %1
Scott LaVarnwayed833042015-11-17 17:42:24 -080063pw_m%1_m%2: dw -%1, -%2, -%1, -%2, -%1, -%2, -%1, -%2
Jingning Han52ae97b2014-05-01 18:34:46 -070064%endmacro
65
; Coefficient pairs referenced by IDCT8_1D; the 6270/15137, 3196/16069 and
; 13623/9102 pairs are also used by the 32x32 macros.
66TRANSFORM_COEFFS 6270, 15137
67TRANSFORM_COEFFS 3196, 16069
68TRANSFORM_COEFFS 13623, 9102
69
Scott LaVarnwayed833042015-11-17 17:42:24 -080070; constants for 32x32_34
71TRANSFORM_COEFFS 804, 16364
72TRANSFORM_COEFFS 15426, 5520
73TRANSFORM_COEFFS 3981, 15893
74TRANSFORM_COEFFS 16207, 2404
75TRANSFORM_COEFFS 1606, 16305
76TRANSFORM_COEFFS 15679, 4756
; The 11585/11585 pair is what the "BUTTERFLY_4X ..., 11585, 11585" calls in
; the 32x32 macros use in place of the disabled pmulhrsw path.
77TRANSFORM_COEFFS 11585, 11585
78
Scott LaVarnway0148e202015-11-25 10:11:29 -080079; constants for 32x32_1024
80TRANSFORM_COEFFS 12140, 11003
81TRANSFORM_COEFFS 7005, 14811
82TRANSFORM_COEFFS 14053, 8423
83TRANSFORM_COEFFS 9760, 13160
84TRANSFORM_COEFFS 12665, 10394
85TRANSFORM_COEFFS 7723, 14449
86
; PAIR_PP_COEFFS a, b
;   Emits dpw_<a>_<b>: four words of a followed by four words of b, i.e. one
;   multiplier for the low half of a register and another for the high half.
Jingning Han9e7b09b2014-05-02 16:29:08 -070087%macro PAIR_PP_COEFFS 2
88dpw_%1_%2: dw %1, %1, %1, %1, %2, %2, %2, %2
89%endmacro
90
; PAIR_MP_COEFFS a, b
;   Emits dpw_m<a>_<b>: four words of -a followed by four words of +b.
91%macro PAIR_MP_COEFFS 2
92dpw_m%1_%2: dw -%1, -%1, -%1, -%1, %2, %2, %2, %2
93%endmacro
94
; PAIR_MM_COEFFS a, b
;   Emits dpw_m<a>_m<b>: four words of -a followed by four words of -b.
95%macro PAIR_MM_COEFFS 2
96dpw_m%1_m%2: dw -%1, -%1, -%1, -%1, -%2, -%2, -%2, -%2
97%endmacro
98
; Half-register multiplier tables used with pmulhrsw in idct8x8_12_add.
; The values are already doubled (30274 = 2*15137, 12540 = 2*6270,
; 6392 = 2*3196, 32138 = 2*16069, 18204 = 2*9102, 27246 = 2*13623).
; First group: different constant per half (first pass of idct8x8_12_add).
99PAIR_PP_COEFFS 30274, 12540
100PAIR_PP_COEFFS 6392, 32138
101PAIR_MP_COEFFS 18204, 27246
102
; Second group: same constant in both halves (second pass of idct8x8_12_add).
103PAIR_PP_COEFFS 12540, 12540
104PAIR_PP_COEFFS 30274, 30274
105PAIR_PP_COEFFS 6392, 6392
106PAIR_PP_COEFFS 32138, 32138
107PAIR_MM_COEFFS 18204, 18204
108PAIR_PP_COEFFS 27246, 27246
109
Jingning Han52ae97b2014-05-01 18:34:46 -0700110SECTION .text
111
112%if ARCH_X86_64
; SUM_SUB a, b, tmp
;   Word-wise sum/difference of two registers:
;     on exit m<a> = a + b and m<b> = a - b.
;   The difference is computed into m<tmp> and then renamed to m<b> with SWAP,
;   so after the macro the name m<tmp> refers to the register that held the old
;   b and is free to be reused as scratch.
113%macro SUM_SUB 3
114 psubw m%3, m%1, m%2
115 paddw m%1, m%2
116 SWAP %2, %3
117%endmacro
118
119; butterfly operation
; MUL_ADD_2X dst1, dst2, src, round, coefs1, coefs2
;   dst1 = (pmaddwd(src, coefs1) + round) >> 14   (arithmetic shift, dwords)
;   dst2 = (pmaddwd(src, coefs2) + round) >> 14
;   dst1/dst2/src are register numbers; round and coefs1/coefs2 are complete
;   operands. dst2 may be the same register as src (BUTTERFLY_4X relies on
;   this); dst1 must not be, because src is still read after dst1 is written.
120%macro MUL_ADD_2X 6 ; dst1, dst2, src, round, coefs1, coefs2
121 pmaddwd m%1, m%3, %5
122 pmaddwd m%2, m%3, %6
123 paddd m%1, %4
124 paddd m%2, %4
125 psrad m%1, 14
126 psrad m%2, 14
127%endmacro
128
; BUTTERFLY_4X dst1, dst2, coef1, coef2, round, tmp1, tmp2
;   With a = m<dst1> and b = m<dst2> on entry (eight words each):
;     m<dst1> = (a*coef1 - b*coef2 + round) >> 14
;     m<dst2> = (b*coef1 + a*coef2 + round) >> 14
;   computed in 32 bits and packed back to words with signed saturation.
;   Needs the tables pw_m<coef2>_<coef1> and pw_<coef1>_<coef2>
;   (see TRANSFORM_COEFFS). Clobbers m<tmp1> and m<tmp2>.
129%macro BUTTERFLY_4X 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
; high four (b, a) word pairs -> tmp2 / tmp1
130 punpckhwd m%6, m%2, m%1
131 MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_%3_%4]
; low four (b, a) word pairs -> dst1 / dst2
132 punpcklwd m%2, m%1
133 MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_%3_%4]
134 packssdw m%1, m%7
135 packssdw m%2, m%6
136%endmacro
137
; BUTTERFLY_4Xmm dst1, dst2, coef1, coef2, round, tmp1, tmp2
;   Same as BUTTERFLY_4X except that the second product uses the fully negated
;   table pw_m<coef1>_m<coef2>. With a = m<dst1>, b = m<dst2> on entry:
;     m<dst1> = ( a*coef1 - b*coef2 + round) >> 14
;     m<dst2> = (-b*coef1 - a*coef2 + round) >> 14
;   Clobbers m<tmp1> and m<tmp2>.
Scott LaVarnwayed833042015-11-17 17:42:24 -0800138%macro BUTTERFLY_4Xmm 7 ; dst1, dst2, coef1, coef2, round, tmp1, tmp2
139 punpckhwd m%6, m%2, m%1
140 MUL_ADD_2X %7, %6, %6, %5, [pw_m%4_%3], [pw_m%3_m%4]
141 punpcklwd m%2, m%1
142 MUL_ADD_2X %1, %2, %2, %5, [pw_m%4_%3], [pw_m%3_m%4]
143 packssdw m%1, m%7
144 packssdw m%2, m%6
145%endmacro
146
Jingning Han52ae97b2014-05-01 18:34:46 -0700147; matrix transpose
; INTERLEAVE_2X size, a, b, tmp
;   size is the punpck suffix (wd, dq or qdq). On exit
;     m<a> = punpckl<size>(a, b) and m<b> = punpckh<size>(a, b).
;   The high interleave is built in m<tmp> and renamed to m<b> with SWAP, so
;   the name m<tmp> ends up on the register that held the old b.
148%macro INTERLEAVE_2X 4
149 punpckh%1 m%4, m%2, m%3
150 punpckl%1 m%2, m%3
151 SWAP %3, %4
152%endmacro
153
; TRANSPOSE8X8 r0, r1, r2, r3, r4, r5, r6, r7, tmp
;   Transposes the 8x8 matrix of 16-bit words held one row per register in
;   m<r0>..m<r7>, leaving the result in the same eight register names.
;   Three interleave rounds (words, dwords, qwords) followed by two renames
;   that put the rows back in order. m<tmp> is scratch.
154%macro TRANSPOSE8X8 9
155 INTERLEAVE_2X wd, %1, %2, %9
156 INTERLEAVE_2X wd, %3, %4, %9
157 INTERLEAVE_2X wd, %5, %6, %9
158 INTERLEAVE_2X wd, %7, %8, %9
159
160 INTERLEAVE_2X dq, %1, %3, %9
161 INTERLEAVE_2X dq, %2, %4, %9
162 INTERLEAVE_2X dq, %5, %7, %9
163 INTERLEAVE_2X dq, %6, %8, %9
164
165 INTERLEAVE_2X qdq, %1, %5, %9
166 INTERLEAVE_2X qdq, %3, %7, %9
167 INTERLEAVE_2X qdq, %2, %6, %9
168 INTERLEAVE_2X qdq, %4, %8, %9
169
; SWAP only renames registers at assembly time; no instructions are emitted
170 SWAP %2, %5
171 SWAP %4, %7
172%endmacro
173
; IDCT8_1D
;   One 1-D pass of the 8-point inverse DCT over m0..m7, in place.
;   Register contract (taken from the instructions below and from the callers):
;     m0..m7  data in / data out
;     m8      rounding term handed to BUTTERFLY_4X (callers load pd_8192)
;     m12     pmulhrsw multiplier (callers load pw_11585x2)
;     m9, m10 scratch
174%macro IDCT8_1D 0
175 SUM_SUB 0, 4, 9
176 BUTTERFLY_4X 2, 6, 6270, 15137, m8, 9, 10
177 pmulhrsw m0, m12
178 pmulhrsw m4, m12
179 BUTTERFLY_4X 1, 7, 3196, 16069, m8, 9, 10
180 BUTTERFLY_4X 5, 3, 13623, 9102, m8, 9, 10
181
182 SUM_SUB 1, 5, 9
183 SUM_SUB 7, 3, 9
184 SUM_SUB 0, 6, 9
185 SUM_SUB 4, 2, 9
186 SUM_SUB 3, 5, 9
187 pmulhrsw m3, m12
188 pmulhrsw m5, m12
189
190 SUM_SUB 0, 7, 9
191 SUM_SUB 4, 3, 9
192 SUM_SUB 2, 5, 9
193 SUM_SUB 6, 1, 9
194
; rename so that the outputs end up in m0..m7 in order
195 SWAP 3, 6
196 SWAP 1, 4
197%endmacro
198
199; This macro handles 8 pixels per line
; ADD_STORE_8P_2X src1, src2, tmp1, tmp2, zero
;   Final rounding and reconstruction for two 8-pixel rows:
;     src = (src + m11) >> 5      (m11 is the rounding term; callers load pw_16)
;     row = saturate_u8(row + src)
;   Row 0 is at [outputq], row 1 at [outputq + strideq]; 8 bytes are read and
;   written per row. m<zero> must be all zero on entry. Clobbers m<tmp1>,
;   m<tmp2> and modifies m<src1>, m<src2>.
200%macro ADD_STORE_8P_2X 5; src1, src2, tmp1, tmp2, zero
201 paddw m%1, m11
202 paddw m%2, m11
203 psraw m%1, 5
204 psraw m%2, 5
205
206 movh m%3, [outputq]
207 movh m%4, [outputq + strideq]
; widen the 8 destination bytes to words
208 punpcklbw m%3, m%5
209 punpcklbw m%4, m%5
210 paddw m%3, m%1
211 paddw m%4, m%2
; back to bytes with unsigned saturation
212 packuswb m%3, m%5
213 packuswb m%4, m%5
214 movh [outputq], m%3
215 movh [outputq + strideq], m%4
216%endmacro
217
218INIT_XMM ssse3
Jingning Han9e7b09b2014-05-02 16:29:08 -0700219; full inverse 8x8 2D-DCT transform
; idct8x8_64_add(input, output, stride)
;   Reads a full 8x8 coefficient block from input, runs two transpose + IDCT8_1D
;   passes and adds the rounded result ((x + 16) >> 5) to the 8x8 byte block at
;   output (row pitch = stride), saturating to 0..255.
;   cglobal declares 3 arguments, 5 general-purpose registers and 13 XMM
;   registers (m0..m12).
;   input is loaded with mova, so it must be 16-byte aligned.
Jingning Han52ae97b2014-05-01 18:34:46 -0700220cglobal idct8x8_64_add, 3, 5, 13, input, output, stride
221 mova m8, [pd_8192]
222 mova m11, [pw_16]
223 mova m12, [pw_11585x2]
224
; r3 = two rows of output, the step between ADD_STORE_8P_2X calls
225 lea r3, [2 * strideq]
Yaowu Xuf883b422016-08-30 14:01:10 -0700226%if CONFIG_AOM_HIGHBITDEPTH
; 32 bytes per row here: each row is two vectors of dwords, packed down to
; eight words with signed saturation
Yaowu Xub2297102016-01-28 16:29:29 -0800227 mova m0, [inputq + 0]
228 packssdw m0, [inputq + 16]
229 mova m1, [inputq + 32]
230 packssdw m1, [inputq + 48]
231 mova m2, [inputq + 64]
232 packssdw m2, [inputq + 80]
233 mova m3, [inputq + 96]
234 packssdw m3, [inputq + 112]
235 mova m4, [inputq + 128]
236 packssdw m4, [inputq + 144]
237 mova m5, [inputq + 160]
238 packssdw m5, [inputq + 176]
239 mova m6, [inputq + 192]
240 packssdw m6, [inputq + 208]
241 mova m7, [inputq + 224]
242 packssdw m7, [inputq + 240]
243%else
; 16 bytes per row: eight 16-bit coefficients
Jingning Han52ae97b2014-05-01 18:34:46 -0700244 mova m0, [inputq + 0]
245 mova m1, [inputq + 16]
246 mova m2, [inputq + 32]
247 mova m3, [inputq + 48]
248 mova m4, [inputq + 64]
249 mova m5, [inputq + 80]
250 mova m6, [inputq + 96]
251 mova m7, [inputq + 112]
Yaowu Xub2297102016-01-28 16:29:29 -0800252%endif
Jingning Han52ae97b2014-05-01 18:34:46 -0700253 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
254 IDCT8_1D
255 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
256 IDCT8_1D
257
; m12 is no longer needed as a multiplier; reuse it as the zero register
258 pxor m12, m12
259 ADD_STORE_8P_2X 0, 1, 9, 10, 12
260 lea outputq, [outputq + r3]
261 ADD_STORE_8P_2X 2, 3, 9, 10, 12
262 lea outputq, [outputq + r3]
263 ADD_STORE_8P_2X 4, 5, 9, 10, 12
264 lea outputq, [outputq + r3]
265 ADD_STORE_8P_2X 6, 7, 9, 10, 12
266
267 RET
Jingning Han9e7b09b2014-05-02 16:29:08 -0700268
269; inverse 8x8 2D-DCT transform with only first 12 coeffs non-zero
; idct8x8_12_add(input, output, stride)
;   Reduced variant of idct8x8_64_add: only the first four input rows are
;   loaded and only the low four words of each are used (punpcklwd below), so
;   just the top-left 4x4 corner of the coefficient block is read.
;   Both passes replace the butterflies with pmulhrsw against the dpw_* tables.
;   NOTE(review): presumably valid because the other inputs of each butterfly
;   are known to be zero - confirm against the C reference.
;   Same register set-up and output stage as idct8x8_64_add.
Jingning Han41a350a2014-05-08 09:42:26 -0700270cglobal idct8x8_12_add, 3, 5, 13, input, output, stride
Jingning Han9e7b09b2014-05-02 16:29:08 -0700271 mova m8, [pd_8192]
272 mova m11, [pw_16]
273 mova m12, [pw_11585x2]
274
275 lea r3, [2 * strideq]
276
Yaowu Xuf883b422016-08-30 14:01:10 -0700277%if CONFIG_AOM_HIGHBITDEPTH
; dword coefficients: pack each row to eight words with signed saturation
Yaowu Xub2297102016-01-28 16:29:29 -0800278 mova m0, [inputq + 0]
279 packssdw m0, [inputq + 16]
280 mova m1, [inputq + 32]
281 packssdw m1, [inputq + 48]
282 mova m2, [inputq + 64]
283 packssdw m2, [inputq + 80]
284 mova m3, [inputq + 96]
285 packssdw m3, [inputq + 112]
286%else
Jingning Han9e7b09b2014-05-02 16:29:08 -0700287 mova m0, [inputq + 0]
288 mova m1, [inputq + 16]
289 mova m2, [inputq + 32]
290 mova m3, [inputq + 48]
Yaowu Xub2297102016-01-28 16:29:29 -0800291%endif
Jingning Han9e7b09b2014-05-02 16:29:08 -0700292
; 4x4 transpose of the low halves of rows 0..3
293 punpcklwd m0, m1
294 punpcklwd m2, m3
295 punpckhdq m9, m0, m2
296 punpckldq m0, m2
297 SWAP 2, 9
298
; duplicate each 4-word column into both halves of its register:
299 ; m0 -> [0], [0]
300 ; m1 -> [1], [1]
301 ; m2 -> [2], [2]
302 ; m3 -> [3], [3]
303 punpckhqdq m10, m0, m0
304 punpcklqdq m0, m0
305 punpckhqdq m9, m2, m2
306 punpcklqdq m2, m2
307 SWAP 1, 10
308 SWAP 3, 9
309
; first pass: a different multiplier for the low and the high half
310 pmulhrsw m0, m12
311 pmulhrsw m2, [dpw_30274_12540]
312 pmulhrsw m1, [dpw_6392_32138]
313 pmulhrsw m3, [dpw_m18204_27246]
314
315 SUM_SUB 0, 2, 9
316 SUM_SUB 1, 3, 9
317
318 punpcklqdq m9, m3, m3
319 punpckhqdq m5, m3, m9
320
321 SUM_SUB 3, 5, 9
322 punpckhqdq m5, m3
323 pmulhrsw m5, m12
324
325 punpckhqdq m9, m1, m5
326 punpcklqdq m1, m5
327 SWAP 5, 9
328
329 SUM_SUB 0, 5, 9
330 SUM_SUB 2, 1, 9
331
; regroup the half-register results and transpose for the second pass
332 punpckhqdq m3, m0, m0
333 punpckhqdq m4, m1, m1
334 punpckhqdq m6, m5, m5
335 punpckhqdq m7, m2, m2
336
337 punpcklwd m0, m3
338 punpcklwd m7, m2
339 punpcklwd m1, m4
340 punpcklwd m6, m5
341
342 punpckhdq m4, m0, m7
343 punpckldq m0, m7
344 punpckhdq m10, m1, m6
345 punpckldq m5, m1, m6
346
347 punpckhqdq m1, m0, m5
348 punpcklqdq m0, m5
349 punpckhqdq m3, m4, m10
350 punpcklqdq m2, m4, m10
351
352
; second pass: full-width registers, same multiplier in both halves
353 pmulhrsw m0, m12
354 pmulhrsw m6, m2, [dpw_30274_30274]
355 pmulhrsw m4, m2, [dpw_12540_12540]
356
357 pmulhrsw m7, m1, [dpw_32138_32138]
358 pmulhrsw m1, [dpw_6392_6392]
359 pmulhrsw m5, m3, [dpw_m18204_m18204]
360 pmulhrsw m3, [dpw_27246_27246]
361
362 mova m2, m0
363 SUM_SUB 0, 6, 9
364 SUM_SUB 2, 4, 9
365 SUM_SUB 1, 5, 9
366 SUM_SUB 7, 3, 9
367
368 SUM_SUB 3, 5, 9
369 pmulhrsw m3, m12
370 pmulhrsw m5, m12
371
372 SUM_SUB 0, 7, 9
373 SUM_SUB 2, 3, 9
374 SUM_SUB 4, 5, 9
375 SUM_SUB 6, 1, 9
376
; rename so that output rows 0..7 are in m0..m7
377 SWAP 3, 6
378 SWAP 1, 2
379 SWAP 2, 4
380
381
; m12 becomes the zero register for the store stage
382 pxor m12, m12
383 ADD_STORE_8P_2X 0, 1, 9, 10, 12
384 lea outputq, [outputq + r3]
385 ADD_STORE_8P_2X 2, 3, 9, 10, 12
386 lea outputq, [outputq + r3]
387 ADD_STORE_8P_2X 4, 5, 9, 10, 12
388 lea outputq, [outputq + r3]
389 ADD_STORE_8P_2X 6, 7, 9, 10, 12
390
391 RET
392
; idxN = 16 * (N mod 8): byte offset of result N inside its group of eight
; 16-byte vectors. The 32x32 macros add one of four group base offsets
; (their %1..%4 arguments) to select results 0-7, 8-15, 16-23 or 24-31.
Scott LaVarnwayed833042015-11-17 17:42:24 -0800393%define idx0 16 * 0
394%define idx1 16 * 1
395%define idx2 16 * 2
396%define idx3 16 * 3
397%define idx4 16 * 4
398%define idx5 16 * 5
399%define idx6 16 * 6
400%define idx7 16 * 7
401%define idx8 16 * 0
402%define idx9 16 * 1
403%define idx10 16 * 2
404%define idx11 16 * 3
405%define idx12 16 * 4
406%define idx13 16 * 5
407%define idx14 16 * 6
408%define idx15 16 * 7
409%define idx16 16 * 0
410%define idx17 16 * 1
411%define idx18 16 * 2
412%define idx19 16 * 3
413%define idx20 16 * 4
414%define idx21 16 * 5
415%define idx22 16 * 6
416%define idx23 16 * 7
417%define idx24 16 * 0
418%define idx25 16 * 1
419%define idx26 16 * 2
420%define idx27 16 * 3
421%define idx28 16 * 4
422%define idx29 16 * 5
423%define idx30 16 * 6
424%define idx31 16 * 7
425
Scott LaVarnway0148e202015-11-25 10:11:29 -0800426; FROM idct32x32_add_neon.asm
427;
428; Instead of doing the transforms stage by stage, it is done by loading
429; some input values and doing as many stages as possible to minimize the
430; storing/loading of intermediate results. To fit within registers, the
431; final coefficients are cut into four blocks:
432; BLOCK A: 16-19,28-31
433; BLOCK B: 20-23,24-27
434; BLOCK C: 8-11,12-15
435; BLOCK D: 0-3,4-7
436; Blocks A and C are straight calculation through the various stages. In
437; block B, further calculations are performed using the results from
438; block A. In block D, further calculations are performed using the results
439; from block C and then the final calculations are done using results from
440; block A and B which have been combined at the end of block B.
441;
442
; IDCT32X32_34 off0, off1, off2, off3
;   One 1-D pass of the 32-point inverse transform when only eight input
;   vectors are supplied (m0..m7).
;   In:   m0..m7  input vectors (the visible callers run TRANSPOSE8X8 first)
;         m8      rounding term handed to BUTTERFLY_4X / BUTTERFLY_4Xmm
;         r4      spill area: m0, m2, m4, m6 are saved at [r4 + 16*{0,2,4,6}]
;         stp     base address of the result area
;   Out:  results 0-7   at [stp + %1 + idxN]
;         results 8-15  at [stp + %2 + idxN]
;         results 16-23 at [stp + %3 + idxN]
;         results 24-31 at [stp + %4 + idxN]
;   Clobbers m0-m7 and m9-m15; m8 is only read.
;   NOTE(review): the spills go through r4 but are reloaded from
;   [rsp + transposed_in + ...], so the macro only works when
;   r4 == rsp + transposed_in, which is how the visible callers set it up.
Scott LaVarnway97e6cc62015-11-23 10:24:09 -0800443%macro IDCT32X32_34 4
Scott LaVarnwayed833042015-11-17 17:42:24 -0800444 ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
445 mova m11, m1
446 pmulhrsw m1, [pw___804x2] ; stp1_16
447 mova [r4 + 0], m0
448 pmulhrsw m11, [pw_16364x2] ; stp2_31
449 mova [r4 + 16 * 2], m2
450 mova m12, m7
451 pmulhrsw m7, [pw_15426x2] ; stp1_28
452 mova [r4 + 16 * 4], m4
453 pmulhrsw m12, [pw_m5520x2] ; stp2_19
454 mova [r4 + 16 * 6], m6
455
456 ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
457 mova m2, m1 ; stp1_16
458 mova m0, m11 ; stp1_31
459 mova m4, m7 ; stp1_28
460 mova m15, m12 ; stp1_19
461
462 ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
463 BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
464 BUTTERFLY_4Xmm 4, 15, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
465
466 ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
467 SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
468 SUM_SUB 0, 15, 9 ; stp2_17, stp2_18
469 SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
470 SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
471
472 ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
473 BUTTERFLY_4X 4, 15, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
474 BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
475
476 ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; block A results 28-31 are parked in the result area while block B is computed
477 mova m6, m5
478 pmulhrsw m5, [pw__3981x2] ; stp1_20
479 mova [stp + %4 + idx28], m12
480 mova [stp + %4 + idx29], m15
481 pmulhrsw m6, [pw_15893x2] ; stp2_27
482 mova [stp + %4 + idx30], m2
483 mova m2, m3
484 pmulhrsw m3, [pw_m2404x2] ; stp1_23
485 mova [stp + %4 + idx31], m11
486 pmulhrsw m2, [pw_16207x2] ; stp2_24
487
488 ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
489 mova m13, m5 ; stp1_20
490 mova m14, m6 ; stp1_27
491 mova m15, m3 ; stp1_23
492 mova m11, m2 ; stp1_24
493
494 ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
495 BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
496 BUTTERFLY_4Xmm 11, 15, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
497
498 ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
499 SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
500 SUM_SUB 15, 14, 9 ; stp2_22, stp2_21
501 SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
502 SUM_SUB 11, 13, 9 ; stp2_25, stp2_26
503
504 ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
505 BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
506 BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
507
508 ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
509 SUM_SUB 1, 3, 9 ; stp2_16, stp2_23
510 SUM_SUB 0, 15, 9 ; stp2_17, stp2_22
511 SUM_SUB 4, 14, 9 ; stp2_18, stp2_21
512 SUM_SUB 7, 5, 9 ; stp2_19, stp2_20
513 mova [stp + %3 + idx16], m1
514 mova [stp + %3 + idx17], m0
515 mova [stp + %3 + idx18], m4
516 mova [stp + %3 + idx19], m7
517
518 mova m4, [stp + %4 + idx28]
519 mova m7, [stp + %4 + idx29]
520 mova m10, [stp + %4 + idx30]
521 mova m12, [stp + %4 + idx31]
522 SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
523 SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
524 SUM_SUB 10, 11, 9 ; stp2_30, stp2_25
525 SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
526 mova [stp + %4 + idx28], m4
527 mova [stp + %4 + idx29], m7
528 mova [stp + %4 + idx30], m10
529 mova [stp + %4 + idx31], m12
530
531 ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; The %if 0 branch is the 16-bit SUM_SUB + pmulhrsw form, kept for reference
; only; the live %else branch does the same step through BUTTERFLY_4X, whose
; sums are formed in 32 bits.
532%if 0 ; overflow occurs in SUM_SUB when using test streams
533 mova m10, [pw_11585x2]
Scott LaVarnway0148e202015-11-25 10:11:29 -0800534 SUM_SUB 6, 5, 9
Scott LaVarnwayed833042015-11-17 17:42:24 -0800535 pmulhrsw m6, m10 ; stp1_27
536 pmulhrsw m5, m10 ; stp1_20
537 SUM_SUB 13, 14, 9
538 pmulhrsw m13, m10 ; stp1_26
539 pmulhrsw m14, m10 ; stp1_21
540 SUM_SUB 11, 15, 9
541 pmulhrsw m11, m10 ; stp1_25
542 pmulhrsw m15, m10 ; stp1_22
543 SUM_SUB 2, 3, 9
544 pmulhrsw m2, m10 ; stp1_24
545 pmulhrsw m3, m10 ; stp1_23
546%else
547 BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
548 SWAP 6, 5
549 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
550 SWAP 13, 14
551 BUTTERFLY_4X 11, 15, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
552 SWAP 11, 15
553 BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
554 SWAP 2, 3
555%endif
556
557 mova [stp + %4 + idx24], m2
558 mova [stp + %4 + idx25], m11
559 mova [stp + %4 + idx26], m13
560 mova [stp + %4 + idx27], m6
561
562 ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
563 ;
564 ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
; reload the inputs spilled through r4 at the top of the macro
565 mova m0, [rsp + transposed_in + 16 * 2]
566 mova m6, [rsp + transposed_in + 16 * 6]
567
568 mova m1, m0
569 pmulhrsw m0, [pw__1606x2] ; stp1_8
570 mova [stp + %3 + idx20], m5
571 mova [stp + %3 + idx21], m14
572 pmulhrsw m1, [pw_16305x2] ; stp2_15
573 mova [stp + %3 + idx22], m15
574 mova m7, m6
575 pmulhrsw m7, [pw_m4756x2] ; stp2_11
576 mova [stp + %3 + idx23], m3
577 pmulhrsw m6, [pw_15679x2] ; stp1_12
578
579 ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
580 mova m3, m0 ; stp1_8
581 mova m2, m1 ; stp1_15
582
583 ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
584 BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
585 mova m4, m7 ; stp1_11
586 mova m5, m6 ; stp1_12
587 BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
588
589 ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
590 SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
591 SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
592 SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
593 SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
594
595 ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
596%if 0 ; overflow occurs in SUM_SUB when using test streams
597 mova m10, [pw_11585x2]
Scott LaVarnway0148e202015-11-25 10:11:29 -0800598 SUM_SUB 5, 4, 9
Scott LaVarnwayed833042015-11-17 17:42:24 -0800599 pmulhrsw m5, m10 ; stp1_13
600 pmulhrsw m4, m10 ; stp1_10
Scott LaVarnway0148e202015-11-25 10:11:29 -0800601 SUM_SUB 6, 7, 9
Scott LaVarnwayed833042015-11-17 17:42:24 -0800602 pmulhrsw m6, m10 ; stp1_12
603 pmulhrsw m7, m10 ; stp1_11
604%else
605 BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
606 SWAP 5, 4
607 BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
608 SWAP 6, 7
609%endif
610
611 ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
612 mova [stp + %2 + idx8], m0
613 mova [stp + %2 + idx9], m2
614 mova [stp + %2 + idx10], m4
615 mova [stp + %2 + idx11], m7
616
617 ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
618 ;
619 ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
620 ;
621 ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
622 mova m11, [rsp + transposed_in + 16 * 4]
623 mova m12, m11
624 pmulhrsw m11, [pw__3196x2] ; stp1_4
625 pmulhrsw m12, [pw_16069x2] ; stp1_7
626
627 ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
628 mova m0, [rsp + transposed_in + 16 * 0]
629 mova m10, [pw_11585x2]
Scott LaVarnwayed833042015-11-17 17:42:24 -0800630 pmulhrsw m0, m10 ; stp1_1
Scott LaVarnwayed833042015-11-17 17:42:24 -0800631
632 mova m14, m11 ; stp1_4
633 mova m13, m12 ; stp1_7
634
635 ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
636%if 0 ; overflow occurs in SUM_SUB when using test streams
637 SUM_SUB 13, 14, 9
638 pmulhrsw m13, m10 ; stp1_6
639 pmulhrsw m14, m10 ; stp1_5
640%else
641 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
642 SWAP 13, 14
643%endif
Scott LaVarnway97e6cc62015-11-23 10:24:09 -0800644 mova m7, m0 ; stp1_0 = stp1_1
Scott LaVarnwayed833042015-11-17 17:42:24 -0800645 mova m4, m0 ; stp1_1
646 mova m2, m7 ; stp1_0
647
648 ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
649 SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
650 SUM_SUB 7, 13, 9 ; stp1_1, stp1_6
651 SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
652 SUM_SUB 4, 11, 9 ; stp1_3, stp1_4
653
654 ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
655 SUM_SUB 0, 1, 9 ; stp1_0, stp1_15
656 SUM_SUB 7, 3, 9 ; stp1_1, stp1_14
657 SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
658 SUM_SUB 4, 6, 9 ; stp1_3, stp1_12
659
660 ; 0-3, 28-31 final stage
661 mova m15, [stp + %4 + idx30]
662 mova m10, [stp + %4 + idx31]
663 SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
664 SUM_SUB 7, 15, 9 ; stp1_1, stp1_30
665 mova [stp + %1 + idx0], m0
666 mova [stp + %1 + idx1], m7
667 mova [stp + %4 + idx30], m15
668 mova [stp + %4 + idx31], m10
669 mova m7, [stp + %4 + idx28]
670 mova m0, [stp + %4 + idx29]
671 SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
672 SUM_SUB 4, 7, 9 ; stp1_3, stp1_28
673 mova [stp + %1 + idx2], m2
674 mova [stp + %1 + idx3], m4
675 mova [stp + %4 + idx28], m7
676 mova [stp + %4 + idx29], m0
677
678 ; 12-15, 16-19 final stage
679 mova m0, [stp + %3 + idx16]
680 mova m7, [stp + %3 + idx17]
681 mova m2, [stp + %3 + idx18]
682 mova m4, [stp + %3 + idx19]
683 SUM_SUB 1, 0, 9 ; stp1_15, stp1_16
684 SUM_SUB 3, 7, 9 ; stp1_14, stp1_17
685 SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
686 SUM_SUB 6, 4, 9 ; stp1_12, stp1_19
687 mova [stp + %2 + idx12], m6
688 mova [stp + %2 + idx13], m5
689 mova [stp + %2 + idx14], m3
690 mova [stp + %2 + idx15], m1
691 mova [stp + %3 + idx16], m0
692 mova [stp + %3 + idx17], m7
693 mova [stp + %3 + idx18], m2
694 mova [stp + %3 + idx19], m4
695
696 mova m4, [stp + %2 + idx8]
697 mova m5, [stp + %2 + idx9]
698 mova m6, [stp + %2 + idx10]
699 mova m7, [stp + %2 + idx11]
700 SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
701 SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
702 SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
703 SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
704
705 ; 4-7, 24-27 final stage
706 mova m0, [stp + %4 + idx27]
707 mova m1, [stp + %4 + idx26]
708 mova m2, [stp + %4 + idx25]
709 mova m3, [stp + %4 + idx24]
710 SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
711 SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
712 SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
713 SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
714 mova [stp + %4 + idx27], m0
715 mova [stp + %4 + idx26], m1
716 mova [stp + %4 + idx25], m2
717 mova [stp + %4 + idx24], m3
718 mova [stp + %1 + idx4], m11
719 mova [stp + %1 + idx5], m14
720 mova [stp + %1 + idx6], m13
721 mova [stp + %1 + idx7], m12
722
723 ; 8-11, 20-23 final stage
724 mova m0, [stp + %3 + idx20]
725 mova m1, [stp + %3 + idx21]
726 mova m2, [stp + %3 + idx22]
727 mova m3, [stp + %3 + idx23]
728 SUM_SUB 7, 0, 9 ; stp1_11, stp1_20
729 SUM_SUB 6, 1, 9 ; stp1_10, stp1_21
730 SUM_SUB 5, 2, 9 ; stp1_9, stp1_22
731 SUM_SUB 4, 3, 9 ; stp1_8, stp1_23
732 mova [stp + %2 + idx8], m4
733 mova [stp + %2 + idx9], m5
734 mova [stp + %2 + idx10], m6
735 mova [stp + %2 + idx11], m7
736 mova [stp + %3 + idx20], m0
737 mova [stp + %3 + idx21], m1
738 mova [stp + %3 + idx22], m2
739 mova [stp + %3 + idx23], m3
740%endmacro
741
; RECON_AND_STORE offset
;   Final rounding and reconstruction of a 32-pixel-wide block, 32 rows.
;   Per row: four vectors of eight words are read from
;   [rsp + offset + 16*32*k + 16*row], k = 0..3, rounded as (x + 32) >> 6,
;   added to the 32 destination bytes at outputq and written back with
;   unsigned saturation; outputq then advances by strideq.
;   Clobbers m0-m8, m11, r6 (row counter) and stp; outputq is left 32 rows
;   further on.
;   NOTE(review): the two stores use mova, i.e. aligned stores - presumably the
;   callers guarantee 16-byte-aligned output rows; confirm.
742%macro RECON_AND_STORE 1
743 mova m11, [pw_32]
744 lea stp, [rsp + %1]
745 mov r6, 32
; m8 = 0, used to widen bytes to words
746 pxor m8, m8
747%%recon_and_store:
748 mova m0, [stp + 16 * 32 * 0]
749 mova m1, [stp + 16 * 32 * 1]
750 mova m2, [stp + 16 * 32 * 2]
751 mova m3, [stp + 16 * 32 * 3]
752 add stp, 16
753
754 paddw m0, m11
755 paddw m1, m11
756 paddw m2, m11
757 paddw m3, m11
758 psraw m0, 6
759 psraw m1, 6
760 psraw m2, 6
761 psraw m3, 6
762 movh m4, [outputq + 0]
763 movh m5, [outputq + 8]
764 movh m6, [outputq + 16]
765 movh m7, [outputq + 24]
766 punpcklbw m4, m8
767 punpcklbw m5, m8
768 punpcklbw m6, m8
769 punpcklbw m7, m8
770 paddw m0, m4
771 paddw m1, m5
772 paddw m2, m6
773 paddw m3, m7
774 packuswb m0, m1
775 packuswb m2, m3
776 mova [outputq + 0], m0
777 mova [outputq + 16], m2
778 lea outputq, [outputq + strideq]
779 dec r6
780 jnz %%recon_and_store
781%endmacro
782
; Stack layout shared by the 32x32 functions (byte offsets from rsp):
;   i32x32_size     total stack reservation, five areas of 16*32 bytes
;   pass_one_start  first-pass results (offset 0)
;   pass_two_start  second-pass results (also offset 0, same area)
;   transposed_in   fifth 16*32-byte area, holds the transposed input vectors
;   stp             r8, the running pointer into the result area
783%define i32x32_size 16*32*5
784%define pass_two_start 16*32*0
785%define transposed_in 16*32*4
786%define pass_one_start 16*32*0
787%define stp r8
788
789INIT_XMM ssse3
; idct32x32_34_add(input, output, stride)
;   32x32 inverse transform + add for the case handled by IDCT32X32_34: only
;   the top-left 8x8 corner of the coefficient block is read.
;   cglobal declares 3 arguments, 11 general-purpose registers, 16 XMM
;   registers and i32x32_size bytes of stack.
;   Pass 1: load 8 rows x 8 coefficients, transpose, run IDCT32X32_34 once.
;   Pass 2: four iterations, each transposing eight first-pass vectors and
;           running IDCT32X32_34 again.
;   Finally RECON_AND_STORE rounds the result and adds it to output.
;   The labels idct32x32_34, idct32x32_34_transpose and
;   idct32x32_34_transpose_2 are not branch targets in the visible code.
790cglobal idct32x32_34_add, 3, 11, 16, i32x32_size, input, output, stride
; m8 stays the butterfly rounding term for both passes
791 mova m8, [pd_8192]
792 lea stp, [rsp + pass_one_start]
793
794idct32x32_34:
795 mov r3, inputq
; r4 must be rsp + transposed_in: IDCT32X32_34 spills through r4 and reloads
; from [rsp + transposed_in + ...]
796 lea r4, [rsp + transposed_in]
797
798idct32x32_34_transpose:
Yaowu Xuf883b422016-08-30 14:01:10 -0700799%if CONFIG_AOM_HIGHBITDEPTH
; dword coefficients, 32*4 bytes per row: pack the first eight of each row to
; words with signed saturation
Yaowu Xuaac1ef72016-01-27 15:25:42 -0800800 mova m0, [r3 + 0]
801 packssdw m0, [r3 + 16]
802 mova m1, [r3 + 32 * 4]
803 packssdw m1, [r3 + 32 * 4 + 16]
804 mova m2, [r3 + 32 * 8]
805 packssdw m2, [r3 + 32 * 8 + 16]
806 mova m3, [r3 + 32 * 12]
807 packssdw m3, [r3 + 32 * 12 + 16]
808 mova m4, [r3 + 32 * 16]
809 packssdw m4, [r3 + 32 * 16 + 16]
810 mova m5, [r3 + 32 * 20]
811 packssdw m5, [r3 + 32 * 20 + 16]
812 mova m6, [r3 + 32 * 24]
813 packssdw m6, [r3 + 32 * 24 + 16]
814 mova m7, [r3 + 32 * 28]
815 packssdw m7, [r3 + 32 * 28 + 16]
816%else
; word coefficients, 16*4 bytes per row: first eight of each of rows 0..7
Scott LaVarnwayed833042015-11-17 17:42:24 -0800817 mova m0, [r3 + 0]
818 mova m1, [r3 + 16 * 4]
819 mova m2, [r3 + 16 * 8]
820 mova m3, [r3 + 16 * 12]
821 mova m4, [r3 + 16 * 16]
822 mova m5, [r3 + 16 * 20]
823 mova m6, [r3 + 16 * 24]
824 mova m7, [r3 + 16 * 28]
Yaowu Xuaac1ef72016-01-27 15:25:42 -0800825%endif
Scott LaVarnwayed833042015-11-17 17:42:24 -0800826
827 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
828
; pass 1: result groups are 16*32 bytes apart
Scott LaVarnway97e6cc62015-11-23 10:24:09 -0800829 IDCT32X32_34 16*0, 16*32, 16*64, 16*96
; NOTE(review): the next lea has no effect - stp is overwritten two
; instructions later without being read in between.
Scott LaVarnwayed833042015-11-17 17:42:24 -0800830 lea stp, [stp + 16 * 8]
; r6 = number of pass-2 iterations; r9 = read pointer into the pass-1 results
831 mov r6, 4
832 lea stp, [rsp + pass_one_start]
833 lea r9, [rsp + pass_one_start]
834
835idct32x32_34_2:
836 lea r4, [rsp + transposed_in]
837 mov r3, r9
838
839idct32x32_34_transpose_2:
840 mova m0, [r3 + 0]
841 mova m1, [r3 + 16 * 1]
842 mova m2, [r3 + 16 * 2]
843 mova m3, [r3 + 16 * 3]
844 mova m4, [r3 + 16 * 4]
845 mova m5, [r3 + 16 * 5]
846 mova m6, [r3 + 16 * 6]
847 mova m7, [r3 + 16 * 7]
848
849 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
850
; pass 2: result groups are 16*8 bytes apart, 32 consecutive vectors in total
Scott LaVarnway97e6cc62015-11-23 10:24:09 -0800851 IDCT32X32_34 16*0, 16*8, 16*16, 16*24
Scott LaVarnwayed833042015-11-17 17:42:24 -0800852
; advance both the write pointer and the read pointer by one 16*32-byte area
853 lea stp, [stp + 16 * 32]
854 add r9, 16 * 32
855 dec r6
856 jnz idct32x32_34_2
857
858 RECON_AND_STORE pass_two_start
859
860 RET
Scott LaVarnway0148e202015-11-25 10:11:29 -0800861
Scott LaVarnwayf0b0b1f2015-12-02 04:50:46 -0800862%macro IDCT32X32_135 4
863 ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
864 mova m1, [rsp + transposed_in + 16 * 1]
865 mova m11, m1
866 pmulhrsw m1, [pw___804x2] ; stp1_16
867 pmulhrsw m11, [pw_16364x2] ; stp2_31
868
869 mova m7, [rsp + transposed_in + 16 * 7]
870 mova m12, m7
871 pmulhrsw m7, [pw_15426x2] ; stp1_28
872 pmulhrsw m12, [pw_m5520x2] ; stp2_19
873
874 mova m3, [rsp + transposed_in + 16 * 9]
875 mova m4, m3
876 pmulhrsw m3, [pw__7005x2] ; stp1_18
877 pmulhrsw m4, [pw_14811x2] ; stp2_29
878
879 mova m0, [rsp + transposed_in + 16 * 15]
880 mova m2, m0
881 pmulhrsw m0, [pw_12140x2] ; stp1_30
882 pmulhrsw m2, [pw_m11003x2] ; stp2_17
883
884 ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
885 SUM_SUB 1, 2, 9 ; stp2_16, stp2_17
886 SUM_SUB 12, 3, 9 ; stp2_19, stp2_18
887 SUM_SUB 7, 4, 9 ; stp2_28, stp2_29
888 SUM_SUB 11, 0, 9 ; stp2_31, stp2_30
889
890 ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
891 BUTTERFLY_4X 0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
892 BUTTERFLY_4Xmm 4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18
893
894 ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
895 SUM_SUB 1, 12, 9 ; stp2_16, stp2_19
896 SUM_SUB 0, 3, 9 ; stp2_17, stp2_18
897 SUM_SUB 11, 7, 9 ; stp2_31, stp2_28
898 SUM_SUB 2, 4, 9 ; stp2_30, stp2_29
899
900 ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
901 BUTTERFLY_4X 4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
902 BUTTERFLY_4X 7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28
903
904 mova [stp + %3 + idx16], m1
905 mova [stp + %3 + idx17], m0
906 mova [stp + %3 + idx18], m4
907 mova [stp + %3 + idx19], m7
908 mova [stp + %4 + idx28], m12
909 mova [stp + %4 + idx29], m3
910 mova [stp + %4 + idx30], m2
911 mova [stp + %4 + idx31], m11
912
913 ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
914 mova m2, [rsp + transposed_in + 16 * 3]
915 mova m3, m2
916 pmulhrsw m3, [pw_m2404x2] ; stp1_23
917 pmulhrsw m2, [pw_16207x2] ; stp2_24
918
919 mova m5, [rsp + transposed_in + 16 * 5]
920 mova m6, m5
921 pmulhrsw m5, [pw__3981x2] ; stp1_20
922 pmulhrsw m6, [pw_15893x2] ; stp2_27
923
924 mova m14, [rsp + transposed_in + 16 * 11]
925 mova m13, m14
926 pmulhrsw m13, [pw_m8423x2] ; stp1_21
927 pmulhrsw m14, [pw_14053x2] ; stp2_26
928
929 mova m0, [rsp + transposed_in + 16 * 13]
930 mova m1, m0
931 pmulhrsw m0, [pw__9760x2] ; stp1_22
932 pmulhrsw m1, [pw_13160x2] ; stp2_25
933
934 ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
935 SUM_SUB 5, 13, 9 ; stp2_20, stp2_21
936 SUM_SUB 3, 0, 9 ; stp2_23, stp2_22
937 SUM_SUB 2, 1, 9 ; stp2_24, stp2_25
938 SUM_SUB 6, 14, 9 ; stp2_27, stp2_26
939
940 ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
941 BUTTERFLY_4X 14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
942 BUTTERFLY_4Xmm 1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22
943
944 ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
945 SUM_SUB 3, 5, 9 ; stp2_23, stp2_20
946 SUM_SUB 0, 14, 9 ; stp2_22, stp2_21
947 SUM_SUB 2, 6, 9 ; stp2_24, stp2_27
948 SUM_SUB 1, 13, 9 ; stp2_25, stp2_26
949
950 ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
951 BUTTERFLY_4Xmm 6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
952 BUTTERFLY_4Xmm 13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21
953
954 ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
955 mova m4, [stp + %3 + idx16]
956 mova m7, [stp + %3 + idx17]
957 mova m11, [stp + %3 + idx18]
958 mova m12, [stp + %3 + idx19]
959 SUM_SUB 4, 3, 9 ; stp2_16, stp2_23
960 SUM_SUB 7, 0, 9 ; stp2_17, stp2_22
961 SUM_SUB 11, 14, 9 ; stp2_18, stp2_21
962 SUM_SUB 12, 5, 9 ; stp2_19, stp2_20
963 mova [stp + %3 + idx16], m4
964 mova [stp + %3 + idx17], m7
965 mova [stp + %3 + idx18], m11
966 mova [stp + %3 + idx19], m12
967
968 mova m4, [stp + %4 + idx28]
969 mova m7, [stp + %4 + idx29]
970 mova m11, [stp + %4 + idx30]
971 mova m12, [stp + %4 + idx31]
972 SUM_SUB 4, 6, 9 ; stp2_28, stp2_27
973 SUM_SUB 7, 13, 9 ; stp2_29, stp2_26
974 SUM_SUB 11, 1, 9 ; stp2_30, stp2_25
975 SUM_SUB 12, 2, 9 ; stp2_31, stp2_24
976 mova [stp + %4 + idx28], m4
977 mova [stp + %4 + idx29], m7
978 mova [stp + %4 + idx30], m11
979 mova [stp + %4 + idx31], m12
980
981 ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
982%if 0 ; overflow occurs in SUM_SUB when using test streams
983 mova m10, [pw_11585x2]
984 SUM_SUB 6, 5, 9
985 pmulhrsw m6, m10 ; stp1_27
986 pmulhrsw m5, m10 ; stp1_20
987 SUM_SUB 13, 14, 9
988 pmulhrsw m13, m10 ; stp1_26
989 pmulhrsw m14, m10 ; stp1_21
990 SUM_SUB 1, 0, 9
991 pmulhrsw m1, m10 ; stp1_25
992 pmulhrsw m0, m10 ; stp1_22
993 SUM_SUB 2, 3, 9
994 pmulhrsw m2, m10 ; stp1_25
995 pmulhrsw m3, m10 ; stp1_22
996%else
997 BUTTERFLY_4X 6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
998 SWAP 6, 5
999 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
1000 SWAP 13, 14
1001 BUTTERFLY_4X 1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
1002 SWAP 1, 0
1003 BUTTERFLY_4X 2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
1004 SWAP 2, 3
1005%endif
1006 mova [stp + %3 + idx20], m5
1007 mova [stp + %3 + idx21], m14
1008 mova [stp + %3 + idx22], m0
1009 mova [stp + %3 + idx23], m3
1010 mova [stp + %4 + idx24], m2
1011 mova [stp + %4 + idx25], m1
1012 mova [stp + %4 + idx26], m13
1013 mova [stp + %4 + idx27], m6
1014
1015 ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1016 ;
1017 ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1018 mova m0, [rsp + transposed_in + 16 * 2]
1019 mova m1, m0
1020 pmulhrsw m0, [pw__1606x2] ; stp1_8
1021 pmulhrsw m1, [pw_16305x2] ; stp2_15
1022
1023 mova m6, [rsp + transposed_in + 16 * 6]
1024 mova m7, m6
1025 pmulhrsw m7, [pw_m4756x2] ; stp2_11
1026 pmulhrsw m6, [pw_15679x2] ; stp1_12
1027
1028 mova m4, [rsp + transposed_in + 16 * 10]
1029 mova m5, m4
1030 pmulhrsw m4, [pw__7723x2] ; stp1_10
1031 pmulhrsw m5, [pw_14449x2] ; stp2_13
1032
1033 mova m2, [rsp + transposed_in + 16 * 14]
1034 mova m3, m2
1035 pmulhrsw m3, [pw_m10394x2] ; stp1_9
1036 pmulhrsw m2, [pw_12665x2] ; stp2_14
1037
1038 ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1039 SUM_SUB 0, 3, 9 ; stp1_8, stp1_9
1040 SUM_SUB 7, 4, 9 ; stp1_11, stp1_10
1041 SUM_SUB 6, 5, 9 ; stp1_12, stp1_13
1042 SUM_SUB 1, 2, 9 ; stp1_15, stp1_14
1043
1044 ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1045 BUTTERFLY_4X 2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
1046 BUTTERFLY_4Xmm 5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10
1047
1048 ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1049 SUM_SUB 0, 7, 9 ; stp1_8, stp1_11
1050 SUM_SUB 2, 4, 9 ; stp1_9, stp1_10
1051 SUM_SUB 1, 6, 9 ; stp1_15, stp1_12
1052 SUM_SUB 3, 5, 9 ; stp1_14, stp1_13
1053
1054 ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1055%if 0 ; overflow occurs in SUM_SUB when using test streams
1056 mova m10, [pw_11585x2]
1057 SUM_SUB 5, 4, 9
1058 pmulhrsw m5, m10 ; stp1_13
1059 pmulhrsw m4, m10 ; stp1_10
1060 SUM_SUB 6, 7, 9
1061 pmulhrsw m6, m10 ; stp1_12
1062 pmulhrsw m7, m10 ; stp1_11
1063%else
1064 BUTTERFLY_4X 5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
1065 SWAP 5, 4
1066 BUTTERFLY_4X 6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
1067 SWAP 6, 7
1068%endif
1069 ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1070 mova [stp + %2 + idx8], m0
1071 mova [stp + %2 + idx9], m2
1072 mova [stp + %2 + idx10], m4
1073 mova [stp + %2 + idx11], m7
1074 mova [stp + %2 + idx12], m6
1075 mova [stp + %2 + idx13], m5
1076 mova [stp + %2 + idx14], m3
1077 mova [stp + %2 + idx15], m1
1078
1079 ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1080 ;
1081 ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1082 ;
1083 ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1084 mova m11, [rsp + transposed_in + 16 * 4]
1085 mova m12, m11
1086 pmulhrsw m11, [pw__3196x2] ; stp1_4
1087 pmulhrsw m12, [pw_16069x2] ; stp1_7
1088
1089 mova m13, [rsp + transposed_in + 16 * 12]
1090 mova m14, m13
1091 pmulhrsw m13, [pw_13623x2] ; stp1_6
1092 pmulhrsw m14, [pw_m9102x2] ; stp1_5
1093
1094 ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1095 mova m0, [rsp + transposed_in + 16 * 0]
1096 mova m2, [rsp + transposed_in + 16 * 8]
1097 pmulhrsw m0, [pw_11585x2] ; stp1_1
1098 mova m3, m2
1099 pmulhrsw m2, [pw__6270x2] ; stp1_2
1100 pmulhrsw m3, [pw_15137x2] ; stp1_3
1101
1102 SUM_SUB 11, 14, 9 ; stp1_4, stp1_5
1103 SUM_SUB 12, 13, 9 ; stp1_7, stp1_6
1104
1105 ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1106%if 0 ; overflow occurs in SUM_SUB when using test streams
1107 mova m10, [pw_11585x2]
1108 SUM_SUB 13, 14, 9
1109 pmulhrsw m13, m10 ; stp1_6
1110 pmulhrsw m14, m10 ; stp1_5
1111%else
1112 BUTTERFLY_4X 13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
1113 SWAP 13, 14
1114%endif
1115 mova m1, m0 ; stp1_0 = stp1_1
1116 SUM_SUB 0, 3, 9 ; stp1_0, stp1_3
1117 SUM_SUB 1, 2, 9 ; stp1_1, stp1_2
1118
1119 ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1120 SUM_SUB 0, 12, 9 ; stp1_0, stp1_7
1121 SUM_SUB 1, 13, 9 ; stp1_1, stp1_6
1122 SUM_SUB 2, 14, 9 ; stp1_2, stp1_5
1123 SUM_SUB 3, 11, 9 ; stp1_3, stp1_4
1124
1125 ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1126 mova m4, [stp + %2 + idx12]
1127 mova m5, [stp + %2 + idx13]
1128 mova m6, [stp + %2 + idx14]
1129 mova m7, [stp + %2 + idx15]
1130 SUM_SUB 0, 7, 9 ; stp1_0, stp1_15
1131 SUM_SUB 1, 6, 9 ; stp1_1, stp1_14
1132 SUM_SUB 2, 5, 9 ; stp1_2, stp1_13
1133 SUM_SUB 3, 4, 9 ; stp1_3, stp1_12
1134
1135 ; 0-3, 28-31 final stage
1136 mova m10, [stp + %4 + idx31]
1137 mova m15, [stp + %4 + idx30]
1138 SUM_SUB 0, 10, 9 ; stp1_0, stp1_31
1139 SUM_SUB 1, 15, 9 ; stp1_1, stp1_30
1140 mova [stp + %1 + idx0], m0
1141 mova [stp + %1 + idx1], m1
1142 mova [stp + %4 + idx31], m10
1143 mova [stp + %4 + idx30], m15
1144 mova m0, [stp + %4 + idx29]
1145 mova m1, [stp + %4 + idx28]
1146 SUM_SUB 2, 0, 9 ; stp1_2, stp1_29
1147 SUM_SUB 3, 1, 9 ; stp1_3, stp1_28
1148 mova [stp + %1 + idx2], m2
1149 mova [stp + %1 + idx3], m3
1150 mova [stp + %4 + idx29], m0
1151 mova [stp + %4 + idx28], m1
1152
1153 ; 12-15, 16-19 final stage
1154 mova m0, [stp + %3 + idx16]
1155 mova m1, [stp + %3 + idx17]
1156 mova m2, [stp + %3 + idx18]
1157 mova m3, [stp + %3 + idx19]
1158 SUM_SUB 7, 0, 9 ; stp1_15, stp1_16
1159 SUM_SUB 6, 1, 9 ; stp1_14, stp1_17
1160 SUM_SUB 5, 2, 9 ; stp1_13, stp1_18
1161 SUM_SUB 4, 3, 9 ; stp1_12, stp1_19
1162 mova [stp + %2 + idx12], m4
1163 mova [stp + %2 + idx13], m5
1164 mova [stp + %2 + idx14], m6
1165 mova [stp + %2 + idx15], m7
1166 mova [stp + %3 + idx16], m0
1167 mova [stp + %3 + idx17], m1
1168 mova [stp + %3 + idx18], m2
1169 mova [stp + %3 + idx19], m3
1170
1171 mova m4, [stp + %2 + idx8]
1172 mova m5, [stp + %2 + idx9]
1173 mova m6, [stp + %2 + idx10]
1174 mova m7, [stp + %2 + idx11]
1175 SUM_SUB 11, 7, 9 ; stp1_4, stp1_11
1176 SUM_SUB 14, 6, 9 ; stp1_5, stp1_10
1177 SUM_SUB 13, 5, 9 ; stp1_6, stp1_9
1178 SUM_SUB 12, 4, 9 ; stp1_7, stp1_8
1179
1180 ; 4-7, 24-27 final stage
1181 mova m3, [stp + %4 + idx24]
1182 mova m2, [stp + %4 + idx25]
1183 mova m1, [stp + %4 + idx26]
1184 mova m0, [stp + %4 + idx27]
1185 SUM_SUB 12, 3, 9 ; stp1_7, stp1_24
1186 SUM_SUB 13, 2, 9 ; stp1_6, stp1_25
1187 SUM_SUB 14, 1, 9 ; stp1_5, stp1_26
1188 SUM_SUB 11, 0, 9 ; stp1_4, stp1_27
1189 mova [stp + %4 + idx24], m3
1190 mova [stp + %4 + idx25], m2
1191 mova [stp + %4 + idx26], m1
1192 mova [stp + %4 + idx27], m0
1193 mova [stp + %1 + idx4], m11
1194 mova [stp + %1 + idx5], m14
1195 mova [stp + %1 + idx6], m13
1196 mova [stp + %1 + idx7], m12
1197
1198 ; 8-11, 20-23 final stage
1199 mova m0, [stp + %3 + idx20]
1200 mova m1, [stp + %3 + idx21]
1201 mova m2, [stp + %3 + idx22]
1202 mova m3, [stp + %3 + idx23]
1203 SUM_SUB 7, 0, 9 ; stp1_11, stp_20
1204 SUM_SUB 6, 1, 9 ; stp1_10, stp_21
1205 SUM_SUB 5, 2, 9 ; stp1_9, stp_22
1206 SUM_SUB 4, 3, 9 ; stp1_8, stp_23
1207 mova [stp + %2 + idx8], m4
1208 mova [stp + %2 + idx9], m5
1209 mova [stp + %2 + idx10], m6
1210 mova [stp + %2 + idx11], m7
1211 mova [stp + %3 + idx20], m0
1212 mova [stp + %3 + idx21], m1
1213 mova [stp + %3 + idx22], m2
1214 mova [stp + %3 + idx23], m3
1215%endmacro
1216
;-----------------------------------------------------------------------------
; idct32x32_135_add(input, output, stride)                      [SSSE3, x86-64]
;
; Two-pass 32x32 inverse transform built on the IDCT32X32_135 macro.
;
;   Pass 1: two iterations (r6).  Each iteration transposes two 8x8 tiles of
;           coefficients into the `transposed_in` scratch area, runs
;           IDCT32X32_135 on them and advances `stp` by 16 * 8 bytes and
;           `inputq` by 8 coefficient rows.
;   Pass 2: four iterations (r6).  Each iteration transposes two 8x8 tiles of
;           the pass-one results (walked with r9), runs IDCT32X32_135 again
;           and advances `stp` / r9 by 16 * 32 bytes.
;   Finally RECON_AND_STORE is invoked on `pass_two_start`.
;
; Register roles visible in this function:
;   m8        = [pd_8192]; handed to the butterfly macros as their 5th
;               argument (presumably the rounding term - confirm against the
;               BUTTERFLY_4X definition)
;   r3 / r4   = source / destination cursors of the transpose loops
;   r6 / r7   = outer / inner loop counters
;   r9        = pass-two source cursor
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal idct32x32_135_add, 3, 11, 16, i32x32_size, input, output, stride
  mova            m8, [pd_8192]
  lea             stp, [rsp + pass_one_start]
  mov             r6, 2                       ; pass-one iterations

idct32x32_135:
  lea             r4, [rsp + transposed_in]
  mov             r3, inputq
  mov             r7, 2                       ; 8x8 tiles per iteration

idct32x32_135_transpose:
%if CONFIG_AOM_HIGHBITDEPTH
  ; Coefficients are dwords here: each row of eight is read as two 16-byte
  ; halves and narrowed to words with signed saturation.
  mova            m0, [r3 + 0]
  packssdw        m0, [r3 + 16]
  mova            m1, [r3 + 32 * 4]
  packssdw        m1, [r3 + 32 * 4 + 16]
  mova            m2, [r3 + 32 * 8]
  packssdw        m2, [r3 + 32 * 8 + 16]
  mova            m3, [r3 + 32 * 12]
  packssdw        m3, [r3 + 32 * 12 + 16]
  mova            m4, [r3 + 32 * 16]
  packssdw        m4, [r3 + 32 * 16 + 16]
  mova            m5, [r3 + 32 * 20]
  packssdw        m5, [r3 + 32 * 20 + 16]
  mova            m6, [r3 + 32 * 24]
  packssdw        m6, [r3 + 32 * 24 + 16]
  mova            m7, [r3 + 32 * 28]
  packssdw        m7, [r3 + 32 * 28 + 16]
%else
  ; Word coefficients: one 16-byte load per row, rows are 16 * 4 bytes apart.
  mova            m0, [r3 + 0]
  mova            m1, [r3 + 16 * 4]
  mova            m2, [r3 + 16 * 8]
  mova            m3, [r3 + 16 * 12]
  mova            m4, [r3 + 16 * 16]
  mova            m5, [r3 + 16 * 20]
  mova            m6, [r3 + 16 * 24]
  mova            m7, [r3 + 16 * 28]
%endif
  TRANSPOSE8X8    0, 1, 2, 3, 4, 5, 6, 7, 9

  mova            [r4 + 0], m0
  mova            [r4 + 16 * 1], m1
  mova            [r4 + 16 * 2], m2
  mova            [r4 + 16 * 3], m3
  mova            [r4 + 16 * 4], m4
  mova            [r4 + 16 * 5], m5
  mova            [r4 + 16 * 6], m6
  mova            [r4 + 16 * 7], m7

  ; Step to the next tile: eight coefficients to the right in the source,
  ; eight 16-byte slots further in the scratch area.
  add             r4, 16 * 8
%if CONFIG_AOM_HIGHBITDEPTH
  add             r3, 32
%else
  add             r3, 16
%endif
  dec             r7
  jnz             idct32x32_135_transpose

  IDCT32X32_135   16*0, 16*32, 16*64, 16*96
  lea             stp, [stp + 16 * 8]
%if CONFIG_AOM_HIGHBITDEPTH
  lea             inputq, [inputq + 32 * 32]
%else
  lea             inputq, [inputq + 16 * 32]
%endif
  dec             r6
  jnz             idct32x32_135

  ; ---- pass two: consume the pass-one results -------------------------------
  lea             r9, [rsp + pass_one_start]
  lea             stp, [rsp + pass_one_start]
  mov             r6, 4                       ; pass-two iterations

idct32x32_135_2:
  mov             r3, r9
  lea             r4, [rsp + transposed_in]
  mov             r7, 2                       ; 8x8 tiles per iteration

idct32x32_135_transpose_2:
  mova            m0, [r3 + 0]
  mova            m1, [r3 + 16 * 1]
  mova            m2, [r3 + 16 * 2]
  mova            m3, [r3 + 16 * 3]
  mova            m4, [r3 + 16 * 4]
  mova            m5, [r3 + 16 * 5]
  mova            m6, [r3 + 16 * 6]
  mova            m7, [r3 + 16 * 7]

  TRANSPOSE8X8    0, 1, 2, 3, 4, 5, 6, 7, 9

  mova            [r4 + 0], m0
  mova            [r4 + 16 * 1], m1
  mova            [r4 + 16 * 2], m2
  mova            [r4 + 16 * 3], m3
  mova            [r4 + 16 * 4], m4
  mova            [r4 + 16 * 5], m5
  mova            [r4 + 16 * 6], m6
  mova            [r4 + 16 * 7], m7

  add             r4, 16 * 8
  add             r3, 16 * 8
  dec             r7
  jnz             idct32x32_135_transpose_2

  IDCT32X32_135   16*0, 16*8, 16*16, 16*24

  lea             stp, [stp + 16 * 32]
  add             r9, 16 * 32
  dec             r6
  jnz             idct32x32_135_2

  RECON_AND_STORE pass_two_start

  RET
1331
;-----------------------------------------------------------------------------
; IDCT32X32_1024 off0, off1, off2, off3
;
; One 1-D 32-point inverse DCT over eight lanes, using all 32 inputs.
;
; Inputs : [rsp + transposed_in + 16 * k], k = 0..31 (one xmm row each)
;          m8 is forwarded to every BUTTERFLY_4X / BUTTERFLY_4Xmm as the
;          5th argument (the callers in this file load it with [pd_8192]).
; Outputs: [stp + %1 + idx0 .. idx7 ]
;          [stp + %2 + idx8 .. idx15]
;          [stp + %3 + idx16.. idx23]
;          [stp + %4 + idx24.. idx31]
;          The %2/%3/%4 areas also serve as spill space between blocks.
; Clobbers: m0-m7, m9-m15.
;
; The work is split into four blocks (A: 16-19/28-31, B: 20-27, C: 8-15,
; D: 0-7) followed by the final add/sub stages that combine them.
; The `%if 0` branches keep a shorter pmulhrsw-based variant that is
; disabled because SUM_SUB overflows on test streams.
;-----------------------------------------------------------------------------
%macro IDCT32X32_1024 4
  ; BLOCK A STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            m1, [rsp + transposed_in + 16 * 1]
  mova            m11, [rsp + transposed_in + 16 * 31]
  BUTTERFLY_4X    1, 11, 804, 16364, m8, 9, 10 ; stp1_16, stp1_31

  mova            m0, [rsp + transposed_in + 16 * 15]
  mova            m2, [rsp + transposed_in + 16 * 17]
  BUTTERFLY_4X    2, 0, 12140, 11003, m8, 9, 10 ; stp1_17, stp1_30

  mova            m7, [rsp + transposed_in + 16 * 7]
  mova            m12, [rsp + transposed_in + 16 * 25]
  BUTTERFLY_4X    12, 7, 15426, 5520, m8, 9, 10 ; stp1_19, stp1_28

  mova            m3, [rsp + transposed_in + 16 * 9]
  mova            m4, [rsp + transposed_in + 16 * 23]
  BUTTERFLY_4X    3, 4, 7005, 14811, m8, 9, 10 ; stp1_18, stp1_29

  ; BLOCK A STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         1, 2, 9 ; stp2_16, stp2_17
  SUM_SUB         12, 3, 9 ; stp2_19, stp2_18
  SUM_SUB         7, 4, 9 ; stp2_28, stp2_29
  SUM_SUB         11, 0, 9 ; stp2_31, stp2_30

  ; BLOCK A STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X    0, 2, 3196, 16069, m8, 9, 10 ; stp1_17, stp1_30
  BUTTERFLY_4Xmm  4, 3, 3196, 16069, m8, 9, 10 ; stp1_29, stp1_18

  ; BLOCK A STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         1, 12, 9 ; stp2_16, stp2_19
  SUM_SUB         0, 3, 9 ; stp2_17, stp2_18
  SUM_SUB         11, 7, 9 ; stp2_31, stp2_28
  SUM_SUB         2, 4, 9 ; stp2_30, stp2_29

  ; BLOCK A STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X    4, 3, 6270, 15137, m8, 9, 10 ; stp1_18, stp1_29
  BUTTERFLY_4X    7, 12, 6270, 15137, m8, 9, 10 ; stp1_19, stp1_28

  ; Spill block A; it is reloaded in block B stage 6.
  mova            [stp + %3 + idx16], m1
  mova            [stp + %3 + idx17], m0
  mova            [stp + %3 + idx18], m4
  mova            [stp + %3 + idx19], m7
  mova            [stp + %4 + idx28], m12
  mova            [stp + %4 + idx29], m3
  mova            [stp + %4 + idx30], m2
  mova            [stp + %4 + idx31], m11

  ; BLOCK B STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            m5, [rsp + transposed_in + 16 * 5]
  mova            m6, [rsp + transposed_in + 16 * 27]
  BUTTERFLY_4X    5, 6, 3981, 15893, m8, 9, 10 ; stp1_20, stp1_27

  mova            m13, [rsp + transposed_in + 16 * 21]
  mova            m14, [rsp + transposed_in + 16 * 11]
  BUTTERFLY_4X    13, 14, 14053, 8423, m8, 9, 10 ; stp1_21, stp1_26

  mova            m0, [rsp + transposed_in + 16 * 13]
  mova            m1, [rsp + transposed_in + 16 * 19]
  BUTTERFLY_4X    0, 1, 9760, 13160, m8, 9, 10 ; stp1_22, stp1_25

  mova            m2, [rsp + transposed_in + 16 * 3]
  mova            m3, [rsp + transposed_in + 16 * 29]
  BUTTERFLY_4X    3, 2, 16207, 2404, m8, 9, 10 ; stp1_23, stp1_24

  ; BLOCK B STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         5, 13, 9 ; stp2_20, stp2_21
  SUM_SUB         3, 0, 9 ; stp2_23, stp2_22
  SUM_SUB         2, 1, 9 ; stp2_24, stp2_25
  SUM_SUB         6, 14, 9 ; stp2_27, stp2_26

  ; BLOCK B STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X    14, 13, 13623, 9102, m8, 9, 10 ; stp1_21, stp1_26
  BUTTERFLY_4Xmm  1, 0, 13623, 9102, m8, 9, 10 ; stp1_25, stp1_22

  ; BLOCK B STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         3, 5, 9 ; stp2_23, stp2_20
  SUM_SUB         0, 14, 9 ; stp2_22, stp2_21
  SUM_SUB         2, 6, 9 ; stp2_24, stp2_27
  SUM_SUB         1, 13, 9 ; stp2_25, stp2_26

  ; BLOCK B STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4Xmm  6, 5, 6270, 15137, m8, 9, 10 ; stp1_27, stp1_20
  BUTTERFLY_4Xmm  13, 14, 6270, 15137, m8, 9, 10 ; stp1_26, stp1_21

  ; BLOCK B STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ; Reload the block A spill (16-19) and combine it with 20-23.
  mova            m4, [stp + %3 + idx16]
  mova            m7, [stp + %3 + idx17]
  mova            m11, [stp + %3 + idx18]
  mova            m12, [stp + %3 + idx19]
  SUM_SUB         4, 3, 9 ; stp2_16, stp2_23
  SUM_SUB         7, 0, 9 ; stp2_17, stp2_22
  SUM_SUB         11, 14, 9 ; stp2_18, stp2_21
  SUM_SUB         12, 5, 9 ; stp2_19, stp2_20
  mova            [stp + %3 + idx16], m4
  mova            [stp + %3 + idx17], m7
  mova            [stp + %3 + idx18], m11
  mova            [stp + %3 + idx19], m12

  ; Same for the block A spill (28-31) against 24-27.
  mova            m4, [stp + %4 + idx28]
  mova            m7, [stp + %4 + idx29]
  mova            m11, [stp + %4 + idx30]
  mova            m12, [stp + %4 + idx31]
  SUM_SUB         4, 6, 9 ; stp2_28, stp2_27
  SUM_SUB         7, 13, 9 ; stp2_29, stp2_26
  SUM_SUB         11, 1, 9 ; stp2_30, stp2_25
  SUM_SUB         12, 2, 9 ; stp2_31, stp2_24
  mova            [stp + %4 + idx28], m4
  mova            [stp + %4 + idx29], m7
  mova            [stp + %4 + idx30], m11
  mova            [stp + %4 + idx31], m12

  ; BLOCK B STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova            m10, [pw_11585x2]
  SUM_SUB         6, 5, 9
  pmulhrsw        m6, m10 ; stp1_27
  pmulhrsw        m5, m10 ; stp1_20
  SUM_SUB         13, 14, 9
  pmulhrsw        m13, m10 ; stp1_26
  pmulhrsw        m14, m10 ; stp1_21
  SUM_SUB         1, 0, 9
  pmulhrsw        m1, m10 ; stp1_25
  pmulhrsw        m0, m10 ; stp1_22
  SUM_SUB         2, 3, 9
  pmulhrsw        m2, m10 ; stp1_24
  pmulhrsw        m3, m10 ; stp1_23
%else
  ; Each SWAP puts the pair back into the registers the stores below expect.
  BUTTERFLY_4X    6, 5, 11585, 11585, m8, 9, 10 ; stp1_20, stp1_27
  SWAP            6, 5
  BUTTERFLY_4X    13, 14, 11585, 11585, m8, 9, 10 ; stp1_21, stp1_26
  SWAP            13, 14
  BUTTERFLY_4X    1, 0, 11585, 11585, m8, 9, 10 ; stp1_22, stp1_25
  SWAP            1, 0
  BUTTERFLY_4X    2, 3, 11585, 11585, m8, 9, 10 ; stp1_23, stp1_24
  SWAP            2, 3
%endif
  mova            [stp + %3 + idx20], m5
  mova            [stp + %3 + idx21], m14
  mova            [stp + %3 + idx22], m0
  mova            [stp + %3 + idx23], m3
  mova            [stp + %4 + idx24], m2
  mova            [stp + %4 + idx25], m1
  mova            [stp + %4 + idx26], m13
  mova            [stp + %4 + idx27], m6

  ; BLOCK C STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK C STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            m0, [rsp + transposed_in + 16 * 2]
  mova            m1, [rsp + transposed_in + 16 * 30]
  BUTTERFLY_4X    0, 1, 1606, 16305, m8, 9, 10 ; stp1_8, stp1_15

  mova            m2, [rsp + transposed_in + 16 * 14]
  mova            m3, [rsp + transposed_in + 16 * 18]
  BUTTERFLY_4X    3, 2, 12665, 10394, m8, 9, 10 ; stp1_9, stp1_14

  mova            m4, [rsp + transposed_in + 16 * 10]
  mova            m5, [rsp + transposed_in + 16 * 22]
  BUTTERFLY_4X    4, 5, 7723, 14449, m8, 9, 10 ; stp1_10, stp1_13

  mova            m6, [rsp + transposed_in + 16 * 6]
  mova            m7, [rsp + transposed_in + 16 * 26]
  BUTTERFLY_4X    7, 6, 15679, 4756, m8, 9, 10 ; stp1_11, stp1_12

  ; BLOCK C STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         0, 3, 9 ; stp1_8, stp1_9
  SUM_SUB         7, 4, 9 ; stp1_11, stp1_10
  SUM_SUB         6, 5, 9 ; stp1_12, stp1_13
  SUM_SUB         1, 2, 9 ; stp1_15, stp1_14

  ; BLOCK C STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  BUTTERFLY_4X    2, 3, 6270, 15137, m8, 9, 10 ; stp1_9, stp1_14
  BUTTERFLY_4Xmm  5, 4, 6270, 15137, m8, 9, 10 ; stp1_13, stp1_10

  ; BLOCK C STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         0, 7, 9 ; stp1_8, stp1_11
  SUM_SUB         2, 4, 9 ; stp1_9, stp1_10
  SUM_SUB         1, 6, 9 ; stp1_15, stp1_12
  SUM_SUB         3, 5, 9 ; stp1_14, stp1_13

  ; BLOCK C STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova            m10, [pw_11585x2]
  SUM_SUB         5, 4, 9
  pmulhrsw        m5, m10 ; stp1_13
  pmulhrsw        m4, m10 ; stp1_10
  SUM_SUB         6, 7, 9
  pmulhrsw        m6, m10 ; stp1_12
  pmulhrsw        m7, m10 ; stp1_11
%else
  BUTTERFLY_4X    5, 4, 11585, 11585, m8, 9, 10 ; stp1_10, stp1_13
  SWAP            5, 4
  BUTTERFLY_4X    6, 7, 11585, 11585, m8, 9, 10 ; stp1_11, stp1_12
  SWAP            6, 7
%endif
  ; BLOCK C STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            [stp + %2 + idx8], m0
  mova            [stp + %2 + idx9], m2
  mova            [stp + %2 + idx10], m4
  mova            [stp + %2 + idx11], m7
  mova            [stp + %2 + idx12], m6
  mova            [stp + %2 + idx13], m5
  mova            [stp + %2 + idx14], m3
  mova            [stp + %2 + idx15], m1

  ; BLOCK D STAGE 1 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 2 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  ;
  ; BLOCK D STAGE 3 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            m11, [rsp + transposed_in + 16 * 4]
  mova            m12, [rsp + transposed_in + 16 * 28]
  BUTTERFLY_4X    11, 12, 3196, 16069, m8, 9, 10 ; stp1_4, stp1_7

  mova            m13, [rsp + transposed_in + 16 * 12]
  mova            m14, [rsp + transposed_in + 16 * 20]
  BUTTERFLY_4X    14, 13, 13623, 9102, m8, 9, 10 ; stp1_5, stp1_6

  ; BLOCK D STAGE 4 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            m0, [rsp + transposed_in + 16 * 0]
  mova            m1, [rsp + transposed_in + 16 * 16]

%if 0 ; overflow occurs in SUM_SUB when using test streams
  mova            m10, [pw_11585x2]
  SUM_SUB         0, 1, 9
  pmulhrsw        m0, m10 ; stp1_1
  pmulhrsw        m1, m10 ; stp1_0
%else
  BUTTERFLY_4X    0, 1, 11585, 11585, m8, 9, 10 ; stp1_1, stp1_0
  SWAP            0, 1
%endif
  mova            m2, [rsp + transposed_in + 16 * 8]
  mova            m3, [rsp + transposed_in + 16 * 24]
  BUTTERFLY_4X    2, 3, 6270, 15137, m8, 9, 10 ; stp1_2, stp1_3

  SUM_SUB         11, 14, 9 ; stp1_4, stp1_5
  SUM_SUB         12, 13, 9 ; stp1_7, stp1_6

  ; BLOCK D STAGE 5 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%if 0 ; overflow occurs in SUM_SUB when using test streams
  ; The constant is only read by the pmulhrsw variant, so it is loaded here
  ; rather than unconditionally; the active path passes 10 to BUTTERFLY_4X
  ; as a scratch register.
  mova            m10, [pw_11585x2]
  SUM_SUB         13, 14, 9
  pmulhrsw        m13, m10 ; stp1_6
  pmulhrsw        m14, m10 ; stp1_5
%else
  BUTTERFLY_4X    13, 14, 11585, 11585, m8, 9, 10 ; stp1_5, stp1_6
  SWAP            13, 14
%endif
  SUM_SUB         0, 3, 9 ; stp1_0, stp1_3
  SUM_SUB         1, 2, 9 ; stp1_1, stp1_2

  ; BLOCK D STAGE 6 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  SUM_SUB         0, 12, 9 ; stp1_0, stp1_7
  SUM_SUB         1, 13, 9 ; stp1_1, stp1_6
  SUM_SUB         2, 14, 9 ; stp1_2, stp1_5
  SUM_SUB         3, 11, 9 ; stp1_3, stp1_4

  ; BLOCK D STAGE 7 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
  mova            m4, [stp + %2 + idx12]
  mova            m5, [stp + %2 + idx13]
  mova            m6, [stp + %2 + idx14]
  mova            m7, [stp + %2 + idx15]
  SUM_SUB         0, 7, 9 ; stp1_0, stp1_15
  SUM_SUB         1, 6, 9 ; stp1_1, stp1_14
  SUM_SUB         2, 5, 9 ; stp1_2, stp1_13
  SUM_SUB         3, 4, 9 ; stp1_3, stp1_12

  ; 0-3, 28-31 final stage
  mova            m10, [stp + %4 + idx31]
  mova            m15, [stp + %4 + idx30]
  SUM_SUB         0, 10, 9 ; stp1_0, stp1_31
  SUM_SUB         1, 15, 9 ; stp1_1, stp1_30
  mova            [stp + %1 + idx0], m0
  mova            [stp + %1 + idx1], m1
  mova            [stp + %4 + idx31], m10
  mova            [stp + %4 + idx30], m15
  mova            m0, [stp + %4 + idx29]
  mova            m1, [stp + %4 + idx28]
  SUM_SUB         2, 0, 9 ; stp1_2, stp1_29
  SUM_SUB         3, 1, 9 ; stp1_3, stp1_28
  mova            [stp + %1 + idx2], m2
  mova            [stp + %1 + idx3], m3
  mova            [stp + %4 + idx29], m0
  mova            [stp + %4 + idx28], m1

  ; 12-15, 16-19 final stage
  mova            m0, [stp + %3 + idx16]
  mova            m1, [stp + %3 + idx17]
  mova            m2, [stp + %3 + idx18]
  mova            m3, [stp + %3 + idx19]
  SUM_SUB         7, 0, 9 ; stp1_15, stp1_16
  SUM_SUB         6, 1, 9 ; stp1_14, stp1_17
  SUM_SUB         5, 2, 9 ; stp1_13, stp1_18
  SUM_SUB         4, 3, 9 ; stp1_12, stp1_19
  mova            [stp + %2 + idx12], m4
  mova            [stp + %2 + idx13], m5
  mova            [stp + %2 + idx14], m6
  mova            [stp + %2 + idx15], m7
  mova            [stp + %3 + idx16], m0
  mova            [stp + %3 + idx17], m1
  mova            [stp + %3 + idx18], m2
  mova            [stp + %3 + idx19], m3

  mova            m4, [stp + %2 + idx8]
  mova            m5, [stp + %2 + idx9]
  mova            m6, [stp + %2 + idx10]
  mova            m7, [stp + %2 + idx11]
  SUM_SUB         11, 7, 9 ; stp1_4, stp1_11
  SUM_SUB         14, 6, 9 ; stp1_5, stp1_10
  SUM_SUB         13, 5, 9 ; stp1_6, stp1_9
  SUM_SUB         12, 4, 9 ; stp1_7, stp1_8

  ; 4-7, 24-27 final stage
  mova            m3, [stp + %4 + idx24]
  mova            m2, [stp + %4 + idx25]
  mova            m1, [stp + %4 + idx26]
  mova            m0, [stp + %4 + idx27]
  SUM_SUB         12, 3, 9 ; stp1_7, stp1_24
  SUM_SUB         13, 2, 9 ; stp1_6, stp1_25
  SUM_SUB         14, 1, 9 ; stp1_5, stp1_26
  SUM_SUB         11, 0, 9 ; stp1_4, stp1_27
  mova            [stp + %4 + idx24], m3
  mova            [stp + %4 + idx25], m2
  mova            [stp + %4 + idx26], m1
  mova            [stp + %4 + idx27], m0
  mova            [stp + %1 + idx4], m11
  mova            [stp + %1 + idx5], m14
  mova            [stp + %1 + idx6], m13
  mova            [stp + %1 + idx7], m12

  ; 8-11, 20-23 final stage
  mova            m0, [stp + %3 + idx20]
  mova            m1, [stp + %3 + idx21]
  mova            m2, [stp + %3 + idx22]
  mova            m3, [stp + %3 + idx23]
  SUM_SUB         7, 0, 9 ; stp1_11, stp1_20
  SUM_SUB         6, 1, 9 ; stp1_10, stp1_21
  SUM_SUB         5, 2, 9 ; stp1_9, stp1_22
  SUM_SUB         4, 3, 9 ; stp1_8, stp1_23
  mova            [stp + %2 + idx8], m4
  mova            [stp + %2 + idx9], m5
  mova            [stp + %2 + idx10], m6
  mova            [stp + %2 + idx11], m7
  mova            [stp + %3 + idx20], m0
  mova            [stp + %3 + idx21], m1
  mova            [stp + %3 + idx22], m2
  mova            [stp + %3 + idx23], m3
%endmacro
1680
;-----------------------------------------------------------------------------
; idct32x32_1024_add(input, output, stride)                     [SSSE3, x86-64]
;
; Two-pass 32x32 inverse transform built on the IDCT32X32_1024 macro.
;
;   Pass 1: four iterations (r6).  Each iteration transposes four 8x8 tiles of
;           coefficients into the `transposed_in` scratch area, runs
;           IDCT32X32_1024 on them and advances `stp` by 16 * 8 bytes and
;           `inputq` by 8 coefficient rows.
;   Pass 2: four iterations (r6).  Each iteration transposes four 8x8 tiles of
;           the pass-one results (walked with r9), runs IDCT32X32_1024 again
;           and advances `stp` / r9 by 16 * 32 bytes.
;   Finally RECON_AND_STORE is invoked on `pass_two_start`.
;
; Register roles visible in this function:
;   m8        = [pd_8192]; handed to the butterfly macros as their 5th
;               argument (presumably the rounding term - confirm against the
;               BUTTERFLY_4X definition)
;   r3 / r4   = source / destination cursors of the transpose loops
;   r6 / r7   = outer / inner loop counters
;   r9        = pass-two source cursor
;-----------------------------------------------------------------------------
INIT_XMM ssse3
cglobal idct32x32_1024_add, 3, 11, 16, i32x32_size, input, output, stride
  mova            m8, [pd_8192]
  lea             stp, [rsp + pass_one_start]
  mov             r6, 4                       ; pass-one iterations

idct32x32_1024:
  lea             r4, [rsp + transposed_in]
  mov             r3, inputq
  mov             r7, 4                       ; 8x8 tiles per iteration

idct32x32_1024_transpose:
%if CONFIG_AOM_HIGHBITDEPTH
  ; Coefficients are dwords here: each row of eight is read as two 16-byte
  ; halves and narrowed to words with signed saturation.
  mova            m0, [r3 + 0]
  packssdw        m0, [r3 + 16]
  mova            m1, [r3 + 32 * 4]
  packssdw        m1, [r3 + 32 * 4 + 16]
  mova            m2, [r3 + 32 * 8]
  packssdw        m2, [r3 + 32 * 8 + 16]
  mova            m3, [r3 + 32 * 12]
  packssdw        m3, [r3 + 32 * 12 + 16]
  mova            m4, [r3 + 32 * 16]
  packssdw        m4, [r3 + 32 * 16 + 16]
  mova            m5, [r3 + 32 * 20]
  packssdw        m5, [r3 + 32 * 20 + 16]
  mova            m6, [r3 + 32 * 24]
  packssdw        m6, [r3 + 32 * 24 + 16]
  mova            m7, [r3 + 32 * 28]
  packssdw        m7, [r3 + 32 * 28 + 16]
%else
  ; Word coefficients: one 16-byte load per row, rows are 16 * 4 bytes apart.
  mova            m0, [r3 + 0]
  mova            m1, [r3 + 16 * 4]
  mova            m2, [r3 + 16 * 8]
  mova            m3, [r3 + 16 * 12]
  mova            m4, [r3 + 16 * 16]
  mova            m5, [r3 + 16 * 20]
  mova            m6, [r3 + 16 * 24]
  mova            m7, [r3 + 16 * 28]
%endif

  TRANSPOSE8X8    0, 1, 2, 3, 4, 5, 6, 7, 9

  mova            [r4 + 0], m0
  mova            [r4 + 16 * 1], m1
  mova            [r4 + 16 * 2], m2
  mova            [r4 + 16 * 3], m3
  mova            [r4 + 16 * 4], m4
  mova            [r4 + 16 * 5], m5
  mova            [r4 + 16 * 6], m6
  mova            [r4 + 16 * 7], m7

  ; Step to the next tile: eight coefficients to the right in the source,
  ; eight 16-byte slots further in the scratch area.
  add             r4, 16 * 8
%if CONFIG_AOM_HIGHBITDEPTH
  add             r3, 32
%else
  add             r3, 16
%endif
  dec             r7
  jnz             idct32x32_1024_transpose

  IDCT32X32_1024  16*0, 16*32, 16*64, 16*96

  lea             stp, [stp + 16 * 8]
%if CONFIG_AOM_HIGHBITDEPTH
  lea             inputq, [inputq + 32 * 32]
%else
  lea             inputq, [inputq + 16 * 32]
%endif
  dec             r6
  jnz             idct32x32_1024

  ; ---- pass two: consume the pass-one results -------------------------------
  lea             r9, [rsp + pass_one_start]
  lea             stp, [rsp + pass_one_start]
  mov             r6, 4                       ; pass-two iterations

idct32x32_1024_2:
  mov             r3, r9
  lea             r4, [rsp + transposed_in]
  mov             r7, 4                       ; 8x8 tiles per iteration

idct32x32_1024_transpose_2:
  mova            m0, [r3 + 0]
  mova            m1, [r3 + 16 * 1]
  mova            m2, [r3 + 16 * 2]
  mova            m3, [r3 + 16 * 3]
  mova            m4, [r3 + 16 * 4]
  mova            m5, [r3 + 16 * 5]
  mova            m6, [r3 + 16 * 6]
  mova            m7, [r3 + 16 * 7]

  TRANSPOSE8X8    0, 1, 2, 3, 4, 5, 6, 7, 9

  mova            [r4 + 0], m0
  mova            [r4 + 16 * 1], m1
  mova            [r4 + 16 * 2], m2
  mova            [r4 + 16 * 3], m3
  mova            [r4 + 16 * 4], m4
  mova            [r4 + 16 * 5], m5
  mova            [r4 + 16 * 6], m6
  mova            [r4 + 16 * 7], m7

  add             r4, 16 * 8
  add             r3, 16 * 8
  dec             r7
  jnz             idct32x32_1024_transpose_2

  IDCT32X32_1024  16*0, 16*8, 16*16, 16*24

  lea             stp, [stp + 16 * 32]
  add             r9, 16 * 32
  dec             r6
  jnz             idct32x32_1024_2

  RECON_AND_STORE pass_two_start

  RET
Jingning Han52ae97b2014-05-01 18:34:46 -07001796%endif