;
; Copyright (c) 2016, Alliance for Open Media. All rights reserved
;
; This source code is subject to the terms of the BSD 2 Clause License and
; the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
; was not distributed with this source code in the LICENSE file, you can
; obtain it at www.aomedia.org/license/software. If the Alliance for Open
; Media Patent License 1.0 was not distributed with this source code in the
; PATENTS file, you can obtain it at www.aomedia.org/license/patent.
;

;

;TODO(cd): adjust these constants to be able to use vqdmulh for faster
; dct_const_round_shift(a * b) within butterfly calculations.
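; For reference, dct_const_round_shift() in the C reference code is a
; round-and-shift by DCT_CONST_BITS = 14 fractional bits, which the
; vqrshrn.s32 #14 instructions below implement directly. A C sketch:
;   int16_t dct_const_round_shift(int32_t input) {
;     return (int16_t)((input + (1 << 13)) >> 14);
;   }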
cospi_1_64  EQU 16364
cospi_2_64  EQU 16305
cospi_3_64  EQU 16207
cospi_4_64  EQU 16069
cospi_5_64  EQU 15893
cospi_6_64  EQU 15679
cospi_7_64  EQU 15426
cospi_8_64  EQU 15137
cospi_9_64  EQU 14811
cospi_10_64 EQU 14449
cospi_11_64 EQU 14053
cospi_12_64 EQU 13623
cospi_13_64 EQU 13160
cospi_14_64 EQU 12665
cospi_15_64 EQU 12140
cospi_16_64 EQU 11585
cospi_17_64 EQU 11003
cospi_18_64 EQU 10394
cospi_19_64 EQU  9760
cospi_20_64 EQU  9102
cospi_21_64 EQU  8423
cospi_22_64 EQU  7723
cospi_23_64 EQU  7005
cospi_24_64 EQU  6270
cospi_25_64 EQU  5520
cospi_26_64 EQU  4756
cospi_27_64 EQU  3981
cospi_28_64 EQU  3196
cospi_29_64 EQU  2404
cospi_30_64 EQU  1606
cospi_31_64 EQU   804
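
; These constants are the DCT cosines scaled to 14 fractional bits, i.e.
; (assuming the usual DCT constant definition):
;   cospi_N_64 = ROUND(cos(N * PI / 64) * (1 << 14))
; e.g. cospi_16_64 = ROUND(0.70710678 * 16384) = 11585.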


    EXPORT  |aom_idct32x32_1024_add_neon|
    ARM
    REQUIRE8
    PRESERVE8

    AREA ||.text||, CODE, READONLY, ALIGN=2

    AREA Block, CODE, READONLY

; --------------------------------------------------------------------------
; Load from transposed_buffer
;   q14 = transposed_buffer[first_offset]
;   q13 = transposed_buffer[second_offset]
; for proper address calculation, the last offset used when manipulating
; transposed_buffer must be passed in. use 0 for first use.
    MACRO
    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
    ; address calculation with proper stride and loading
    add r0, #($first_offset - $prev_offset )*8*2
    vld1.s16 {q14}, [r0]
    add r0, #($second_offset - $first_offset)*8*2
    vld1.s16 {q13}, [r0]
    ; (used) two registers (q14, q13)
    MEND
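; example (an illustrative trace, not extra code): LOAD_FROM_TRANSPOSED 0, 1, 31
; first advances r0 by (1 - 0)*8*2 = 16 bytes (one row of eight int16_t)
; before loading q14, then by (31 - 1)*8*2 = 480 bytes before loading q13.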
; --------------------------------------------------------------------------
; Load from output (used as temporary storage)
;   reg1 = output[first_offset]
;   reg2 = output[second_offset]
; for proper address calculation, the last offset used when manipulating
; output (whether reading or storing) must be passed in. use 0 for first
; use.
    MACRO
    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and loading
    add r1, #($first_offset - $prev_offset )*32*2
    vld1.s16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vld1.s16 {$reg2}, [r1]
    ; (used) two registers ($reg1, $reg2)
    MEND
; --------------------------------------------------------------------------
; Store into output (sometimes as temporary storage)
;   output[first_offset] = reg1
;   output[second_offset] = reg2
; for proper address calculation, the last offset used when manipulating
; output (whether reading or storing) must be passed in. use 0 for first
; use.
    MACRO
    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
    ; address calculation with proper stride and storing
    add r1, #($first_offset - $prev_offset )*32*2
    vst1.16 {$reg1}, [r1]
    add r1, #($second_offset - $first_offset)*32*2
    vst1.16 {$reg2}, [r1]
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]
    vst1.16 {d11}, [r9]
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
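; For reference, the ROUND_POWER_OF_TWO step above matches the C macro
; (a sketch, with n = 6 as performed by vrshr.s16 #6):
;   #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))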
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q6-q9 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_CENTER_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d8}, [r10], r2
    vld1.s16 {d11}, [r9], r11
    vld1.s16 {d9}, [r10]
    vld1.s16 {d10}, [r9]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q7, q7, #6
    vrshr.s16 q8, q8, #6
    vrshr.s16 q9, q9, #6
    vrshr.s16 q6, q6, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q7, q7, d9
    vaddw.u8 q8, q8, d10
    vaddw.u8 q9, q9, d11
    vaddw.u8 q6, q6, d8
    ; clip pixel
    vqmovun.s16 d9, q7
    vqmovun.s16 d10, q8
    vqmovun.s16 d11, q9
    vqmovun.s16 d8, q6
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d9}, [r10], r11
    vst1.16 {d10}, [r9], r2
    vst1.16 {d8}, [r10]!
    vst1.16 {d11}, [r9]!
    ; update pointers (by dest_stride * 2)
    sub r9, r9, r2, lsl #1
    add r10, r10, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]
    vst1.16 {d4}, [r7]
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Combine-add results with current destination content
;   q4-q7 contain the results (out[j * 32 + 0-31])
    MACRO
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; load dest[j * dest_stride + 0-31]
    vld1.s16 {d4}, [r7], r2
    vld1.s16 {d7}, [r6], r11
    vld1.s16 {d5}, [r7]
    vld1.s16 {d6}, [r6]
    ; ROUND_POWER_OF_TWO
    vrshr.s16 q5, q5, #6
    vrshr.s16 q6, q6, #6
    vrshr.s16 q7, q7, #6
    vrshr.s16 q4, q4, #6
    ; add to dest[j * dest_stride + 0-31]
    vaddw.u8 q5, q5, d5
    vaddw.u8 q6, q6, d6
    vaddw.u8 q7, q7, d7
    vaddw.u8 q4, q4, d4
    ; clip pixel
    vqmovun.s16 d5, q5
    vqmovun.s16 d6, q6
    vqmovun.s16 d7, q7
    vqmovun.s16 d4, q4
    ; store back into dest[j * dest_stride + 0-31]
    vst1.16 {d5}, [r7], r11
    vst1.16 {d6}, [r6], r2
    vst1.16 {d7}, [r6]!
    vst1.16 {d4}, [r7]!
    ; update pointers (by dest_stride * 2)
    sub r6, r6, r2, lsl #1
    add r7, r7, r2, lsl #1
    MEND
; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    ; TODO(cd): have special case to re-use constants when they are similar for
    ; consecutive butterflies
    ; TODO(cd): have special case when both constants are the same, do the
    ; additions/subtractions before the multiplies.
    ; generate the constants
    ; generate scalar constants
    mov r8, #$first_constant & 0xFF00
    mov r12, #$second_constant & 0xFF00
    add r8, #$first_constant & 0x00FF
    add r12, #$second_constant & 0x00FF
    ; generate vector constants
    vdup.16 d30, r8
    vdup.16 d31, r12
    ; (used) two for inputs (regA-regD), one for constants (q15)
    ; do some multiplications (ordered for maximum latency hiding)
    vmull.s16 q8, $regC, d30
    vmull.s16 q10, $regA, d31
    vmull.s16 q9, $regD, d30
    vmull.s16 q11, $regB, d31
    vmull.s16 q12, $regC, d31
    ; (used) five for intermediate (q8-q12), one for constants (q15)
    ; do some additions/subtractions (to get back to two registers)
    vsub.s32 q8, q8, q10
    vsub.s32 q9, q9, q11
    ; do more multiplications (ordered for maximum latency hiding)
    vmull.s16 q10, $regD, d31
    vmull.s16 q11, $regA, d30
    vmull.s16 q15, $regB, d30
    ; (used) six for intermediate (q8-q12, q15)
    ; do more additions/subtractions
    vadd.s32 q11, q12, q11
    vadd.s32 q10, q10, q15
    ; (used) four for intermediate (q8-q11)
    ; dct_const_round_shift
    vqrshrn.s32 $reg1, q8, #14
    vqrshrn.s32 $reg2, q9, #14
    vqrshrn.s32 $reg3, q11, #14
    vqrshrn.s32 $reg4, q10, #14
    ; (used) two q registers for results (i.e., four d registers)
    MEND
; --------------------------------------------------------------------------
; Touches q8-q12, q15 (q13-q14 are preserved)
; valid output registers are anything but q8-q11
    MACRO
    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
    MEND
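; In C terms, one DO_BUTTERFLY invocation computes (a sketch; regC:regD and
; regA:regB are the d-register halves of the two input vectors, c1/c2 the
; two constants):
;   reg1:reg2 = dct_const_round_shift(regCD * c1 - regAB * c2);
;   reg3:reg4 = dct_const_round_shift(regAB * c1 + regCD * c2);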
; --------------------------------------------------------------------------

;void aom_idct32x32_1024_add_neon(int16_t *input, uint8_t *dest, int dest_stride);
;
; r0  int16_t *input,
; r1  uint8_t *dest,
; r2  int dest_stride)
; loop counters
; r4  bands loop counter
; r5  pass loop counter
; r8  transpose loop counter
; combine-add pointers
; r6  dest + 31 * dest_stride, descending (30, 29, 28, ...)
; r7  dest +  0 * dest_stride, ascending (1, 2, 3, ...)
; r9  dest + 15 * dest_stride, descending (14, 13, 12, ...)
; r10 dest + 16 * dest_stride, ascending (17, 18, 19, ...)

|aom_idct32x32_1024_add_neon| PROC
    ; This function does one pass of idct32x32 transform.
    ;
    ; This is done by transposing the input and then doing a 1d transform on
    ; columns. In the first pass, the transposed columns are the original
    ; rows. In the second pass, after the transposition, the columns are the
    ; original columns.
    ; The 1d transform is done by looping over bands of eight columns (the
    ; idct32_bands loop). For each band, the transform input transposition
    ; is done on demand, one band of four 8x8 matrices at a time. The four
    ; matrices are transposed by pairs (the idct32_transpose_pair loop).
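    ; In rough C pseudocode, the control flow below is (a sketch, not the
    ; exact code):
    ;   for (pass = 0; pass < 2; pass++)        // idct32_pass_loop
    ;     for (band = 0; band < 4; band++) {    // idct32_bands_loop
    ;       for (pair = 0; pair < 2; pair++)    // idct32_transpose_pair_loop
    ;         transpose two 8x8 blocks of the band into transpose_buffer;
    ;       1d idct32 on the band; pass 0 stores to pass1[],
    ;       pass 1 combine-adds into dest;
    ;     }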
    push {r4-r11}
    vpush {d8-d15}
    ; stack operation
    ; internal buffer used to transpose 8 lines into before transforming them
    ; int16_t transpose_buffer[32 * 8];
    ; at sp + [4096, 4607]
    ; results of the first pass (transpose and transform rows)
    ; int16_t pass1[32 * 32];
    ; at sp + [0, 2047]
    ; results of the second pass (transpose and transform columns)
    ; int16_t pass2[32 * 32];
    ; at sp + [2048, 4095]
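    ; Equivalently, in C (a sketch of this frame layout):
    ;   int16_t pass1[32 * 32];            // at sp + 0
    ;   int16_t pass2[32 * 32];            // at sp + 2048
    ;   int16_t transpose_buffer[32 * 8];  // at sp + 4096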
    sub sp, sp, #512+2048+2048

    ; r6  = dest + 31 * dest_stride
    ; r7  = dest +  0 * dest_stride
    ; r9  = dest + 15 * dest_stride
    ; r10 = dest + 16 * dest_stride
    rsb r6, r2, r2, lsl #5
    rsb r9, r2, r2, lsl #4
    add r10, r1, r2, lsl #4
    mov r7, r1
    add r6, r6, r1
    add r9, r9, r1
    ; r11 = -dest_stride
    neg r11, r2
    ; r3 = input
    mov r3, r0
    ; parameters for first pass
    ; r0 = transpose_buffer[32 * 8]
    add r0, sp, #4096
    ; r1 = pass1[32 * 32]
    mov r1, sp

    mov r5, #0          ; initialize pass loop counter
idct32_pass_loop
    mov r4, #4          ; initialize bands loop counter
idct32_bands_loop
    mov r8, #2          ; initialize transpose loop counter
idct32_transpose_pair_loop
    ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
    ; into q0-q7 and the second one into q8-q15. There is a stride of 64,
    ; adjusted to 32 because of the two post-increments.
    vld1.s16 {q8}, [r3]!
    vld1.s16 {q0}, [r3]!
    add r3, #32
    vld1.s16 {q9}, [r3]!
    vld1.s16 {q1}, [r3]!
    add r3, #32
    vld1.s16 {q10}, [r3]!
    vld1.s16 {q2}, [r3]!
    add r3, #32
    vld1.s16 {q11}, [r3]!
    vld1.s16 {q3}, [r3]!
    add r3, #32
    vld1.s16 {q12}, [r3]!
    vld1.s16 {q4}, [r3]!
    add r3, #32
    vld1.s16 {q13}, [r3]!
    vld1.s16 {q5}, [r3]!
    add r3, #32
    vld1.s16 {q14}, [r3]!
    vld1.s16 {q6}, [r3]!
    add r3, #32
    vld1.s16 {q15}, [r3]!
    vld1.s16 {q7}, [r3]!

    ; Transpose the two 8x8 16bit data matrices.
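    ; The transpose works at three granularities: vswp exchanges the
    ; off-diagonal 4x4 sub-blocks (64-bit d registers), vtrn.32 then
    ; transposes 2x2 blocks of 32-bit pairs, and vtrn.16 finally swaps
    ; the remaining neighboring 16-bit elements.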
    vswp d17, d24
    vswp d23, d30
    vswp d21, d28
    vswp d19, d26
    vswp d1, d8
    vswp d7, d14
    vswp d5, d12
    vswp d3, d10
    vtrn.32 q8, q10
    vtrn.32 q9, q11
    vtrn.32 q12, q14
    vtrn.32 q13, q15
    vtrn.32 q0, q2
    vtrn.32 q1, q3
    vtrn.32 q4, q6
    vtrn.32 q5, q7
    vtrn.16 q8, q9
    vtrn.16 q10, q11
    vtrn.16 q12, q13
    vtrn.16 q14, q15
    vtrn.16 q0, q1
    vtrn.16 q2, q3
    vtrn.16 q4, q5
    vtrn.16 q6, q7

    ; Store both matrices after each other. There is a stride of 32, which
    ; adjusts to nothing because of the post-increments.
    vst1.16 {q8}, [r0]!
    vst1.16 {q9}, [r0]!
    vst1.16 {q10}, [r0]!
    vst1.16 {q11}, [r0]!
    vst1.16 {q12}, [r0]!
    vst1.16 {q13}, [r0]!
    vst1.16 {q14}, [r0]!
    vst1.16 {q15}, [r0]!
    vst1.16 {q0}, [r0]!
    vst1.16 {q1}, [r0]!
    vst1.16 {q2}, [r0]!
    vst1.16 {q3}, [r0]!
    vst1.16 {q4}, [r0]!
    vst1.16 {q5}, [r0]!
    vst1.16 {q6}, [r0]!
    vst1.16 {q7}, [r0]!

    ; increment pointers by adjusted stride (not necessary for r0/out)
    ; go back by 7*32*2 for the seven lines advanced fully by the reads and adds
    ; go back by 32 for the eighth line, which was only read
    ; advance by 16*2 to go to the next pair
    sub r3, r3, #7*32*2 + 32 - 16*2
    ; transpose pair loop processing
    subs r8, r8, #1
    bne idct32_transpose_pair_loop

    ; restore r0/input to its original value
    sub r0, r0, #32*8*2

    ; Instead of doing the transforms stage by stage, it is done by loading
    ; some input values and doing as many stages as possible to minimize the
    ; storing/loading of intermediate results. To fit within registers, the
    ; final coefficients are cut into four blocks:
    ; BLOCK A: 16-19,28-31
    ; BLOCK B: 20-23,24-27
    ; BLOCK C: 8-10,11-15
    ; BLOCK D: 0-3,4-7
    ; Blocks A and C are straight calculation through the various stages. In
    ; block B, further calculations are performed using the results from
    ; block A. In block D, further calculations are performed using the results
    ; from block C, and then the final calculations are done using the results
    ; from blocks A and B, which have been combined at the end of block B.
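    ; Schematically (a sketch of the data flow between blocks):
    ;   A -> B   (block B consumes block A's results)
    ;   C -> D   (block D consumes block C's results, then the combined
    ;             results of A and B)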

    ; --------------------------------------------------------------------------
    ; BLOCK A: 16-19,28-31
    ; --------------------------------------------------------------------------
    ; generate 16,17,30,31
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
    ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
    ;step1b[16][i] = dct_const_round_shift(temp1);
    ;step1b[31][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 0, 1, 31
    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
    ;step1b[17][i] = dct_const_round_shift(temp1);
    ;step1b[30][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 31, 17, 15
    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[16] = step1b[16][i] + step1b[17][i];
    ;step2[17] = step1b[16][i] - step1b[17][i];
    ;step2[30] = -step1b[30][i] + step1b[31][i];
    ;step2[31] = step1b[30][i] + step1b[31][i];
    vadd.s16 q4, q0, q1
    vsub.s16 q13, q0, q1
    vadd.s16 q6, q2, q3
    vsub.s16 q14, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
    ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
    ;step3[17] = dct_const_round_shift(temp1);
    ;step3[30] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; generate 18,19,28,29
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
    ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
    ;step1b[18][i] = dct_const_round_shift(temp1);
    ;step1b[29][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 15, 9, 23
    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
    ;step1b[19][i] = dct_const_round_shift(temp1);
    ;step1b[28][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 23, 25, 7
    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[18] = -step1b[18][i] + step1b[19][i];
    ;step2[19] = step1b[18][i] + step1b[19][i];
    ;step2[28] = step1b[28][i] + step1b[29][i];
    ;step2[29] = step1b[28][i] - step1b[29][i];
    vsub.s16 q13, q3, q2
    vadd.s16 q3, q3, q2
    vsub.s16 q14, q1, q0
    vadd.s16 q2, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
    ;step3[29] = dct_const_round_shift(temp1);
    ;step3[18] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
    ; --------------------------------------------------------------------------
    ; combine 16-19,28-31
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[16] = step1b[16][i] + step1b[19][i];
    ;step1[17] = step1b[17][i] + step1b[18][i];
    ;step1[18] = step1b[17][i] - step1b[18][i];
    ;step1[29] = step1b[30][i] - step1b[29][i];
    ;step1[30] = step1b[30][i] + step1b[29][i];
    ;step1[31] = step1b[31][i] + step1b[28][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q0
    vadd.s16 q10, q7, q1
    vadd.s16 q15, q6, q3
    vsub.s16 q13, q5, q0
    vsub.s16 q14, q7, q1
    STORE_IN_OUTPUT 0, 16, 31, q8, q15
    STORE_IN_OUTPUT 31, 17, 30, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
    ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
    ;step2[18] = dct_const_round_shift(temp1);
    ;step2[29] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
    STORE_IN_OUTPUT 30, 29, 18, q1, q0
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[19] = step1b[16][i] - step1b[19][i];
    ;step1[28] = step1b[31][i] - step1b[28][i];
    vsub.s16 q13, q4, q2
    vsub.s16 q14, q6, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
    ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
    ;step2[19] = dct_const_round_shift(temp1);
    ;step2[28] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
    STORE_IN_OUTPUT 18, 19, 28, q4, q6
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK B: 20-23,24-27
    ; --------------------------------------------------------------------------
    ; generate 20,21,26,27
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
    ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
    ;step1b[20][i] = dct_const_round_shift(temp1);
    ;step1b[27][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 7, 5, 27
    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
    ;step1b[21][i] = dct_const_round_shift(temp1);
    ;step1b[26][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 27, 21, 11
    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[20] = step1b[20][i] + step1b[21][i];
    ;step2[21] = step1b[20][i] - step1b[21][i];
    ;step2[26] = -step1b[26][i] + step1b[27][i];
    ;step2[27] = step1b[26][i] + step1b[27][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
    ;step3[21] = dct_const_round_shift(temp1);
    ;step3[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 22,23,24,25
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
    ;step1b[22][i] = dct_const_round_shift(temp1);
    ;step1b[25][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 11, 13, 19
    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 1
    ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
    ;step1b[23][i] = dct_const_round_shift(temp1);
    ;step1b[24][i] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 19, 29, 3
    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;step2[22] = -step1b[22][i] + step1b[23][i];
    ;step2[23] = step1b[22][i] + step1b[23][i];
    ;step2[24] = step1b[24][i] + step1b[25][i];
    ;step2[25] = step1b[24][i] - step1b[25][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
    ;step3[25] = dct_const_round_shift(temp1);
    ;step3[22] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 20-23,24-27
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[22] = step1b[22][i] + step1b[21][i];
    ;step1[23] = step1b[23][i] + step1b[20][i];
    vadd.s16 q10, q7, q1
    vadd.s16 q11, q5, q0
    ;step1[24] = step1b[24][i] + step1b[27][i];
    ;step1[25] = step1b[25][i] + step1b[26][i];
    vadd.s16 q12, q6, q2
    vadd.s16 q15, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[16] = step1b[16][i] + step1b[23][i];
    ;step3[17] = step1b[17][i] + step1b[22][i];
    ;step3[22] = step1b[17][i] - step1b[22][i];
    ;step3[23] = step1b[16][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
    vadd.s16 q8, q14, q11
    vadd.s16 q9, q13, q10
    vsub.s16 q13, q13, q10
    vsub.s16 q11, q14, q11
    STORE_IN_OUTPUT 17, 17, 16, q9, q8
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[24] = step1b[31][i] - step1b[24][i];
    ;step3[25] = step1b[30][i] - step1b[25][i];
    ;step3[30] = step1b[30][i] + step1b[25][i];
    ;step3[31] = step1b[31][i] + step1b[24][i];
    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
    vsub.s16 q8, q9, q12
    vadd.s16 q10, q14, q15
    vsub.s16 q14, q14, q15
    vadd.s16 q12, q9, q12
    STORE_IN_OUTPUT 31, 30, 31, q10, q12
    ; --------------------------------------------------------------------------
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpush {q8}  ; [24]
    vpush {q11} ; [23]
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
    ;step1[22] = dct_const_round_shift(temp1);
    ;step1[25] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 31, 25, 22, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
    ;step1[23] = dct_const_round_shift(temp1);
    ;step1[24] = dct_const_round_shift(temp2);
    ; TODO(cd) do some register allocation change to remove these push/pop
    vpop {q13} ; [23]
    vpop {q14} ; [24]
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 22, 24, 23, q14, q13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[20] = step1b[23][i] - step1b[20][i];
    ;step1[27] = step1b[24][i] - step1b[27][i];
    vsub.s16 q14, q5, q0
    vsub.s16 q13, q6, q2
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
    ;step2[27] = dct_const_round_shift(temp1);
    ;step2[20] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[21] = step1b[22][i] - step1b[21][i];
    ;step1[26] = step1b[25][i] - step1b[26][i];
    vsub.s16 q14, q7, q1
    vsub.s16 q13, q4, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
    ;step2[26] = dct_const_round_shift(temp1);
    ;step2[21] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[18] = step1b[18][i] + step1b[21][i];
    ;step3[19] = step1b[19][i] + step1b[20][i];
    ;step3[20] = step1b[19][i] - step1b[20][i];
    ;step3[21] = step1b[18][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
    vadd.s16 q8, q14, q1
    vadd.s16 q9, q13, q6
    vsub.s16 q13, q13, q6
    vsub.s16 q1, q14, q1
    STORE_IN_OUTPUT 19, 18, 19, q8, q9
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[27] = step1b[28][i] - step1b[27][i];
    ;step3[28] = step1b[28][i] + step1b[27][i];
    ;step3[29] = step1b[29][i] + step1b[26][i];
    ;step3[26] = step1b[29][i] - step1b[26][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
    vsub.s16 q14, q8, q5
    vadd.s16 q10, q8, q5
    vadd.s16 q11, q9, q0
    vsub.s16 q0, q9, q0
    STORE_IN_OUTPUT 29, 28, 29, q10, q11
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
    ;step1[20] = dct_const_round_shift(temp1);
    ;step1[27] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
    STORE_IN_OUTPUT 29, 20, 27, q13, q14
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
    ;step1[21] = dct_const_round_shift(temp1);
    ;step1[26] = dct_const_round_shift(temp2);
    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
    STORE_IN_OUTPUT 27, 21, 26, q1, q0
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK C: 8-10,11-15
    ; --------------------------------------------------------------------------
    ; generate 8,9,14,15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
    ;step2[8] = dct_const_round_shift(temp1);
    ;step2[15] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 3, 2, 30
    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
    ;step2[9] = dct_const_round_shift(temp1);
    ;step2[14] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 30, 18, 14
    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[8] = step1b[8][i] + step1b[9][i];
    ;step3[9] = step1b[8][i] - step1b[9][i];
    ;step3[14] = step1b[15][i] - step1b[14][i];
    ;step3[15] = step1b[15][i] + step1b[14][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
    ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
    ;step1[9] = dct_const_round_shift(temp1);
    ;step1[14] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 10,11,12,13
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
    ;step2[10] = dct_const_round_shift(temp1);
    ;step2[13] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 14, 10, 22
    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 2
    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
    ;step2[11] = dct_const_round_shift(temp1);
    ;step2[12] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 22, 26, 6
    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;step3[10] = step1b[11][i] - step1b[10][i];
    ;step3[11] = step1b[11][i] + step1b[10][i];
    ;step3[12] = step1b[12][i] + step1b[13][i];
    ;step3[13] = step1b[12][i] - step1b[13][i];
    vsub.s16 q14, q4, q5
    vadd.s16 q5, q4, q5
    vsub.s16 q13, q6, q7
    vadd.s16 q6, q6, q7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
    ;step1[13] = dct_const_round_shift(temp1);
    ;step1[10] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
    ; --------------------------------------------------------------------------
    ; combine 8-10,11-15
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[8] = step1b[8][i] + step1b[11][i];
    ;step2[9] = step1b[9][i] + step1b[10][i];
    ;step2[10] = step1b[9][i] - step1b[10][i];
    vadd.s16 q8, q0, q5
    vadd.s16 q9, q1, q7
    vsub.s16 q13, q1, q7
    ;step2[13] = step1b[14][i] - step1b[13][i];
    ;step2[14] = step1b[14][i] + step1b[13][i];
    ;step2[15] = step1b[15][i] + step1b[12][i];
    vsub.s16 q14, q3, q4
    vadd.s16 q10, q3, q4
    vadd.s16 q15, q2, q6
    STORE_IN_OUTPUT 26, 8, 15, q8, q15
    STORE_IN_OUTPUT 15, 9, 14, q9, q10
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
    ;step3[10] = dct_const_round_shift(temp1);
    ;step3[13] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 14, 13, 10, q3, q1
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[11] = step1b[8][i] - step1b[11][i];
    ;step2[12] = step1b[15][i] - step1b[12][i];
    vsub.s16 q13, q0, q5
    vsub.s16 q14, q2, q6
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
    ;step3[11] = dct_const_round_shift(temp1);
    ;step3[12] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    STORE_IN_OUTPUT 10, 11, 12, q1, q3
    ; --------------------------------------------------------------------------


    ; --------------------------------------------------------------------------
    ; BLOCK D: 0-3,4-7
    ; --------------------------------------------------------------------------
    ; generate 4,5,6,7
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
    ;step3[4] = dct_const_round_shift(temp1);
    ;step3[7] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 6, 4, 28
    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
    ; --------------------------------------------------------------------------
    ; part of stage 3
    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
    ;step3[5] = dct_const_round_shift(temp1);
    ;step3[6] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 28, 20, 12
    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;step1[4] = step1b[4][i] + step1b[5][i];
    ;step1[5] = step1b[4][i] - step1b[5][i];
    ;step1[6] = step1b[7][i] - step1b[6][i];
    ;step1[7] = step1b[7][i] + step1b[6][i];
    vsub.s16 q13, q0, q1
    vadd.s16 q0, q0, q1
    vsub.s16 q14, q2, q3
    vadd.s16 q2, q2, q3
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
    ;step2[5] = dct_const_round_shift(temp1);
    ;step2[6] = dct_const_round_shift(temp2);
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
    ; --------------------------------------------------------------------------
    ; generate 0,1,2,3
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
    ;step1[1] = dct_const_round_shift(temp1);
    ;step1[0] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 12, 0, 16
    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
    ; --------------------------------------------------------------------------
    ; part of stage 4
    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
    ;step1[2] = dct_const_round_shift(temp1);
    ;step1[3] = dct_const_round_shift(temp2);
    LOAD_FROM_TRANSPOSED 16, 8, 24
    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
    ; --------------------------------------------------------------------------
    ; part of stage 5
    ;step2[0] = step1b[0][i] + step1b[3][i];
    ;step2[1] = step1b[1][i] + step1b[2][i];
    ;step2[2] = step1b[1][i] - step1b[2][i];
    ;step2[3] = step1b[0][i] - step1b[3][i];
    vadd.s16 q4, q7, q6
    vsub.s16 q7, q7, q6
    vsub.s16 q6, q5, q14
    vadd.s16 q5, q5, q14
    ; --------------------------------------------------------------------------
    ; combine 0-3,4-7
    ; --------------------------------------------------------------------------
    ; part of stage 6
    ;step3[0] = step1b[0][i] + step1b[7][i];
    ;step3[1] = step1b[1][i] + step1b[6][i];
    ;step3[2] = step1b[2][i] + step1b[5][i];
    ;step3[3] = step1b[3][i] + step1b[4][i];
    vadd.s16 q8, q4, q2
    vadd.s16 q9, q5, q3
    vadd.s16 q10, q6, q1
    vadd.s16 q11, q7, q0
    ;step3[4] = step1b[3][i] - step1b[4][i];
    ;step3[5] = step1b[2][i] - step1b[5][i];
    ;step3[6] = step1b[1][i] - step1b[6][i];
    ;step3[7] = step1b[0][i] - step1b[7][i];
    vsub.s16 q12, q7, q0
    vsub.s16 q13, q6, q1
    vsub.s16 q14, q5, q3
    vsub.s16 q15, q4, q2
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[0] = step1b[0][i] + step1b[15][i];
    ;step1[1] = step1b[1][i] + step1b[14][i];
    ;step1[14] = step1b[1][i] - step1b[14][i];
    ;step1[15] = step1b[0][i] - step1b[15][i];
    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
    vadd.s16 q2, q8, q1
    vadd.s16 q3, q9, q0
    vsub.s16 q4, q9, q0
    vsub.s16 q5, q8, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1

| 996 | cmp r5, #0 |
| 997 | bgt idct32_bands_end_2nd_pass |
| 998 | |
| 999 | idct32_bands_end_1st_pass |
| 1000 | STORE_IN_OUTPUT 17, 16, 17, q6, q7 |
| 1001 | STORE_IN_OUTPUT 17, 14, 15, q8, q9 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1002 | ; -------------------------------------------------------------------------- |
| 1003 | ; part of final stage |
| 1004 | ;output[ 0 * 32] = step1b[0][i] + step1b[31][i]; |
| 1005 | ;output[ 1 * 32] = step1b[1][i] + step1b[30][i]; |
| 1006 | ;output[30 * 32] = step1b[1][i] - step1b[30][i]; |
| 1007 | ;output[31 * 32] = step1b[0][i] - step1b[31][i]; |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1008 | LOAD_FROM_OUTPUT 15, 30, 31, q0, q1 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1009 | vadd.s16 q4, q2, q1 |
| 1010 | vadd.s16 q5, q3, q0 |
| 1011 | vsub.s16 q6, q3, q0 |
| 1012 | vsub.s16 q7, q2, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1013 | STORE_IN_OUTPUT 31, 30, 31, q6, q7 |
| 1014 | STORE_IN_OUTPUT 31, 0, 1, q4, q5 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1015 | ; -------------------------------------------------------------------------- |
| 1016 | ; part of stage 7 |
| 1017 | ;step1[2] = step1b[2][i] + step1b[13][i]; |
| 1018 | ;step1[3] = step1b[3][i] + step1b[12][i]; |
| 1019 | ;step1[12] = step1b[3][i] - step1b[12][i]; |
| 1020 | ;step1[13] = step1b[2][i] - step1b[13][i]; |
| 1021 | LOAD_FROM_OUTPUT 1, 12, 13, q0, q1 |
| 1022 | vadd.s16 q2, q10, q1 |
| 1023 | vadd.s16 q3, q11, q0 |
| 1024 | vsub.s16 q4, q11, q0 |
| 1025 | vsub.s16 q5, q10, q1 |
| 1026 | ; -------------------------------------------------------------------------- |
| 1027 | ; part of final stage |
| 1028 | ;output[12 * 32] = step1b[12][i] + step1b[19][i]; |
| 1029 | ;output[13 * 32] = step1b[13][i] + step1b[18][i]; |
| 1030 | ;output[18 * 32] = step1b[13][i] - step1b[18][i]; |
| 1031 | ;output[19 * 32] = step1b[12][i] - step1b[19][i]; |
| 1032 | LOAD_FROM_OUTPUT 13, 18, 19, q0, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1033 | vadd.s16 q8, q4, q1 |
| 1034 | vadd.s16 q9, q5, q0 |
| 1035 | vsub.s16 q6, q5, q0 |
| 1036 | vsub.s16 q7, q4, q1 |
| 1037 | STORE_IN_OUTPUT 19, 18, 19, q6, q7 |
| 1038 | STORE_IN_OUTPUT 19, 12, 13, q8, q9 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1039 | ; -------------------------------------------------------------------------- |
| 1040 | ; part of final stage |
| 1041 | ;output[ 2 * 32] = step1b[2][i] + step1b[29][i]; |
| 1042 | ;output[ 3 * 32] = step1b[3][i] + step1b[28][i]; |
| 1043 | ;output[28 * 32] = step1b[3][i] - step1b[28][i]; |
| 1044 | ;output[29 * 32] = step1b[2][i] - step1b[29][i]; |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1045 | LOAD_FROM_OUTPUT 13, 28, 29, q0, q1 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1046 | vadd.s16 q4, q2, q1 |
| 1047 | vadd.s16 q5, q3, q0 |
| 1048 | vsub.s16 q6, q3, q0 |
| 1049 | vsub.s16 q7, q2, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1050 | STORE_IN_OUTPUT 29, 28, 29, q6, q7 |
| 1051 | STORE_IN_OUTPUT 29, 2, 3, q4, q5 |
Christian Duvivier | 6a50146 | 2013-09-11 15:18:47 -0700 | [diff] [blame] | 1052 | ; -------------------------------------------------------------------------- |
| 1053 | ; part of stage 7 |
| 1054 | ;step1[4] = step1b[4][i] + step1b[11][i]; |
| 1055 | ;step1[5] = step1b[5][i] + step1b[10][i]; |
| 1056 | ;step1[10] = step1b[5][i] - step1b[10][i]; |
| 1057 | ;step1[11] = step1b[4][i] - step1b[11][i]; |
| 1058 | LOAD_FROM_OUTPUT 3, 10, 11, q0, q1 |
| 1059 | vadd.s16 q2, q12, q1 |
| 1060 | vadd.s16 q3, q13, q0 |
| 1061 | vsub.s16 q4, q13, q0 |
| 1062 | vsub.s16 q5, q12, q1 |
| 1063 | ; -------------------------------------------------------------------------- |
| 1064 | ; part of final stage |
| 1065 | ;output[10 * 32] = step1b[10][i] + step1b[21][i]; |
| 1066 | ;output[11 * 32] = step1b[11][i] + step1b[20][i]; |
| 1067 | ;output[20 * 32] = step1b[11][i] - step1b[20][i]; |
| 1068 | ;output[21 * 32] = step1b[10][i] - step1b[21][i]; |
| 1069 | LOAD_FROM_OUTPUT 11, 20, 21, q0, q1 |
Christian Duvivier | 5b1dc15 | 2013-09-25 18:07:10 -0700 | [diff] [blame] | 1070 | vadd.s16 q8, q4, q1 |
| 1071 | vadd.s16 q9, q5, q0 |
| 1072 | vsub.s16 q6, q5, q0 |
| 1073 | vsub.s16 q7, q4, q1 |
| 1074 | STORE_IN_OUTPUT 21, 20, 21, q6, q7 |
| 1075 | STORE_IN_OUTPUT 21, 10, 11, q8, q9 |
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 11, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 27, 26, 27, q6, q7
    STORE_IN_OUTPUT 27, 4, 5, q4, q5
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_IN_OUTPUT 23, 22, 23, q6, q7
    STORE_IN_OUTPUT 23, 8, 9, q8, q9
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 9, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_IN_OUTPUT 25, 24, 25, q6, q7
    STORE_IN_OUTPUT 25, 6, 7, q4, q5

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (STORE_IN_OUTPUT 25, 6, 7) => 7*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #7*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2
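    ; (worked out in bytes, with 16-bit elements: 24*8*2 = 384,
    ; 7*32*2 - 8*2 = 432, 8*32*2 - 32*2 = 448)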

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

    ; parameters for second pass
    ; the input of pass 2 is the result of pass 1; we have to remove the
    ; offset of 32 columns (32*2 = 64 bytes) induced by idct32_bands_loop
    ; above
    sub r3, r1, #32*2
    ; r1 = pass2[32 * 32]
    add r1, sp, #2048
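    ; 2048 = 32*32*2 bytes, the size of one full 32x32 buffer of 16-bit
    ; coefficients held on the stack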

    ; pass loop processing
    add r5, r5, #1
    b idct32_pass_loop
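    ; r5 is the pass counter; once it is non-zero, the band epilogue is
    ; expected to branch to idct32_bands_end_2nd_pass below rather than
    ; take the first-pass path again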

idct32_bands_end_2nd_pass
    STORE_COMBINE_CENTER_RESULTS
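    ; unlike the raw STORE_IN_OUTPUT writes of the first pass, the
    ; STORE_COMBINE_* macros appear only on this second pass; they are
    ; assumed to round the final values, add them to the destination pixels
    ; and store the clamped result rather than raw intermediates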
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
    LOAD_FROM_OUTPUT 17, 30, 31, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[2] = step1b[2][i] + step1b[13][i];
    ;step1[3] = step1b[3][i] + step1b[12][i];
    ;step1[12] = step1b[3][i] - step1b[12][i];
    ;step1[13] = step1b[2][i] - step1b[13][i];
    LOAD_FROM_OUTPUT 31, 12, 13, q0, q1
    vadd.s16 q2, q10, q1
    vadd.s16 q3, q11, q0
    vsub.s16 q4, q11, q0
    vsub.s16 q5, q10, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
    LOAD_FROM_OUTPUT 19, 28, 29, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[4] = step1b[4][i] + step1b[11][i];
    ;step1[5] = step1b[5][i] + step1b[10][i];
    ;step1[10] = step1b[5][i] - step1b[10][i];
    ;step1[11] = step1b[4][i] - step1b[11][i];
    LOAD_FROM_OUTPUT 29, 10, 11, q0, q1
    vadd.s16 q2, q12, q1
    vadd.s16 q3, q13, q0
    vsub.s16 q4, q13, q0
    vsub.s16 q5, q12, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
    LOAD_FROM_OUTPUT 21, 26, 27, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS
    ; --------------------------------------------------------------------------
    ; part of stage 7
    ;step1[6] = step1b[6][i] + step1b[9][i];
    ;step1[7] = step1b[7][i] + step1b[8][i];
    ;step1[8] = step1b[7][i] - step1b[8][i];
    ;step1[9] = step1b[6][i] - step1b[9][i];
    LOAD_FROM_OUTPUT 27, 8, 9, q0, q1
    vadd.s16 q2, q14, q1
    vadd.s16 q3, q15, q0
    vsub.s16 q4, q15, q0
    vsub.s16 q5, q14, q1
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
    vadd.s16 q8, q4, q1
    vadd.s16 q9, q5, q0
    vsub.s16 q6, q5, q0
    vsub.s16 q7, q4, q1
    STORE_COMBINE_CENTER_RESULTS_LAST
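    ; per the fix-up comment below, the _LAST variant also advances the
    ; destination pointers past the eight columns just written, so only the
    ; row offset needs correcting afterwards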
    ; --------------------------------------------------------------------------
    ; part of final stage
    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
    LOAD_FROM_OUTPUT 23, 24, 25, q0, q1
    vadd.s16 q4, q2, q1
    vadd.s16 q5, q3, q0
    vsub.s16 q6, q3, q0
    vsub.s16 q7, q2, q1
    STORE_COMBINE_EXTREME_RESULTS_LAST
    ; --------------------------------------------------------------------------
    ; restore pointers to their initial indices for the next band pass by
    ; removing/adding dest_stride * 8. the actual increment by eight columns
    ; is taken care of within the _LAST macros.
    add r6, r6, r2, lsl #3
    add r9, r9, r2, lsl #3
    sub r7, r7, r2, lsl #3
    sub r10, r10, r2, lsl #3
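    ; r2 holds dest_stride, so "r2, lsl #3" is dest_stride * 8: r6/r9 move
    ; eight rows forward while r7/r10 move eight rows back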

    ; restore r0 by removing the last offset from the last
    ; operation (LOAD_FROM_TRANSPOSED 16, 8, 24) => 24*8*2
    sub r0, r0, #24*8*2
    ; restore r1 by removing the last offset from the last
    ; operation (LOAD_FROM_OUTPUT 23, 24, 25) => 25*32*2
    ; advance by 8 columns => 8*2
    sub r1, r1, #25*32*2 - 8*2
    ; advance by 8 lines (8*32*2)
    ; go back by the two pairs from the loop (32*2)
    add r3, r3, #8*32*2 - 32*2
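    ; (in bytes: 24*8*2 = 384, 25*32*2 - 8*2 = 1584, 8*32*2 - 32*2 = 448)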

    ; bands loop processing
    subs r4, r4, #1
    bne idct32_bands_loop

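    ; epilogue: release the 512+2048+2048 = 4608-byte stack frame (each
    ; 2048-byte region is one 32x32 buffer of 16-bit coefficients, and the
    ; 512 bytes presumably back the 8x32 transpose scratch area), then
    ; restore the callee-saved registers d8-d15 and r4-r11 required by the
    ; AAPCS before returning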
    add sp, sp, #512+2048+2048
    vpop {d8-d15}
    pop {r4-r11}
    bx lr
    ENDP ; |aom_idct32x32_1024_add_neon|
    END