Merge "Deleted #include <inttypes.h>"
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index fc38605..3e3e400 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -29,8 +29,8 @@
int16_t skip_adding,
uint8_t *dest,
int dest_stride);
-extern void save_registers();
-extern void restore_registers();
+extern void save_neon_registers();
+extern void restore_neon_registers();
void vp9_short_idct16x16_add_neon(int16_t *input,
@@ -39,7 +39,7 @@
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
- save_registers();
+ save_neon_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -102,7 +102,7 @@
dest_stride);
// restore d8-d15 register values.
- restore_registers();
+ restore_neon_registers();
return;
}
@@ -113,7 +113,7 @@
int16_t row_idct_output[16*16] = {0};
// save d8-d15 register values.
- save_registers();
+ save_neon_registers();
/* Parallel idct on the upper 8 rows */
// First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -163,7 +163,7 @@
dest_stride);
// restore d8-d15 register values.
- restore_registers();
+ restore_neon_registers();
return;
}
diff --git a/vp9/common/arm/neon/vp9_idct32x32_neon.c b/vp9/common/arm/neon/vp9_idct32x32_neon.c
new file mode 100644
index 0000000..ceecd6f
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct32x32_neon.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+
+// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
+ int16_t *output, int16_t *input);
+extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
+
+
+// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+extern void save_neon_registers();
+extern void restore_neon_registers();
+// 32x32 inverse transform in two passes (rows, then columns), added to dest.
+void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
+ int dest_stride) {
+ // TODO(cd): move the creation of these buffers within the ASM file
+ // scratch buffer the ASM transposes 8 lines into before transforming them
+ int16_t transpose_buffer[32 * 8];
+ // results of the first pass (transpose and transform rows)
+ int16_t pass1[32 * 32];
+ // results of the second pass (transpose and transform columns)
+ int16_t pass2[32 * 32];
+
+ // save d8-d15; NOTE(review): helper returns with them still vpush'ed on the stack
+ save_neon_registers();
+ // first pass: process rows (input -> pass1)
+ idct32_transpose_and_transform(transpose_buffer, pass1, input);
+ // second pass: process columns (pass1 -> pass2)
+ // TODO(cd): do these two steps/passes within the ASM file
+ idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
+ // combine pass2 and add to dest
+ // TODO(cd): integrate this within the last storage step of the second pass
+ idct32_combine_add(dest, pass2, dest_stride);
+ // restore d8-d15 (vpop matching the earlier save_neon_registers call)
+ restore_neon_registers();
+}
+
+// TODO(cd): Eliminate this file altogether when everything is in ASM file
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index 92f3ceb..7464e80 100644
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -12,8 +12,8 @@
EXPORT |vp9_short_idct16x16_add_neon_pass2|
EXPORT |vp9_short_idct10_16x16_add_neon_pass1|
EXPORT |vp9_short_idct10_16x16_add_neon_pass2|
- EXPORT |save_registers|
- EXPORT |restore_registers|
+ EXPORT |save_neon_registers|
+ EXPORT |restore_neon_registers|
ARM
REQUIRE8
PRESERVE8
@@ -1178,13 +1178,13 @@
pop {r3-r9}
bx lr
ENDP ; |vp9_short_idct10_16x16_add_neon_pass2|
-;void |save_registers|()
-|save_registers| PROC
+;void |save_neon_registers|()
+|save_neon_registers| PROC
vpush {d8-d15}
bx lr
ENDP ; |save_registers|
-;void |restore_registers|()
-|restore_registers| PROC
+;void |restore_neon_registers|()
+|restore_neon_registers| PROC
vpop {d8-d15}
bx lr
ENDP ; |restore_registers|
diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
new file mode 100644
index 0000000..5c097cc
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -0,0 +1,1013 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+;TODO(cd): adjust these constants to be able to use vqdmulh for faster
+; dct_const_round_shift(a * b) within butterfly calculations.
+cospi_1_64 EQU 16364
+cospi_2_64 EQU 16305
+cospi_3_64 EQU 16207
+cospi_4_64 EQU 16069
+cospi_5_64 EQU 15893
+cospi_6_64 EQU 15679
+cospi_7_64 EQU 15426
+cospi_8_64 EQU 15137
+cospi_9_64 EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU 9760
+cospi_20_64 EQU 9102
+cospi_21_64 EQU 8423
+cospi_22_64 EQU 7723
+cospi_23_64 EQU 7005
+cospi_24_64 EQU 6270
+cospi_25_64 EQU 5520
+cospi_26_64 EQU 4756
+cospi_27_64 EQU 3981
+cospi_28_64 EQU 3196
+cospi_29_64 EQU 2404
+cospi_30_64 EQU 1606
+cospi_31_64 EQU 804
+
+
+ EXPORT |idct32_transpose_and_transform|
+ EXPORT |idct32_combine_add|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ AREA Block, CODE, READONLY
+
+ ; --------------------------------------------------------------------------
+ ; Load from transposed_buffer (r0 is the running pointer into it)
+ ; q14 = transposed_buffer[first_offset]
+ ; q13 = transposed_buffer[second_offset]
+ ; for proper address calculation, the last offset used when manipulating
+ ; transposed_buffer must be passed in. use 0 for first use.
+ MACRO
+ LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+ ; address calculation with proper stride (8*2 bytes per offset) and loading
+ add r0, #($first_offset - $prev_offset )*8*2
+ vld1.s16 {q14}, [r0]
+ add r0, #($second_offset - $first_offset)*8*2
+ vld1.s16 {q13}, [r0]
+ ; (used) two registers (q14, q13); r0 is left pointing at second_offset
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Load from output (used as temporary storage); r1 is the running pointer
+ ; reg1 = output[first_offset]
+ ; reg2 = output[second_offset]
+ ; for proper address calculation, the last offset used when manipulating
+ ; output (whether reading or storing) must be passed in. use 0 for first
+ ; use.
+ MACRO
+ LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+ ; address calculation with proper stride (32*2 bytes per offset) and loading
+ add r1, #($first_offset - $prev_offset )*32*2
+ vld1.s16 {$reg1}, [r1]
+ add r1, #($second_offset - $first_offset)*32*2
+ vld1.s16 {$reg2}, [r1]
+ ; (used) two registers ($reg1, $reg2); r1 is left pointing at second_offset
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Store into output (sometimes as temporary storage); r1 is the running pointer
+ ; output[first_offset] = reg1
+ ; output[second_offset] = reg2
+ ; for proper address calculation, the last offset used when manipulating
+ ; output (whether reading or storing) must be passed in. use 0 for first
+ ; use.
+ MACRO
+ STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+ ; address calculation with proper stride (32*2 bytes per offset) and storing
+ add r1, #($first_offset - $prev_offset )*32*2
+ vst1.16 {$reg1}, [r1]
+ add r1, #($second_offset - $first_offset)*32*2
+ vst1.16 {$reg2}, [r1]
+ MEND
+ ; --------------------------------------------------------------------------
+ ; Butterfly: reg1:reg2 = rnd(C:D*first - A:B*second), reg3:reg4 = rnd(C:D*second + A:B*first)
+ ; Touches r3, r12, q8-q12, q15 (q13-q14 are preserved); valid output registers are anything but q8-q11
+ MACRO
+ DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ ; TODO(cd): have special case to re-use constants when they are similar for
+ ; consecutive butterflies
+ ; TODO(cd): have special case when both constants are the same, do the
+ ; additions/subtractions before the multiplies.
+ ; generate the constants
+ ; generate scalar constants (built from high and low bytes in two steps)
+ mov r3, #$first_constant & 0xFF00
+ add r3, #$first_constant & 0x00FF
+ mov r12, #$second_constant & 0xFF00
+ add r12, #$second_constant & 0x00FF
+ ; generate vector constants (d30 = first_constant, d31 = second_constant)
+ vdup.16 d30, r3
+ vdup.16 d31, r12
+ ; (used) two for inputs (regA-regD), one for constants (q15)
+ ; do some multiplications (ordered for maximum latency hiding)
+ vmull.s16 q8, $regC, d30
+ vmull.s16 q10, $regA, d31
+ vmull.s16 q9, $regD, d30
+ vmull.s16 q11, $regB, d31
+ vmull.s16 q12, $regC, d31
+ ; (used) five for intermediate (q8-q12), one for constants (q15)
+ ; do some additions/subtractions (to get back two registers)
+ vsub.s32 q8, q8, q10
+ vsub.s32 q9, q9, q11
+ ; do more multiplications (ordered for maximum latency hiding)
+ vmull.s16 q10, $regD, d31
+ vmull.s16 q11, $regA, d30
+ vmull.s16 q15, $regB, d30
+ ; (used) six for intermediate (q8-q12, q15); q15 no longer holds constants
+ ; do more additions/subtractions
+ vadd.s32 q11, q12, q11
+ vadd.s32 q10, q10, q15
+ ; (used) four for intermediate (q8-q11)
+ ; dct_const_round_shift (saturating rounding narrow by 14 bits)
+ vqrshrn.s32 $reg1, q8, #14
+ vqrshrn.s32 $reg2, q9, #14
+ vqrshrn.s32 $reg3, q11, #14
+ vqrshrn.s32 $reg4, q10, #14
+ ; (used) two for results, well four d registers
+ MEND
+ ; --------------------------------------------------------------------------
+ ; DO_BUTTERFLY with q14 (d28, d29) as regC/regD and q13 (d26, d27) as regA/regB
+ ; Touches q8-q12, q15 (q13-q14 are preserved unless named as outputs); valid output registers are anything but q8-q11
+ MACRO
+ DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+ MEND
+ ; --------------------------------------------------------------------------
+
+;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input);
+;
+; r0 int16_t *transpose_buffer
+; r1 int16_t *output
+; r2 int16_t *input
+; TODO(cd): have more logical parameter ordering but this issue will disappear
+; when functions are combined.
+
+|idct32_transpose_and_transform| PROC
+ ; This function does one pass of idct32x32 transform.
+ ;
+ ; This is done by transposing the input and then doing a 1d transform on
+ ; columns. In the first pass, the transposed columns are the original
+ ; rows. In the second pass, after the transposition, the columns are the
+ ; original columns.
+ ; The 1d transform is done by looping over bands of eight columns (the
+ ; idct32_bands loop). For each band, the transform input transposition
+ ; is done on demand, one band of four 8x8 matrices at a time. The four
+ ; matrices are transposed by pairs (the idct32_transpose_pair loop).
+ push {r4}
+ mov r4, #0 ; initialize bands loop counter
+idct32_bands_loop
+ ; TODO(cd) get rid of these push/pop by properly adjusting register
+ ; content at end of loop
+ push {r0}
+ push {r1}
+ push {r2}
+ mov r3, #0 ; initialize transpose loop counter
+idct32_transpose_pair_loop
+ ; Load two horizontally consecutive 8x8 16bit data matrices. The first one
+ ; into q8-q15 and the second one into q0-q7. There is a stride of 64,
+ ; adjusted to 32 because of the two post-increments.
+ vld1.s16 {q8}, [r2]!
+ vld1.s16 {q0}, [r2]!
+ add r2, #32
+ vld1.s16 {q9}, [r2]!
+ vld1.s16 {q1}, [r2]!
+ add r2, #32
+ vld1.s16 {q10}, [r2]!
+ vld1.s16 {q2}, [r2]!
+ add r2, #32
+ vld1.s16 {q11}, [r2]!
+ vld1.s16 {q3}, [r2]!
+ add r2, #32
+ vld1.s16 {q12}, [r2]!
+ vld1.s16 {q4}, [r2]!
+ add r2, #32
+ vld1.s16 {q13}, [r2]!
+ vld1.s16 {q5}, [r2]!
+ add r2, #32
+ vld1.s16 {q14}, [r2]!
+ vld1.s16 {q6}, [r2]!
+ add r2, #32
+ vld1.s16 {q15}, [r2]!
+ vld1.s16 {q7}, [r2]!
+
+ ; Transpose the two 8x8 16bit data matrices.
+ vswp d17, d24
+ vswp d23, d30
+ vswp d21, d28
+ vswp d19, d26
+ vswp d1, d8
+ vswp d7, d14
+ vswp d5, d12
+ vswp d3, d10
+ vtrn.32 q8, q10
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.32 q0, q2
+ vtrn.32 q1, q3
+ vtrn.32 q4, q6
+ vtrn.32 q5, q7
+ vtrn.16 q8, q9
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ vtrn.16 q0, q1
+ vtrn.16 q2, q3
+ vtrn.16 q4, q5
+ vtrn.16 q6, q7
+
+ ; Store both matrices after each other. There is a stride of 32, which
+ ; adjusts to nothing because of the post-increments.
+ vst1.16 {q8}, [r0]!
+ vst1.16 {q9}, [r0]!
+ vst1.16 {q10}, [r0]!
+ vst1.16 {q11}, [r0]!
+ vst1.16 {q12}, [r0]!
+ vst1.16 {q13}, [r0]!
+ vst1.16 {q14}, [r0]!
+ vst1.16 {q15}, [r0]!
+ vst1.16 {q0}, [r0]!
+ vst1.16 {q1}, [r0]!
+ vst1.16 {q2}, [r0]!
+ vst1.16 {q3}, [r0]!
+ vst1.16 {q4}, [r0]!
+ vst1.16 {q5}, [r0]!
+ vst1.16 {q6}, [r0]!
+ vst1.16 {q7}, [r0]!
+
+ ; increment pointers by adjusted stride (not necessary for r0/transpose_buffer)
+ sub r2, r2, #8*32*2-32-16*2
+ ; transpose pair loop processing
+ add r3, r3, #1
+ cmp r3, #1
+ BLE idct32_transpose_pair_loop
+
+ ; restore r0/transpose_buffer to its original value
+ sub r0, r0, #32*8*2
+
+ ; Instead of doing the transforms stage by stage, it is done by loading
+ ; some input values and doing as many stages as possible to minimize the
+ ; storing/loading of intermediate results. To fit within registers, the
+ ; final coefficients are cut into four blocks:
+ ; BLOCK A: 16-19,28-31
+ ; BLOCK B: 20-23,24-27
+ ; BLOCK C: 8-11,12-15
+ ; BLOCK D: 0-3,4-7
+ ; Blocks A and C are straight calculation through the various stages. In
+ ; block B, further calculations are performed using the results from
+ ; block A. In block D, further calculations are performed using the results
+ ; from block C and then the final calculations are done using results from
+ ; block A and B which have been combined at the end of block B.
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK A: 16-19,28-31
+ ; --------------------------------------------------------------------------
+ ; generate 16,17,30,31
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] * cospi_1_64;
+ ;temp2 = input[1 * 32] * cospi_1_64 + input[31 * 32] * cospi_31_64;
+ ;step1b[16][i] = dct_const_round_shift(temp1);
+ ;step1b[31][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 0, 1, 31
+ DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+ ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+ ;step1b[17][i] = dct_const_round_shift(temp1);
+ ;step1b[30][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 31, 17, 15
+ DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[16] = step1b[16][i] + step1b[17][i];
+ ;step2[17] = step1b[16][i] - step1b[17][i];
+ ;step2[30] = -step1b[30][i] + step1b[31][i];
+ ;step2[31] = step1b[30][i] + step1b[31][i];
+ vadd.s16 q4, q0, q1
+ vsub.s16 q13, q0, q1
+ vadd.s16 q6, q2, q3
+ vsub.s16 q14, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+ ;temp2 = step1b[30][i] * cospi_4_64 + step1b[17][i] * cospi_28_64;
+ ;step3[17] = dct_const_round_shift(temp1);
+ ;step3[30] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; generate 18,19,28,29
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+ ;temp2 = input[9 * 32] * cospi_9_64 + input[23 * 32] * cospi_23_64;
+ ;step1b[18][i] = dct_const_round_shift(temp1);
+ ;step1b[29][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 15, 9, 23
+ DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[25 * 32] * cospi_7_64 - input[7 * 32] * cospi_25_64;
+ ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+ ;step1b[19][i] = dct_const_round_shift(temp1);
+ ;step1b[28][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 23, 25, 7
+ DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[18] = -step1b[18][i] + step1b[19][i];
+ ;step2[19] = step1b[18][i] + step1b[19][i];
+ ;step2[28] = step1b[28][i] + step1b[29][i];
+ ;step2[29] = step1b[28][i] - step1b[29][i];
+ vsub.s16 q13, q3, q2
+ vadd.s16 q3, q3, q2
+ vsub.s16 q14, q1, q0
+ vadd.s16 q2, q1, q0
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[18][i] * (-cospi_4_64) - step1b[29][i] * (-cospi_28_64);
+ ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+ ;step3[29] = dct_const_round_shift(temp1);
+ ;step3[18] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+ ; --------------------------------------------------------------------------
+ ; combine 16-19,28-31
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[16] = step1b[16][i] + step1b[19][i];
+ ;step1[17] = step1b[17][i] + step1b[18][i];
+ ;step1[18] = step1b[17][i] - step1b[18][i];
+ ;step1[29] = step1b[30][i] - step1b[29][i];
+ ;step1[30] = step1b[30][i] + step1b[29][i];
+ ;step1[31] = step1b[31][i] + step1b[28][i];
+ vadd.s16 q8, q4, q2
+ vadd.s16 q9, q5, q0
+ vadd.s16 q10, q7, q1
+ vadd.s16 q15, q6, q3
+ vsub.s16 q13, q5, q0
+ vsub.s16 q14, q7, q1
+ STORE_IN_OUTPUT 0, 16, 31, q8, q15
+ STORE_IN_OUTPUT 31, 17, 30, q9, q10
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+ ;temp2 = step1b[29][i] * cospi_8_64 + step1b[18][i] * cospi_24_64;
+ ;step2[18] = dct_const_round_shift(temp1);
+ ;step2[29] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+ STORE_IN_OUTPUT 30, 29, 18, q1, q0
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[19] = step1b[16][i] - step1b[19][i];
+ ;step1[28] = step1b[31][i] - step1b[28][i];
+ vsub.s16 q13, q4, q2
+ vsub.s16 q14, q6, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+ ;temp2 = step1b[28][i] * cospi_8_64 + step1b[19][i] * cospi_24_64;
+ ;step2[19] = dct_const_round_shift(temp1);
+ ;step2[28] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+ STORE_IN_OUTPUT 18, 19, 28, q4, q6
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK B: 20-23,24-27
+ ; --------------------------------------------------------------------------
+ ; generate 20,21,26,27
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+ ;temp2 = input[5 * 32] * cospi_5_64 + input[27 * 32] * cospi_27_64;
+ ;step1b[20][i] = dct_const_round_shift(temp1);
+ ;step1b[27][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 7, 5, 27
+ DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+ ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+ ;step1b[21][i] = dct_const_round_shift(temp1);
+ ;step1b[26][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 27, 21, 11
+ DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[20] = step1b[20][i] + step1b[21][i];
+ ;step2[21] = step1b[20][i] - step1b[21][i];
+ ;step2[26] = -step1b[26][i] + step1b[27][i];
+ ;step2[27] = step1b[26][i] + step1b[27][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+ ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+ ;step3[21] = dct_const_round_shift(temp1);
+ ;step3[26] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 22,23,24,25
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+ ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+ ;step1b[22][i] = dct_const_round_shift(temp1);
+ ;step1b[25][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 11, 13, 19
+ DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 1
+ ;temp1 = input[29 * 32] * cospi_3_64 - input[3 * 32] * cospi_29_64;
+ ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+ ;step1b[23][i] = dct_const_round_shift(temp1);
+ ;step1b[24][i] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 19, 29, 3
+ DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;step2[22] = -step1b[22][i] + step1b[23][i];
+ ;step2[23] = step1b[22][i] + step1b[23][i];
+ ;step2[24] = step1b[24][i] + step1b[25][i];
+ ;step2[25] = step1b[24][i] - step1b[25][i];
+ vsub.s16 q14, q4, q5
+ vadd.s16 q5, q4, q5
+ vsub.s16 q13, q6, q7
+ vadd.s16 q6, q6, q7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+ ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+ ;step3[25] = dct_const_round_shift(temp1);
+ ;step3[22] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+ ; --------------------------------------------------------------------------
+ ; combine 20-23,24-27
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[22] = step1b[22][i] + step1b[21][i];
+ ;step1[23] = step1b[23][i] + step1b[20][i];
+ vadd.s16 q10, q7, q1
+ vadd.s16 q11, q5, q0
+ ;step1[24] = step1b[24][i] + step1b[27][i];
+ ;step1[25] = step1b[25][i] + step1b[26][i];
+ vadd.s16 q12, q6, q2
+ vadd.s16 q15, q4, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[16] = step1b[16][i] + step1b[23][i];
+ ;step3[17] = step1b[17][i] + step1b[22][i];
+ ;step3[22] = step1b[17][i] - step1b[22][i];
+ ;step3[23] = step1b[16][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+ vadd.s16 q8, q14, q11
+ vadd.s16 q9, q13, q10
+ vsub.s16 q13, q13, q10
+ vsub.s16 q11, q14, q11
+ STORE_IN_OUTPUT 17, 17, 16, q9, q8
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[24] = step1b[31][i] - step1b[24][i];
+ ;step3[25] = step1b[30][i] - step1b[25][i];
+ ;step3[30] = step1b[30][i] + step1b[25][i];
+ ;step3[31] = step1b[31][i] + step1b[24][i];
+ LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+ vsub.s16 q8, q9, q12
+ vadd.s16 q10, q14, q15
+ vsub.s16 q14, q14, q15
+ vadd.s16 q12, q9, q12
+ STORE_IN_OUTPUT 31, 30, 31, q10, q12
+ ; --------------------------------------------------------------------------
+ ; TODO(cd) do some register allocation change to remove these push/pop
+ vpush {q8} ; [24]
+ vpush {q11} ; [23]
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+ ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+ ;step1[22] = dct_const_round_shift(temp1);
+ ;step1[25] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 31, 25, 22, q14, q13
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+ ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+ ;step1[23] = dct_const_round_shift(temp1);
+ ;step1[24] = dct_const_round_shift(temp2);
+ ; TODO(cd) do some register allocation change to remove these push/pop
+ vpop {q13} ; [23]
+ vpop {q14} ; [24]
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 22, 24, 23, q14, q13
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[20] = step1b[23][i] - step1b[20][i];
+ ;step1[27] = step1b[24][i] - step1b[27][i];
+ vsub.s16 q14, q5, q0
+ vsub.s16 q13, q6, q2
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[20][i] * (-cospi_8_64) - step1b[27][i] * (-cospi_24_64);
+ ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+ ;step2[27] = dct_const_round_shift(temp1);
+ ;step2[20] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[21] = step1b[22][i] - step1b[21][i];
+ ;step1[26] = step1b[25][i] - step1b[26][i];
+ vsub.s16 q14, q7, q1
+ vsub.s16 q13, q4, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = step1b[21][i] * (-cospi_8_64) - step1b[26][i] * (-cospi_24_64);
+ ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+ ;step2[26] = dct_const_round_shift(temp1);
+ ;step2[21] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[18] = step1b[18][i] + step1b[21][i];
+ ;step3[19] = step1b[19][i] + step1b[20][i];
+ ;step3[20] = step1b[19][i] - step1b[20][i];
+ ;step3[21] = step1b[18][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+ vadd.s16 q8, q14, q1
+ vadd.s16 q9, q13, q6
+ vsub.s16 q13, q13, q6
+ vsub.s16 q1, q14, q1
+ STORE_IN_OUTPUT 19, 18, 19, q8, q9
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[27] = step1b[28][i] - step1b[27][i];
+ ;step3[28] = step1b[28][i] + step1b[27][i];
+ ;step3[29] = step1b[29][i] + step1b[26][i];
+ ;step3[26] = step1b[29][i] - step1b[26][i];
+ LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+ vsub.s16 q14, q8, q5
+ vadd.s16 q10, q8, q5
+ vadd.s16 q11, q9, q0
+ vsub.s16 q0, q9, q0
+ STORE_IN_OUTPUT 29, 28, 29, q10, q11
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+ ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+ ;step1[20] = dct_const_round_shift(temp1);
+ ;step1[27] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+ STORE_IN_OUTPUT 29, 20, 27, q13, q14
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+ ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+ ;step1[21] = dct_const_round_shift(temp1);
+ ;step1[26] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+ STORE_IN_OUTPUT 27, 21, 26, q1, q0
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK C: 8-11,12-15
+ ; --------------------------------------------------------------------------
+ ; generate 8,9,14,15
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+ ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+ ;step2[8] = dct_const_round_shift(temp1);
+ ;step2[15] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 3, 2, 30
+ DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+ ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+ ;step2[9] = dct_const_round_shift(temp1);
+ ;step2[14] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 30, 18, 14
+ DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;step3[8] = step1b[8][i] + step1b[9][i];
+ ;step3[9] = step1b[8][i] - step1b[9][i];
+ ;step3[14] = step1b[15][i] - step1b[14][i];
+ ;step3[15] = step1b[15][i] + step1b[14][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+ ;temp2 = step1b[14][i] * cospi_8_64 + step1b[9][i] * cospi_24_64;
+ ;step1[9] = dct_const_round_shift(temp1);
+ ;step1[14] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 10,11,12,13
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+ ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+ ;step2[10] = dct_const_round_shift(temp1);
+ ;step2[13] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 14, 10, 22
+ DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 2
+ ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+ ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+ ;step2[11] = dct_const_round_shift(temp1);
+ ;step2[12] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 22, 26, 6
+ DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;step3[10] = step1b[11][i] - step1b[10][i];
+ ;step3[11] = step1b[11][i] + step1b[10][i];
+ ;step3[12] = step1b[12][i] + step1b[13][i];
+ ;step3[13] = step1b[12][i] - step1b[13][i];
+ vsub.s16 q14, q4, q5
+ vadd.s16 q5, q4, q5
+ vsub.s16 q13, q6, q7
+ vadd.s16 q6, q6, q7
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = step1b[10][i] * (-cospi_8_64) - step1b[13][i] * (-cospi_24_64);
+ ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+ ;step1[13] = dct_const_round_shift(temp1);
+ ;step1[10] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+ ; --------------------------------------------------------------------------
+ ; combine 8-10,11-15
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[8] = step1b[8][i] + step1b[11][i];
+ ;step2[9] = step1b[9][i] + step1b[10][i];
+ ;step2[10] = step1b[9][i] - step1b[10][i];
+ vadd.s16 q8, q0, q5
+ vadd.s16 q9, q1, q7
+ vsub.s16 q13, q1, q7
+ ;step2[13] = step1b[14][i] - step1b[13][i];
+ ;step2[14] = step1b[14][i] + step1b[13][i];
+ ;step2[15] = step1b[15][i] + step1b[12][i];
+ vsub.s16 q14, q3, q4
+ vadd.s16 q10, q3, q4
+ vadd.s16 q15, q2, q6
+ STORE_IN_OUTPUT 26, 8, 15, q8, q15
+ STORE_IN_OUTPUT 15, 9, 14, q9, q10
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+ ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+ ;step3[10] = dct_const_round_shift(temp1);
+ ;step3[13] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ STORE_IN_OUTPUT 14, 13, 10, q3, q1
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[11] = step1b[8][i] - step1b[11][i];
+ ;step2[12] = step1b[15][i] - step1b[12][i];
+ vsub.s16 q13, q0, q5
+ vsub.s16 q14, q2, q6
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+ ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+ ;step3[11] = dct_const_round_shift(temp1);
+ ;step3[12] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ STORE_IN_OUTPUT 10, 11, 12, q1, q3
+ ; --------------------------------------------------------------------------
+
+
+ ; --------------------------------------------------------------------------
+ ; BLOCK D: 0-3,4-7
+ ; --------------------------------------------------------------------------
+ ; generate 4,5,6,7
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+ ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+ ;step3[4] = dct_const_round_shift(temp1);
+ ;step3[7] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 6, 4, 28
+ DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+ ; --------------------------------------------------------------------------
+ ; part of stage 3
+ ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+ ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+ ;step3[5] = dct_const_round_shift(temp1);
+ ;step3[6] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 28, 20, 12
+ DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;step1[4] = step1b[4][i] + step1b[5][i];
+ ;step1[5] = step1b[4][i] - step1b[5][i];
+ ;step1[6] = step1b[7][i] - step1b[6][i];
+ ;step1[7] = step1b[7][i] + step1b[6][i];
+ vsub.s16 q13, q0, q1
+ vadd.s16 q0, q0, q1
+ vsub.s16 q14, q2, q3
+ vadd.s16 q2, q2, q3
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+ ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+ ;step2[5] = dct_const_round_shift(temp1);
+ ;step2[6] = dct_const_round_shift(temp2);
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+ ; --------------------------------------------------------------------------
+ ; generate 0,1,2,3
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+ ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+ ;step1[1] = dct_const_round_shift(temp1);
+ ;step1[0] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 12, 0, 16
+ DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+ ; --------------------------------------------------------------------------
+ ; part of stage 4
+ ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+ ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+ ;step1[2] = dct_const_round_shift(temp1);
+ ;step1[3] = dct_const_round_shift(temp2);
+ LOAD_FROM_TRANSPOSED 16, 8, 24
+ DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+ ; --------------------------------------------------------------------------
+ ; part of stage 5
+ ;step2[0] = step1b[0][i] + step1b[3][i];
+ ;step2[1] = step1b[1][i] + step1b[2][i];
+ ;step2[2] = step1b[1][i] - step1b[2][i];
+ ;step2[3] = step1b[0][i] - step1b[3][i];
+ vadd.s16 q4, q7, q6
+ vsub.s16 q7, q7, q6
+ vsub.s16 q6, q5, q14
+ vadd.s16 q5, q5, q14
+ ; --------------------------------------------------------------------------
+ ; combine 0-3,4-7
+ ; --------------------------------------------------------------------------
+ ; part of stage 6
+ ;step3[0] = step1b[0][i] + step1b[7][i];
+ ;step3[1] = step1b[1][i] + step1b[6][i];
+ ;step3[2] = step1b[2][i] + step1b[5][i];
+ ;step3[3] = step1b[3][i] + step1b[4][i];
+ vadd.s16 q8, q4, q2
+ vadd.s16 q9, q5, q3
+ vadd.s16 q10, q6, q1
+ vadd.s16 q11, q7, q0
+ ;step3[4] = step1b[3][i] - step1b[4][i];
+ ;step3[5] = step1b[2][i] - step1b[5][i];
+ ;step3[6] = step1b[1][i] - step1b[6][i];
+ ;step3[7] = step1b[0][i] - step1b[7][i];
+ vsub.s16 q12, q7, q0
+ vsub.s16 q13, q6, q1
+ vsub.s16 q14, q5, q3
+ vsub.s16 q15, q4, q2
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[0] = step1b[0][i] + step1b[15][i];
+ ;step1[1] = step1b[1][i] + step1b[14][i];
+ ;step1[14] = step1b[1][i] - step1b[14][i];
+ ;step1[15] = step1b[0][i] - step1b[15][i];
+ LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+ vadd.s16 q2, q8, q1
+ vadd.s16 q3, q9, q0
+ vsub.s16 q4, q9, q0
+ vsub.s16 q5, q8, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+ ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+ ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+ ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+ LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+ vadd.s16 q8, q4, q1
+ vadd.s16 q9, q5, q0
+ vsub.s16 q6, q5, q0
+ vsub.s16 q7, q4, q1
+ STORE_IN_OUTPUT 17, 17, 16, q7, q6
+ STORE_IN_OUTPUT 16, 15, 14, q9, q8
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+ ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+ ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+ ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+ LOAD_FROM_OUTPUT 14, 30, 31, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 31, 31, 30, q7, q6
+ STORE_IN_OUTPUT 30, 0, 1, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[2] = step1b[2][i] + step1b[13][i];
+ ;step1[3] = step1b[3][i] + step1b[12][i];
+ ;step1[12] = step1b[3][i] - step1b[12][i];
+ ;step1[13] = step1b[2][i] - step1b[13][i];
+ LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+ vadd.s16 q2, q10, q1
+ vadd.s16 q3, q11, q0
+ vsub.s16 q4, q11, q0
+ vsub.s16 q5, q10, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+ ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+ ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+ ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+ LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+ vadd.s16 q6, q4, q1
+ vadd.s16 q7, q5, q0
+ vsub.s16 q8, q5, q0
+ vsub.s16 q9, q4, q1
+ STORE_IN_OUTPUT 19, 19, 18, q9, q8
+ STORE_IN_OUTPUT 18, 13, 12, q7, q6
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+ ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+ ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+ ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+ LOAD_FROM_OUTPUT 12, 28, 29, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 29, 29, 28, q7, q6
+ STORE_IN_OUTPUT 28, 2, 3, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[4] = step1b[4][i] + step1b[11][i];
+ ;step1[5] = step1b[5][i] + step1b[10][i];
+ ;step1[10] = step1b[5][i] - step1b[10][i];
+ ;step1[11] = step1b[4][i] - step1b[11][i];
+ LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+ vadd.s16 q2, q12, q1
+ vadd.s16 q3, q13, q0
+ vsub.s16 q4, q13, q0
+ vsub.s16 q5, q12, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+ ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+ ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+ ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+ LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+ vadd.s16 q6, q4, q1
+ vadd.s16 q7, q5, q0
+ vsub.s16 q8, q5, q0
+ vsub.s16 q9, q4, q1
+ STORE_IN_OUTPUT 21, 21, 20, q9, q8
+ STORE_IN_OUTPUT 20, 11, 10, q7, q6
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+ ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+ ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+ ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+ LOAD_FROM_OUTPUT 10, 26, 27, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 27, 27, 26, q7, q6
+ STORE_IN_OUTPUT 26, 4, 5, q4, q5
+ ; --------------------------------------------------------------------------
+ ; part of stage 7
+ ;step1[6] = step1b[6][i] + step1b[9][i];
+ ;step1[7] = step1b[7][i] + step1b[8][i];
+ ;step1[8] = step1b[7][i] - step1b[8][i];
+ ;step1[9] = step1b[6][i] - step1b[9][i];
+ LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+ vadd.s16 q2, q14, q1
+ vadd.s16 q3, q15, q0
+ vsub.s16 q4, q15, q0
+ vsub.s16 q5, q14, q1
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+ ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+ ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+ ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+ LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+ vadd.s16 q6, q4, q1
+ vadd.s16 q7, q5, q0
+ vsub.s16 q8, q5, q0
+ vsub.s16 q9, q4, q1
+ STORE_IN_OUTPUT 23, 23, 22, q9, q8
+ STORE_IN_OUTPUT 22, 9, 8, q7, q6
+ ; --------------------------------------------------------------------------
+ ; part of final stage
+ ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+ ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+ ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+ ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+ LOAD_FROM_OUTPUT 8, 24, 25, q0, q1
+ vadd.s16 q4, q2, q1
+ vadd.s16 q5, q3, q0
+ vsub.s16 q6, q3, q0
+ vsub.s16 q7, q2, q1
+ STORE_IN_OUTPUT 25, 25, 24, q7, q6
+ STORE_IN_OUTPUT 24, 6, 7, q4, q5
+ ; --------------------------------------------------------------------------
+
+ ; TODO(cd) get rid of these push/pop by properly adjusting register
+ ; content at end of loop
+ pop {r2}
+ pop {r1}
+ pop {r0}
+ add r1, r1, #8*2
+ add r2, r2, #8*32*2
+
+ ; bands loop processing
+ add r4, r4, #1
+ cmp r4, #3
+ BLE idct32_bands_loop
+
+ pop {r4}
+ bx lr
+ ENDP ; |idct32_transpose_and_transform|
+
+;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
+; Adds the rounded (>> 6) 32x32 block out[] to dest[] and clips to 8 bits.
+; r0 uint8_t *dest
+; r1 int16_t *out
+; r2 int dest_stride
+
+|idct32_combine_add| PROC
+
+ mov r12, r0 ; dest pointer used for stores
+ sub r2, r2, #32 ; adjust the stride (remove the post-increments)
+ mov r3, #0 ; initialize loop counter
+
+idct32_combine_add_loop
+ ; load out[j * 32 + 0-31]
+ vld1.s16 {q12}, [r1]!
+ vld1.s16 {q13}, [r1]!
+ vld1.s16 {q14}, [r1]!
+ vld1.s16 {q15}, [r1]!
+ ; load dest[j * dest_stride + 0-31] into q0-q1 (avoids callee-saved d8-d15)
+ vld1.s16 {q0}, [r0]!
+ vld1.s16 {q1}, [r0]!
+ ; ROUND_POWER_OF_TWO
+ vrshr.s16 q12, q12, #6
+ vrshr.s16 q13, q13, #6
+ vrshr.s16 q14, q14, #6
+ vrshr.s16 q15, q15, #6
+ ; add to dest[j * dest_stride + 0-31] (8 widened pixels per d register)
+ vaddw.u8 q12, q12, d0
+ vaddw.u8 q13, q13, d1
+ vaddw.u8 q14, q14, d2
+ vaddw.u8 q15, q15, d3
+ ; clip pixel
+ vqmovun.s16 d0, q12
+ vqmovun.s16 d1, q13
+ vqmovun.s16 d2, q14
+ vqmovun.s16 d3, q15
+ ; store back into dest[j * dest_stride + 0-31]
+ vst1.16 {q0}, [r12]!
+ vst1.16 {q1}, [r12]!
+ ; increment pointers by adjusted stride (not necessary for r1/out)
+ add r0, r0, r2
+ add r12, r12, r2
+ ; loop processing (32 rows: r3 counts 0..31)
+ add r3, r3, #1
+ cmp r3, #31
+ BLE idct32_combine_add_loop
+
+ bx lr
+ ENDP ; |idct32_combine_add|
+
+ END
diff --git a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
new file mode 100644
index 0000000..53ccee0
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -0,0 +1,696 @@
+;
+; Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+ EXPORT |vp9_short_iht8x8_add_neon|
+ ARM
+ REQUIRE8
+ PRESERVE8
+
+ AREA ||.text||, CODE, READONLY, ALIGN=2
+
+ ; Load the IADST constants into r0 - r10 and r12 (r11 is not written).
+ MACRO
+ GENERATE_IADST_CONSTANTS
+ ; generate cospi_2_64 = 16305 (0x3fb1, built as 0x3f00 + 0xb1)
+ mov r0, #0x3f00
+ add r0, #0xb1
+
+ ; generate cospi_30_64 = 1606 (0x0646)
+ mov r1, #0x600
+ add r1, #0x46
+
+ ; generate cospi_10_64 = 14449 (0x3871)
+ mov r2, #0x3800
+ add r2, #0x71
+
+ ; generate cospi_22_64 = 7723 (0x1e2b)
+ mov r3, #0x1e00
+ add r3, #0x2b
+
+ ; generate cospi_18_64 = 10394 (0x289a)
+ mov r4, #0x2800
+ add r4, #0x9a
+
+ ; generate cospi_14_64 = 12665 (0x3179)
+ mov r5, #0x3100
+ add r5, #0x79
+
+ ; generate cospi_26_64 = 4756 (0x1294)
+ mov r6, #0x1200
+ add r6, #0x94
+
+ ; generate cospi_6_64 = 15679 (0x3d3f)
+ mov r7, #0x3d00
+ add r7, #0x3f
+
+ ; generate cospi_8_64 = 15137 (0x3b21)
+ mov r8, #0x3b00
+ add r8, #0x21
+
+ ; generate cospi_24_64 = 6270 (0x187e)
+ mov r9, #0x1800
+ add r9, #0x7e
+
+ ; generate 0 (IADST8X8_1D subtracts from it to negate outputs)
+ mov r10, #0
+
+ ; generate cospi_16_64 = 11585 (0x2d41)
+ mov r12, #0x2d00
+ add r12, #0x41
+ MEND
+
+ ; Load the IDCT constants into r3 - r9 (the registers IDCT8x8_1D reads).
+ MACRO
+ GENERATE_IDCT_CONSTANTS
+ ; generate cospi_28_64 = 3196 (0x0c7c, built as 0x0c00 + 0x7c)
+ mov r3, #0x0c00
+ add r3, #0x7c
+
+ ; generate cospi_4_64 = 16069 (0x3ec5)
+ mov r4, #0x3e00
+ add r4, #0xc5
+
+ ; generate cospi_12_64 = 13623 (0x3537)
+ mov r5, #0x3500
+ add r5, #0x37
+
+ ; generate cospi_20_64 = 9102 (0x238e)
+ mov r6, #0x2300
+ add r6, #0x8e
+
+ ; generate cospi_16_64 = 11585 (0x2d41)
+ mov r7, #0x2d00
+ add r7, #0x41
+
+ ; generate cospi_24_64 = 6270 (0x187e)
+ mov r8, #0x1800
+ add r8, #0x7e
+
+ ; generate cospi_8_64 = 15137 (0x3b21)
+ mov r9, #0x3b00
+ add r9, #0x21
+ MEND
+
+ ; Transpose, in place, an 8x8 matrix of 16-bit values held in q8-q15.
+ MACRO
+ TRANSPOSE8X8
+ vswp d17, d24 ; swap high half of q8 with low half of q12
+ vswp d23, d30 ; swap high half of q11 with low half of q15
+ vswp d21, d28 ; swap high half of q10 with low half of q14
+ vswp d19, d26 ; swap high half of q9 with low half of q13
+ vtrn.32 q8, q10 ; interleave 32-bit pairs between registers two apart
+ vtrn.32 q9, q11
+ vtrn.32 q12, q14
+ vtrn.32 q13, q15
+ vtrn.16 q8, q9 ; interleave 16-bit elements between adjacent registers
+ vtrn.16 q10, q11
+ vtrn.16 q12, q13
+ vtrn.16 q14, q15
+ MEND
+
+ ; Parallel 1D IDCT on all the columns of an 8x8 matrix of 16-bit values held
+ ; in q8-q15 (q8 = input[0] ... q15 = input[7]). Reads the IDCT constants from
+ ; r3 - r9 and writes the result back to q8-q15. q0-q7 are used as scratch, so
+ ; whatever the caller had in d8-d15 (q4-q7) is overwritten.
+ MACRO
+ IDCT8x8_1D
+ ; stage 1
+ vdup.16 d0, r3 ; duplicate cospi_28_64
+ vdup.16 d1, r4 ; duplicate cospi_4_64
+ vdup.16 d2, r5 ; duplicate cospi_12_64
+ vdup.16 d3, r6 ; duplicate cospi_20_64
+
+ ; input[1] * cospi_28_64
+ vmull.s16 q2, d18, d0
+ vmull.s16 q3, d19, d0
+
+ ; input[5] * cospi_12_64
+ vmull.s16 q5, d26, d2
+ vmull.s16 q6, d27, d2
+
+ ; input[1]*cospi_28_64-input[7]*cospi_4_64
+ vmlsl.s16 q2, d30, d1
+ vmlsl.s16 q3, d31, d1
+
+ ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+ vmlsl.s16 q5, d22, d3
+ vmlsl.s16 q6, d23, d3
+
+ ; step1[4] (q4) = dct_const_round_shift(input[1]*cospi_28_64-input[7]*cospi_4_64)
+ vqrshrn.s32 d8, q2, #14 ; >> 14
+ vqrshrn.s32 d9, q3, #14 ; >> 14
+
+ ; step1[5] (q5) = dct_const_round_shift(input[5]*cospi_12_64-input[3]*cospi_20_64)
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; input[1] * cospi_4_64
+ vmull.s16 q2, d18, d1
+ vmull.s16 q3, d19, d1
+
+ ; input[5] * cospi_20_64
+ vmull.s16 q9, d26, d3 ; q9 (input[1]) is no longer needed and is reused
+ vmull.s16 q13, d27, d3 ; reads d27 before q13 (input[5]) is overwritten
+
+ ; input[1]*cospi_4_64+input[7]*cospi_28_64
+ vmlal.s16 q2, d30, d0
+ vmlal.s16 q3, d31, d0
+
+ ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+ vmlal.s16 q9, d22, d2
+ vmlal.s16 q13, d23, d2
+
+ ; step1[7] (q7) = dct_const_round_shift(input[1]*cospi_4_64+input[7]*cospi_28_64)
+ vqrshrn.s32 d14, q2, #14 ; >> 14
+ vqrshrn.s32 d15, q3, #14 ; >> 14
+
+ ; stage 2 & stage 3 - even half
+ vdup.16 d0, r7 ; duplicate cospi_16_64
+
+ ; step1[6] (q6) = dct_const_round_shift(input[5]*cospi_20_64+input[3]*cospi_12_64)
+ vqrshrn.s32 d12, q9, #14 ; >> 14
+ vqrshrn.s32 d13, q13, #14 ; >> 14
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q2, d16, d0
+ vmull.s16 q3, d17, d0
+
+ ; input[0] * cospi_16_64
+ vmull.s16 q13, d16, d0
+ vmull.s16 q15, d17, d0
+
+ ; (input[0] + input[4]) * cospi_16_64
+ vmlal.s16 q2, d24, d0
+ vmlal.s16 q3, d25, d0
+
+ ; (input[0] - input[4]) * cospi_16_64
+ vmlsl.s16 q13, d24, d0
+ vmlsl.s16 q15, d25, d0
+
+ vdup.16 d0, r8 ; duplicate cospi_24_64
+ vdup.16 d1, r9 ; duplicate cospi_8_64
+
+ ; step[0] (q9) = dct_const_round_shift((input[0] + input[4]) * cospi_16_64)
+ vqrshrn.s32 d18, q2, #14 ; >> 14
+ vqrshrn.s32 d19, q3, #14 ; >> 14
+
+ ; step[1] (q11) = dct_const_round_shift((input[0] - input[4]) * cospi_16_64)
+ vqrshrn.s32 d22, q13, #14 ; >> 14
+ vqrshrn.s32 d23, q15, #14 ; >> 14
+
+ ; input[2] * cospi_24_64
+ vmull.s16 q2, d20, d0
+ vmull.s16 q3, d21, d0
+
+ ; input[2] * cospi_8_64
+ vmull.s16 q8, d20, d1
+ vmull.s16 q12, d21, d1
+
+ ; input[2] * cospi_24_64 - input[6] * cospi_8_64
+ vmlsl.s16 q2, d28, d1
+ vmlsl.s16 q3, d29, d1
+
+ ; input[2] * cospi_8_64 + input[6] * cospi_24_64
+ vmlal.s16 q8, d28, d0
+ vmlal.s16 q12, d29, d0
+
+ ; step[2] (q13) = dct_const_round_shift(input[2]*cospi_24_64-input[6]*cospi_8_64)
+ vqrshrn.s32 d26, q2, #14 ; >> 14
+ vqrshrn.s32 d27, q3, #14 ; >> 14
+
+ ; step[3] (q15) = dct_const_round_shift(input[2]*cospi_8_64+input[6]*cospi_24_64)
+ vqrshrn.s32 d30, q8, #14 ; >> 14
+ vqrshrn.s32 d31, q12, #14 ; >> 14
+
+ vadd.s16 q0, q9, q15 ; output[0] = step[0] + step[3]
+ vadd.s16 q1, q11, q13 ; output[1] = step[1] + step[2]
+ vsub.s16 q2, q11, q13 ; output[2] = step[1] - step[2]
+ vsub.s16 q3, q9, q15 ; output[3] = step[0] - step[3]
+
+ ; stage 3 -odd half
+ vdup.16 d16, r7 ; duplicate cospi_16_64
+
+ ; stage 2 - odd half
+ vsub.s16 q13, q4, q5 ; step2[5] = step1[4] - step1[5]
+ vadd.s16 q4, q4, q5 ; step2[4] = step1[4] + step1[5]
+ vsub.s16 q14, q7, q6 ; step2[6] = -step1[6] + step1[7]
+ vadd.s16 q7, q7, q6 ; step2[7] = step1[6] + step1[7]
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q9, d28, d16
+ vmull.s16 q10, d29, d16
+
+ ; step2[6] * cospi_16_64
+ vmull.s16 q11, d28, d16
+ vmull.s16 q12, d29, d16
+
+ ; (step2[6] - step2[5]) * cospi_16_64
+ vmlsl.s16 q9, d26, d16
+ vmlsl.s16 q10, d27, d16
+
+ ; (step2[5] + step2[6]) * cospi_16_64
+ vmlal.s16 q11, d26, d16
+ vmlal.s16 q12, d27, d16
+
+ ; step1[5] (q5) = dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+ vqrshrn.s32 d10, q9, #14 ; >> 14
+ vqrshrn.s32 d11, q10, #14 ; >> 14
+
+ ; step1[6] (q6) = dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+ vqrshrn.s32 d12, q11, #14 ; >> 14
+ vqrshrn.s32 d13, q12, #14 ; >> 14
+
+ ; stage 4
+ vadd.s16 q8, q0, q7 ; output[0] = step1[0] + step1[7];
+ vadd.s16 q9, q1, q6 ; output[1] = step1[1] + step1[6];
+ vadd.s16 q10, q2, q5 ; output[2] = step1[2] + step1[5];
+ vadd.s16 q11, q3, q4 ; output[3] = step1[3] + step1[4];
+ vsub.s16 q12, q3, q4 ; output[4] = step1[3] - step1[4];
+ vsub.s16 q13, q2, q5 ; output[5] = step1[2] - step1[5];
+ vsub.s16 q14, q1, q6 ; output[6] = step1[1] - step1[6];
+ vsub.s16 q15, q0, q7 ; output[7] = step1[0] - step1[7];
+ MEND
+
+ ; Parallel 1D IADST on all the columns of an 8x8 matrix of 16-bit values held
+ ; in q8-q15. Reads the IADST constants from r0 - r10 and r12 and writes the
+ ; result back to q8-q15. q0 - q7 are used as scratch, so whatever the caller
+ ; had in d8-d15 (q4-q7) is overwritten.
+ MACRO
+ IADST8X8_1D
+ vdup.16 d14, r0 ; duplicate cospi_2_64
+ vdup.16 d15, r1 ; duplicate cospi_30_64
+
+ ; cospi_2_64 * x0 (x0 is read from q15, x1 from q8)
+ vmull.s16 q1, d30, d14
+ vmull.s16 q2, d31, d14
+
+ ; s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+ vmlal.s16 q1, d16, d15
+ vmlal.s16 q2, d17, d15
+
+ ; cospi_30_64 * x0
+ vmull.s16 q3, d30, d15
+ vmull.s16 q4, d31, d15
+
+ ; s1 = cospi_30_64 * x0 - cospi_2_64 * x1
+ vmlsl.s16 q3, d16, d14
+ vmlsl.s16 q4, d17, d14
+
+ vdup.16 d30, r4 ; duplicate cospi_18_64
+ vdup.16 d31, r5 ; duplicate cospi_14_64
+
+ ; cospi_18_64 * x4 (x4 is read from q11, x5 from q12)
+ vmull.s16 q5, d22, d30
+ vmull.s16 q6, d23, d30
+
+ ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+ vmlal.s16 q5, d24, d31
+ vmlal.s16 q6, d25, d31
+
+ ; cospi_14_64 * x4
+ vmull.s16 q7, d22, d31
+ vmull.s16 q8, d23, d31
+
+ ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
+ vmlsl.s16 q7, d24, d30
+ vmlsl.s16 q8, d25, d30
+
+ ; (s0 + s4)
+ vadd.s32 q11, q1, q5
+ vadd.s32 q12, q2, q6
+
+ ; x0 = dct_const_round_shift(s0 + s4);
+ vqrshrn.s32 d22, q11, #14 ; >> 14
+ vqrshrn.s32 d23, q12, #14 ; >> 14
+
+ ; (s0 - s4)
+ vsub.s32 q1, q1, q5
+ vsub.s32 q2, q2, q6
+
+ ; x4 = dct_const_round_shift(s0 - s4);
+ vqrshrn.s32 d2, q1, #14 ; >> 14
+ vqrshrn.s32 d3, q2, #14 ; >> 14
+
+ ; (s1 + s5)
+ vadd.s32 q12, q3, q7
+ vadd.s32 q15, q4, q8
+
+ ; x1 = dct_const_round_shift(s1 + s5);
+ vqrshrn.s32 d24, q12, #14 ; >> 14
+ vqrshrn.s32 d25, q15, #14 ; >> 14
+
+ ; (s1 - s5)
+ vsub.s32 q3, q3, q7
+ vsub.s32 q4, q4, q8
+
+ ; x5 = dct_const_round_shift(s1 - s5);
+ vqrshrn.s32 d6, q3, #14 ; >> 14
+ vqrshrn.s32 d7, q4, #14 ; >> 14
+
+ vdup.16 d30, r2 ; duplicate cospi_10_64
+ vdup.16 d31, r3 ; duplicate cospi_22_64
+
+ ; cospi_10_64 * x2 (x2 is read from q13, x3 from q10)
+ vmull.s16 q4, d26, d30
+ vmull.s16 q5, d27, d30
+
+ ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+ vmlal.s16 q4, d20, d31
+ vmlal.s16 q5, d21, d31
+
+ ; cospi_22_64 * x2
+ vmull.s16 q2, d26, d31
+ vmull.s16 q6, d27, d31
+
+ ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+ vmlsl.s16 q2, d20, d30
+ vmlsl.s16 q6, d21, d30
+
+ vdup.16 d30, r6 ; duplicate cospi_26_64
+ vdup.16 d31, r7 ; duplicate cospi_6_64
+
+ ; cospi_26_64 * x6 (x6 is read from q9, x7 from q14)
+ vmull.s16 q0, d18, d30
+ vmull.s16 q13, d19, d30
+
+ ; s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+ vmlal.s16 q0, d28, d31
+ vmlal.s16 q13, d29, d31
+
+ ; cospi_6_64 * x6
+ vmull.s16 q10, d18, d31
+ vmull.s16 q9, d19, d31
+
+ ; s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+ vmlsl.s16 q10, d28, d30
+ vmlsl.s16 q9, d29, d30
+
+ ; (s3 + s7)
+ vadd.s32 q14, q2, q10
+ vadd.s32 q15, q6, q9
+
+ ; x3 = dct_const_round_shift(s3 + s7);
+ vqrshrn.s32 d28, q14, #14 ; >> 14
+ vqrshrn.s32 d29, q15, #14 ; >> 14
+
+ ; (s3 - s7)
+ vsub.s32 q2, q2, q10
+ vsub.s32 q6, q6, q9
+
+ ; x7 = dct_const_round_shift(s3 - s7);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q6, #14 ; >> 14
+
+ ; (s2 + s6)
+ vadd.s32 q9, q4, q0
+ vadd.s32 q10, q5, q13
+
+ ; x2 = dct_const_round_shift(s2 + s6);
+ vqrshrn.s32 d18, q9, #14 ; >> 14
+ vqrshrn.s32 d19, q10, #14 ; >> 14
+
+ ; (s2 - s6)
+ vsub.s32 q4, q4, q0
+ vsub.s32 q5, q5, q13
+
+ ; x6 = dct_const_round_shift(s2 - s6);
+ vqrshrn.s32 d8, q4, #14 ; >> 14
+ vqrshrn.s32 d9, q5, #14 ; >> 14
+
+ vdup.16 d30, r8 ; duplicate cospi_8_64
+ vdup.16 d31, r9 ; duplicate cospi_24_64
+
+ ; cospi_8_64 * x4
+ vmull.s16 q5, d2, d30
+ vmull.s16 q6, d3, d30
+
+ ; s4 = cospi_8_64 * x4 + cospi_24_64 * x5;
+ vmlal.s16 q5, d6, d31
+ vmlal.s16 q6, d7, d31
+
+ ; cospi_24_64 * x4
+ vmull.s16 q7, d2, d31
+ vmull.s16 q0, d3, d31
+
+ ; s5 = cospi_24_64 * x4 - cospi_8_64 * x5;
+ vmlsl.s16 q7, d6, d30
+ vmlsl.s16 q0, d7, d30
+
+ ; cospi_8_64 * x7
+ vmull.s16 q1, d4, d30
+ vmull.s16 q3, d5, d30
+
+ ; s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
+ vmlsl.s16 q1, d8, d31
+ vmlsl.s16 q3, d9, d31
+
+ ; cospi_24_64 * x7
+ vmull.s16 q10, d4, d31
+ vmull.s16 q2, d5, d31
+
+ ; s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
+ vmlal.s16 q10, d8, d30
+ vmlal.s16 q2, d9, d30
+
+ vadd.s16 q8, q11, q9 ; x0 = s0 + s2; (left in q8 as output[0])
+
+ vsub.s16 q11, q11, q9 ; x2 = s0 - s2;
+
+ vadd.s16 q4, q12, q14 ; x1 = s1 + s3;
+
+ vsub.s16 q12, q12, q14 ; x3 = s1 - s3;
+
+ ; (s4 + s6)
+ vadd.s32 q14, q5, q1
+ vadd.s32 q15, q6, q3
+
+ ; x4 = dct_const_round_shift(s4 + s6);
+ vqrshrn.s32 d18, q14, #14 ; >> 14
+ vqrshrn.s32 d19, q15, #14 ; >> 14
+
+ ; (s4 - s6)
+ vsub.s32 q5, q5, q1
+ vsub.s32 q6, q6, q3
+
+ ; x6 = dct_const_round_shift(s4 - s6);
+ vqrshrn.s32 d10, q5, #14 ; >> 14
+ vqrshrn.s32 d11, q6, #14 ; >> 14
+
+ ; (s5 + s7)
+ vadd.s32 q1, q7, q10
+ vadd.s32 q3, q0, q2
+
+ ; x5 = dct_const_round_shift(s5 + s7); (left in q14 as output[6])
+ vqrshrn.s32 d28, q1, #14 ; >> 14
+ vqrshrn.s32 d29, q3, #14 ; >> 14
+
+ ; (s5 - s7)
+ vsub.s32 q7, q7, q10
+ vsub.s32 q0, q0, q2
+
+ ; x7 = dct_const_round_shift(s5 - s7);
+ vqrshrn.s32 d14, q7, #14 ; >> 14
+ vqrshrn.s32 d15, q0, #14 ; >> 14
+
+ vdup.16 d30, r12 ; duplicate cospi_16_64
+
+ ; cospi_16_64 * x2
+ vmull.s16 q2, d22, d30
+ vmull.s16 q3, d23, d30
+
+ ; cospi_16_64 * x2 + cospi_16_64 * x3;
+ vmlal.s16 q2, d24, d30
+ vmlal.s16 q3, d25, d30
+
+ ; x2 = dct_const_round_shift(s2);
+ vqrshrn.s32 d4, q2, #14 ; >> 14
+ vqrshrn.s32 d5, q3, #14 ; >> 14
+
+ ; cospi_16_64 * x2
+ vmull.s16 q13, d22, d30
+ vmull.s16 q1, d23, d30
+
+ ; cospi_16_64 * x2 - cospi_16_64 * x3;
+ vmlsl.s16 q13, d24, d30
+ vmlsl.s16 q1, d25, d30
+
+ ;x3 = dct_const_round_shift(s3); (left in q12 as output[4])
+ vqrshrn.s32 d24, q13, #14 ; >> 14
+ vqrshrn.s32 d25, q1, #14 ; >> 14
+
+ ; cospi_16_64 * x6
+ vmull.s16 q13, d10, d30
+ vmull.s16 q1, d11, d30
+
+ ; cospi_16_64 * x6 + cospi_16_64 * x7;
+ vmlal.s16 q13, d14, d30
+ vmlal.s16 q1, d15, d30
+
+ ; x6 = dct_const_round_shift(s6); (left in q10 as output[2])
+ vqrshrn.s32 d20, q13, #14 ; >> 14
+ vqrshrn.s32 d21, q1, #14 ; >> 14
+
+ ; cospi_16_64 * x6
+ vmull.s16 q13, d10, d30
+ vmull.s16 q1, d11, d30
+
+ ; cospi_16_64 * x6 - cospi_16_64 * x7;
+ vmlsl.s16 q13, d14, d30
+ vmlsl.s16 q1, d15, d30
+
+ ;x7 = dct_const_round_shift(s7);
+ vqrshrn.s32 d12, q13, #14 ; >> 14
+ vqrshrn.s32 d13, q1, #14 ; >> 14
+
+ vdup.16 q5, r10 ; duplicate 0
+
+ vsub.s16 q9, q5, q9 ; output[1] = -x4;
+ vsub.s16 q13, q5, q6 ; output[5] = -x7;
+ vsub.s16 q11, q5, q2 ; output[3] = -x2;
+ vsub.s16 q15, q5, q4 ; output[7] = -x1;
+ MEND
+
+
+ AREA Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+; int dest_stride, int tx_type)
+; Inverse hybrid 8x8 transform of input, rounded, added to dest and clipped.
+; r0 int16_t *input
+; r1 uint8_t *dest
+; r2 int dest_stride
+; r3 int tx_type
+; Handles tx_type 1, 2 and 3 only; any value other than 2 or 3 takes the type 1 path.
+|vp9_short_iht8x8_add_neon| PROC
+
+ ; load the 8x8 inputs into q8-q15
+ vld1.s16 {q8,q9}, [r0]!
+ vld1.s16 {q10,q11}, [r0]!
+ vld1.s16 {q12,q13}, [r0]!
+ vld1.s16 {q14,q15}, [r0]!
+ ; save the core registers, then d8-d15: AAPCS callee-saved, used as scratch below
+ push {r0-r10}
+ vpush {d8-d15}
+ ; transpose the input data
+ TRANSPOSE8X8
+
+ ; decide the type of transform
+ cmp r3, #2
+ beq idct_iadst
+ cmp r3, #3
+ beq iadst_iadst
+
+iadst_idct
+ ; generate IDCT constants
+ GENERATE_IDCT_CONSTANTS
+
+ ; first transform rows
+ IDCT8x8_1D
+
+ ; transpose the matrix
+ TRANSPOSE8X8
+
+ ; generate IADST constants
+ GENERATE_IADST_CONSTANTS
+
+ ; then transform columns
+ IADST8X8_1D
+
+ b end_vp9_short_iht8x8_add_neon
+
+idct_iadst
+ ; generate IADST constants
+ GENERATE_IADST_CONSTANTS
+
+ ; first transform rows
+ IADST8X8_1D
+
+ ; transpose the matrix
+ TRANSPOSE8X8
+
+ ; generate IDCT constants
+ GENERATE_IDCT_CONSTANTS
+
+ ; then transform columns
+ IDCT8x8_1D
+
+ b end_vp9_short_iht8x8_add_neon
+
+iadst_iadst
+ ; generate IADST constants
+ GENERATE_IADST_CONSTANTS
+
+ ; first transform rows
+ IADST8X8_1D
+
+ ; transpose the matrix
+ TRANSPOSE8X8
+
+ ; then transform columns (the IADST constants are still in the core registers)
+ IADST8X8_1D
+
+end_vp9_short_iht8x8_add_neon
+ vpop {d8-d15} ; restore in the reverse order of the saves
+ pop {r0-r10}
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+ vrshr.s16 q8, q8, #5
+ vrshr.s16 q9, q9, #5
+ vrshr.s16 q10, q10, #5
+ vrshr.s16 q11, q11, #5
+ vrshr.s16 q12, q12, #5
+ vrshr.s16 q13, q13, #5
+ vrshr.s16 q14, q14, #5
+ vrshr.s16 q15, q15, #5
+
+ ; save dest pointer (r1 walks the loads, r0 walks the stores)
+ mov r0, r1
+
+ ; load destination data
+ vld1.64 {d0}, [r1], r2
+ vld1.64 {d1}, [r1], r2
+ vld1.64 {d2}, [r1], r2
+ vld1.64 {d3}, [r1], r2
+ vld1.64 {d4}, [r1], r2
+ vld1.64 {d5}, [r1], r2
+ vld1.64 {d6}, [r1], r2
+ vld1.64 {d7}, [r1]
+
+ ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+ vaddw.u8 q8, q8, d0
+ vaddw.u8 q9, q9, d1
+ vaddw.u8 q10, q10, d2
+ vaddw.u8 q11, q11, d3
+ vaddw.u8 q12, q12, d4
+ vaddw.u8 q13, q13, d5
+ vaddw.u8 q14, q14, d6
+ vaddw.u8 q15, q15, d7
+
+ ; clip_pixel
+ vqmovun.s16 d0, q8
+ vqmovun.s16 d1, q9
+ vqmovun.s16 d2, q10
+ vqmovun.s16 d3, q11
+ vqmovun.s16 d4, q12
+ vqmovun.s16 d5, q13
+ vqmovun.s16 d6, q14
+ vqmovun.s16 d7, q15
+
+ ; store the data
+ vst1.64 {d0}, [r0], r2
+ vst1.64 {d1}, [r0], r2
+ vst1.64 {d2}, [r0], r2
+ vst1.64 {d3}, [r0], r2
+ vst1.64 {d4}, [r0], r2
+ vst1.64 {d5}, [r0], r2
+ vst1.64 {d6}, [r0], r2
+ vst1.64 {d7}, [r0], r2
+ bx lr
+ ENDP ; |vp9_short_iht8x8_add_neon|
+
+ END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index f5eeb2c..042afbb 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -319,7 +319,7 @@
specialize vp9_short_idct10_16x16_add sse2 neon
prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_add sse2
+specialize vp9_short_idct32x32_add sse2 neon
prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_32x32
@@ -328,7 +328,7 @@
specialize vp9_short_iht4x4_add sse2 neon
prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2
+specialize vp9_short_iht8x8_add sse2 neon
prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
specialize vp9_short_iht16x16_add sse2
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 2aa4188..54e69fd 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -16,8 +16,28 @@
#include "vp9/encoder/vp9_onyx_int.h"
#include "vp9/common/vp9_onyxc_int.h"
+typedef enum {
+ RD_DC_PRED = DC_PRED,  // entries down to RD_NEWMV reuse the value of the same-named mode
+ RD_V_PRED = V_PRED,
+ RD_H_PRED = H_PRED,
+ RD_D45_PRED = D45_PRED,
+ RD_D135_PRED = D135_PRED,
+ RD_D117_PRED = D117_PRED,
+ RD_D153_PRED = D153_PRED,
+ RD_D207_PRED = D207_PRED,
+ RD_D63_PRED = D63_PRED,
+ RD_TM_PRED = TM_PRED,
+ RD_NEARESTMV = NEARESTMV,
+ RD_NEARMV = NEARMV,
+ RD_ZEROMV = ZEROMV,
+ RD_NEWMV = NEWMV,
+ RD_I4X4_PRED,  // no initializer: takes NEWMV + 1
+ RD_SPLITMV,  // NEWMV + 2
+ RD_MODE_COUNT  // NEWMV + 3; keep last
+} RD_PREDICTION_MODE;
+
typedef struct {
- MB_PREDICTION_MODE mode;
+ RD_PREDICTION_MODE mode;
MV_REFERENCE_FRAME ref_frame;
MV_REFERENCE_FRAME second_ref_frame;
} MODE_DEFINITION;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 07850d4..cbc8d46 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -48,58 +48,53 @@
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-#define I4X4_PRED 0x8000
-#define SPLITMV 0x10000
-
#define LAST_FRAME_MODE_MASK 0xFFDADCD60
#define GOLDEN_FRAME_MODE_MASK 0xFFB5A3BB0
#define ALT_REF_MODE_MASK 0xFF8C648D0
+// RD search order: one {mode, ref_frame, second_ref_frame} triple per slot.
+// NOTE(review): the *_MODE_MASK constants above look like per-index bitmasks
+// over this table, so entries must keep the same positions they had before
+// the RD_ rename -- confirm against the code that consumes the masks.
const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
- {NEARESTMV, LAST_FRAME, NONE},
- {NEARESTMV, ALTREF_FRAME, NONE},
- {NEARESTMV, GOLDEN_FRAME, NONE},
+ {RD_NEARESTMV, LAST_FRAME, NONE},
+ {RD_NEARESTMV, ALTREF_FRAME, NONE},
+ {RD_NEARESTMV, GOLDEN_FRAME, NONE},
- {DC_PRED, INTRA_FRAME, NONE},
+ {RD_DC_PRED, INTRA_FRAME, NONE},
- {NEWMV, LAST_FRAME, NONE},
- {NEWMV, ALTREF_FRAME, NONE},
- {NEWMV, GOLDEN_FRAME, NONE},
+ {RD_NEWMV, LAST_FRAME, NONE},
+ {RD_NEWMV, ALTREF_FRAME, NONE},
+ {RD_NEWMV, GOLDEN_FRAME, NONE},
- {NEARMV, LAST_FRAME, NONE},
- {NEARMV, ALTREF_FRAME, NONE},
- {NEARESTMV, LAST_FRAME, ALTREF_FRAME},
- {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {RD_NEARMV, LAST_FRAME, NONE},
+ {RD_NEARMV, ALTREF_FRAME, NONE},
+ {RD_NEARESTMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
- {TM_PRED, INTRA_FRAME, NONE},
+ {RD_TM_PRED, INTRA_FRAME, NONE},
- {NEARMV, LAST_FRAME, ALTREF_FRAME},
- {NEWMV, LAST_FRAME, ALTREF_FRAME},
- {NEARMV, GOLDEN_FRAME, NONE},
- {NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
- {NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {RD_NEARMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_NEWMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_NEARMV, GOLDEN_FRAME, NONE},
+ {RD_NEARMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {RD_NEWMV, GOLDEN_FRAME, ALTREF_FRAME},
- {SPLITMV, LAST_FRAME, NONE},
- {SPLITMV, GOLDEN_FRAME, NONE},
- {SPLITMV, ALTREF_FRAME, NONE},
- {SPLITMV, LAST_FRAME, ALTREF_FRAME},
- {SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {RD_SPLITMV, LAST_FRAME, NONE},
+ {RD_SPLITMV, GOLDEN_FRAME, NONE},
+ {RD_SPLITMV, ALTREF_FRAME, NONE},
+ {RD_SPLITMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_SPLITMV, GOLDEN_FRAME, ALTREF_FRAME},
- {ZEROMV, LAST_FRAME, NONE},
- {ZEROMV, GOLDEN_FRAME, NONE},
- {ZEROMV, ALTREF_FRAME, NONE},
- {ZEROMV, LAST_FRAME, ALTREF_FRAME},
- {ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
+ {RD_ZEROMV, LAST_FRAME, NONE},
+ {RD_ZEROMV, GOLDEN_FRAME, NONE},
+ {RD_ZEROMV, ALTREF_FRAME, NONE},
+ {RD_ZEROMV, LAST_FRAME, ALTREF_FRAME},
+ {RD_ZEROMV, GOLDEN_FRAME, ALTREF_FRAME},
- {I4X4_PRED, INTRA_FRAME, NONE},
- {H_PRED, INTRA_FRAME, NONE},
- {V_PRED, INTRA_FRAME, NONE},
- {D135_PRED, INTRA_FRAME, NONE},
- {D207_PRED, INTRA_FRAME, NONE},
- {D153_PRED, INTRA_FRAME, NONE},
- {D63_PRED, INTRA_FRAME, NONE},
- {D117_PRED, INTRA_FRAME, NONE},
- {D45_PRED, INTRA_FRAME, NONE},
+ {RD_I4X4_PRED, INTRA_FRAME, NONE},
+ {RD_H_PRED, INTRA_FRAME, NONE},
+ {RD_V_PRED, INTRA_FRAME, NONE},
+ {RD_D135_PRED, INTRA_FRAME, NONE},
+ {RD_D207_PRED, INTRA_FRAME, NONE},
+ {RD_D153_PRED, INTRA_FRAME, NONE},
+ {RD_D63_PRED, INTRA_FRAME, NONE},
+ {RD_D117_PRED, INTRA_FRAME, NONE},
+ {RD_D45_PRED, INTRA_FRAME, NONE},
};
// The baseline rd thresholds for breaking out of the rd loop for
@@ -163,6 +158,15 @@
return (11 * q * q) >> 2;
}
+// Converts an RD-loop mode to the MB_PREDICTION_MODE with the same numeric
+// value. RD_I4X4_PRED and RD_SPLITMV cannot be converted: they trip an
+// assert, and when asserts are compiled out MB_MODE_COUNT is returned.
+static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
+  switch (rd_mode) {
+    case RD_I4X4_PRED:
+    case RD_SPLITMV:
+      assert(!"Invalid rd_mode");
+      return MB_MODE_COUNT;
+    default:
+      break;
+  }
+
+  // Every remaining RD mode must fall inside the MB_PREDICTION_MODE range.
+  assert((int)rd_mode < (int)MB_MODE_COUNT);
+  return (MB_PREDICTION_MODE)rd_mode;
+}
+
+
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
@@ -3057,7 +3061,7 @@
MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
const struct segmentation *seg = &cm->seg;
const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
- MB_PREDICTION_MODE this_mode;
+ RD_PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
int comp_pred, i;
@@ -3285,16 +3289,23 @@
// SPLITMV.
if (ref_frame > 0 &&
vp9_is_scaled(&scale_factor[ref_frame]) &&
- this_mode == SPLITMV)
+ this_mode == RD_SPLITMV)
continue;
if (second_ref_frame > 0 &&
vp9_is_scaled(&scale_factor[second_ref_frame]) &&
- this_mode == SPLITMV)
+ this_mode == RD_SPLITMV)
+ continue;
+
+ if (bsize >= BLOCK_8X8 &&
+ (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
+ continue;
+
+ if (bsize < BLOCK_8X8 &&
+ !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
continue;
set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
- mbmi->mode = this_mode;
mbmi->uv_mode = DC_PRED;
// Evaluate all sub-pel filters irrespective of whether we can use
@@ -3302,13 +3313,6 @@
mbmi->interp_filter = cm->mcomp_filter_type;
vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
- if (bsize >= BLOCK_8X8 &&
- (this_mode == I4X4_PRED || this_mode == SPLITMV))
- continue;
- if (bsize < BLOCK_8X8 &&
- !(this_mode == I4X4_PRED || this_mode == SPLITMV))
- continue;
-
if (comp_pred) {
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
continue;
@@ -3341,7 +3345,7 @@
// If the segment skip feature is enabled....
// then do nothing if the current mode is not allowed..
} else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
- (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
+ (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
continue;
// Disable this drop out case if the ref frame
// segment level feature is enabled for this segment. This is to
@@ -3353,11 +3357,11 @@
// an unfiltered alternative. We allow near/nearest as well
// because they may result in zero-zero MVs but be cheaper.
if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
- if ((this_mode != ZEROMV &&
- !(this_mode == NEARMV &&
- frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
- !(this_mode == NEARESTMV &&
- frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+ if ((this_mode != RD_ZEROMV &&
+ !(this_mode == RD_NEARMV &&
+ frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
+ !(this_mode == RD_NEARESTMV &&
+ frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
ref_frame != ALTREF_FRAME) {
continue;
}
@@ -3369,7 +3373,7 @@
// a representative block in the boundary ( first ) and then implement a
// function that does sads when inside the border..
if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
- this_mode == NEWMV) {
+ this_mode == RD_NEWMV) {
continue;
}
@@ -3379,7 +3383,7 @@
cpi->mode_test_hits[bsize]++;
#endif
- if (this_mode == I4X4_PRED) {
+ if (this_mode == RD_I4X4_PRED) {
int rate;
/*
@@ -3388,7 +3392,7 @@
continue;
*/
- // I4X4_PRED is only considered for block sizes less than 8x8.
+ // RD_I4X4_PRED is only considered for block sizes less than 8x8.
mbmi->tx_size = TX_4X4;
if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
&distortion_y, best_rd) >= best_rd)
@@ -3420,20 +3424,22 @@
64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
};
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
- this_mode != DC_PRED &&
+ this_mode != RD_DC_PRED &&
x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
continue;
// Only search the oblique modes if the best so far is
// one of the neighboring directional modes
if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
- (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+ (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
continue;
}
+ mbmi->mode = rd_mode_to_mode(this_mode);
if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
if (conditional_skipintra(mbmi->mode, best_intra_mode))
continue;
}
+
super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
bsize, tx_cache, best_rd);
@@ -3454,10 +3460,10 @@
mbmi->uv_mode = mode_uv[uv_tx];
rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
- if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+ if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
rate2 += intra_cost_penalty;
distortion2 = distortion_y + distortion_uv;
- } else if (this_mode == SPLITMV) {
+ } else if (this_mode == RD_SPLITMV) {
const int is_comp_pred = second_ref_frame > 0;
int rate;
int64_t distortion;
@@ -3638,6 +3644,7 @@
tx_cache[i] = tx_cache[ONLY_4X4];
}
} else {
+ mbmi->mode = rd_mode_to_mode(this_mode);
compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
this_rd = handle_inter_mode(cpi, x, bsize,
tx_cache,
@@ -3745,7 +3752,7 @@
best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
}
- if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+ if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
// Store the respective mode distortions for later use.
if (mode_distortions[this_mode] == -1
|| distortion2 < mode_distortions[this_mode]) {
@@ -3777,7 +3784,7 @@
best_skip2 = this_skip2;
best_partition = *x->partition_info;
- if (this_mode == I4X4_PRED || this_mode == SPLITMV)
+ if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
for (i = 0; i < 4; i++)
best_bmodes[i] = xd->mode_info_context->bmi[i];
@@ -3847,7 +3854,7 @@
/* keep record of best txfm size */
if (bsize < BLOCK_32X32) {
if (bsize < BLOCK_16X16) {
- if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+ if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
}
@@ -3856,7 +3863,7 @@
if (!mode_excluded && this_rd != INT64_MAX) {
for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
int64_t adj_rd = INT64_MAX;
- if (this_mode != I4X4_PRED) {
+ if (this_mode != RD_I4X4_PRED) {
adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
} else {
adj_rd = this_rd;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 866d397..7f2523f 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -93,6 +93,7 @@
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
@@ -103,7 +104,9 @@
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f44cd27..48866d2 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -89,6 +89,18 @@
unsigned int fixed_kf_cntr;
};
+// Maps a vpx_ref_frame_type_t to the VP9_REFFRAME flag that is passed to
+// vp9_set_reference_enc() / vp9_copy_reference_enc().
+//
+// An unrecognised value trips the assert; when asserts are compiled out the
+// function falls back to VP9_LAST_FLAG.
+//
+// The return type is deliberately not const-qualified: a qualifier on a
+// by-value return is ignored and only produces -Wignored-qualifiers noise.
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+  // No default label on purpose, so the compiler can still warn about
+  // enumerators that are added later and not handled here.
+  switch (frame) {
+    case VP8_LAST_FRAME:
+      return VP9_LAST_FLAG;
+    case VP8_GOLD_FRAME:
+      return VP9_GOLD_FLAG;
+    case VP8_ALTR_FRAME:
+      return VP9_ALT_FLAG;
+  }
+  assert(!"Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
static vpx_codec_err_t
update_error_state(vpx_codec_alg_priv_t *ctx,
@@ -853,7 +865,8 @@
YV12_BUFFER_CONFIG sd;
image2yuvconfig(&frame->img, &sd);
- vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
+ vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
+ &sd);
return VPX_CODEC_OK;
} else
return VPX_CODEC_INVALID_PARAM;
@@ -871,7 +884,8 @@
YV12_BUFFER_CONFIG sd;
image2yuvconfig(&frame->img, &sd);
- vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd);
+ vp9_copy_reference_enc(ctx->cpi,
+ ref_frame_to_vp9_reframe(frame->frame_type), &sd);
return VPX_CODEC_OK;
} else
return VPX_CODEC_INVALID_PARAM;