Merge "Deleted #include <inttypes.h>"
diff --git a/vp9/common/arm/neon/vp9_idct16x16_neon.c b/vp9/common/arm/neon/vp9_idct16x16_neon.c
index fc38605..3e3e400 100644
--- a/vp9/common/arm/neon/vp9_idct16x16_neon.c
+++ b/vp9/common/arm/neon/vp9_idct16x16_neon.c
@@ -29,8 +29,8 @@
                                                int16_t skip_adding,
                                                uint8_t *dest,
                                                int dest_stride);
-extern void save_registers();
-extern void restore_registers();
+extern void save_neon_registers();
+extern void restore_neon_registers();
 
 
 void vp9_short_idct16x16_add_neon(int16_t *input,
@@ -39,7 +39,7 @@
   int16_t row_idct_output[16*16] = {0};
 
   // save d8-d15 register values.
-  save_registers();
+  save_neon_registers();
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -102,7 +102,7 @@
                                      dest_stride);
 
   // restore d8-d15 register values.
-  restore_registers();
+  restore_neon_registers();
 
   return;
 }
@@ -113,7 +113,7 @@
   int16_t row_idct_output[16*16] = {0};
 
   // save d8-d15 register values.
-  save_registers();
+  save_neon_registers();
 
   /* Parallel idct on the upper 8 rows */
   // First pass processes even elements 0, 2, 4, 6, 8, 10, 12, 14 and save the
@@ -163,7 +163,7 @@
                                      dest_stride);
 
   // restore d8-d15 register values.
-  restore_registers();
+  restore_neon_registers();
 
   return;
 }
diff --git a/vp9/common/arm/neon/vp9_idct32x32_neon.c b/vp9/common/arm/neon/vp9_idct32x32_neon.c
new file mode 100644
index 0000000..ceecd6f
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_idct32x32_neon.c
@@ -0,0 +1,47 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp9/common/vp9_common.h"
+
+// defined in vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
+extern void idct32_transpose_and_transform(int16_t *transpose_buffer,
+                                           int16_t *output, int16_t *input);
+extern void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
+
+
+// defined in vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+extern void save_neon_registers();
+extern void restore_neon_registers();
+
+void vp9_short_idct32x32_add_neon(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  // TODO(cd): move the creation of these buffers within the ASM file
+  // internal buffer that 8 lines are transposed into before being transformed
+  int16_t transpose_buffer[32 * 8];
+  // results of the first pass (transpose and transform rows)
+  int16_t pass1[32 * 32];
+  // results of the second pass (transpose and transform columns)
+  int16_t pass2[32 * 32];
+
+  // save registers we need to preserve
+  save_neon_registers();
+  // process rows
+  idct32_transpose_and_transform(transpose_buffer, pass1, input);
+  // process columns
+  // TODO(cd): do these two steps/passes within the ASM file
+  idct32_transpose_and_transform(transpose_buffer, pass2, pass1);
+  // combine and add to dest
+  // TODO(cd): integrate this within the last storage step of the second pass
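+  // A sketch of the per-pixel behavior this is expected to implement
+  // (clip_pixel() and ROUND_POWER_OF_TWO() as in vp9/common/vp9_common.h):
+  //   dest[j * dest_stride + i] = clip_pixel(dest[j * dest_stride + i] +
+  //       ROUND_POWER_OF_TWO(pass2[j * 32 + i], 6));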
+  idct32_combine_add(dest, pass2, dest_stride);
+  // restore registers we need to preserve
+  restore_neon_registers();
+}
+
+// TODO(cd): Eliminate this file altogether when everything is in ASM file
diff --git a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
index 92f3ceb..7464e80 100644
--- a/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
+++ b/vp9/common/arm/neon/vp9_short_idct16x16_add_neon.asm
@@ -12,8 +12,8 @@
     EXPORT  |vp9_short_idct16x16_add_neon_pass2|
     EXPORT  |vp9_short_idct10_16x16_add_neon_pass1|
     EXPORT  |vp9_short_idct10_16x16_add_neon_pass2|
-    EXPORT  |save_registers|
-    EXPORT  |restore_registers|
+    EXPORT  |save_neon_registers|
+    EXPORT  |restore_neon_registers|
     ARM
     REQUIRE8
     PRESERVE8
@@ -1178,13 +1178,13 @@
     pop             {r3-r9}
     bx              lr
     ENDP  ; |vp9_short_idct10_16x16_add_neon_pass2|
-;void |save_registers|()
-|save_registers| PROC
+;void |save_neon_registers|()
+|save_neon_registers| PROC
     vpush           {d8-d15}
     bx              lr
-    ENDP  ; |save_registers|
+    ENDP  ; |save_neon_registers|
-;void |restore_registers|()
-|restore_registers| PROC
+;void |restore_neon_registers|()
+|restore_neon_registers| PROC
     vpop           {d8-d15}
     bx             lr
-    ENDP  ; |restore_registers|
+    ENDP  ; |restore_neon_registers|
diff --git a/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
new file mode 100644
index 0000000..5c097cc
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_idct32x32_add_neon.asm
@@ -0,0 +1,1013 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+;TODO(cd): adjust these constants to be able to use vqdmulh for faster
+;          dct_const_round_shift(a * b) within butterfly calculations.
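+;
+; cospi_k_64 = round(cos(k * pi / 64) * 2^14), i.e. the DCT cosines in 14-bit
+; fixed point (e.g. cospi_16_64 = round(0.70711 * 16384) = 11585).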
+cospi_1_64  EQU 16364
+cospi_2_64  EQU 16305
+cospi_3_64  EQU 16207
+cospi_4_64  EQU 16069
+cospi_5_64  EQU 15893
+cospi_6_64  EQU 15679
+cospi_7_64  EQU 15426
+cospi_8_64  EQU 15137
+cospi_9_64  EQU 14811
+cospi_10_64 EQU 14449
+cospi_11_64 EQU 14053
+cospi_12_64 EQU 13623
+cospi_13_64 EQU 13160
+cospi_14_64 EQU 12665
+cospi_15_64 EQU 12140
+cospi_16_64 EQU 11585
+cospi_17_64 EQU 11003
+cospi_18_64 EQU 10394
+cospi_19_64 EQU  9760
+cospi_20_64 EQU  9102
+cospi_21_64 EQU  8423
+cospi_22_64 EQU  7723
+cospi_23_64 EQU  7005
+cospi_24_64 EQU  6270
+cospi_25_64 EQU  5520
+cospi_26_64 EQU  4756
+cospi_27_64 EQU  3981
+cospi_28_64 EQU  3196
+cospi_29_64 EQU  2404
+cospi_30_64 EQU  1606
+cospi_31_64 EQU   804
+
+
+    EXPORT  |idct32_transpose_and_transform|
+    EXPORT  |idct32_combine_add|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    AREA     Block, CODE, READONLY
+
+    ; --------------------------------------------------------------------------
+    ; Load from transposed_buffer
+    ;   q13 = transposed_buffer[first_offset]
+    ;   q14 = transposed_buffer[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   transposed_buffer must be passed in. Use 0 for the first use.
+    MACRO
+    LOAD_FROM_TRANSPOSED $prev_offset, $first_offset, $second_offset
+    ; address calculation with proper stride and loading
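+    ;   (offsets count rows of 8 int16_t elements, hence the *8*2 byte scale)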
+    add r0, #($first_offset  - $prev_offset )*8*2
+    vld1.s16        {q14}, [r0]
+    add r0, #($second_offset - $first_offset)*8*2
+    vld1.s16        {q13}, [r0]
+    ; (used) two registers (q14, q13)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Load from output (used as temporary storage)
+    ;   reg1 = output[first_offset]
+    ;   reg2 = output[second_offset]
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output (whether reading or storing) must be passed in. Use 0 for the
+    ;   first use.
+    MACRO
+    LOAD_FROM_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation with proper stride and loading
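+    ;   (offsets count rows of 32 int16_t elements, hence the *32*2 byte scale)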
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vld1.s16        {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vld1.s16        {$reg2}, [r1]
+    ; (used) two registers ($reg1, $reg2)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Store into output (sometimes as temporary storage)
+    ;   output[first_offset] = reg1
+    ;   output[second_offset] = reg2
+    ;   for proper address calculation, the last offset used when manipulating
+    ;   output (whether reading or storing) must be passed in. Use 0 for the
+    ;   first use.
+    MACRO
+    STORE_IN_OUTPUT $prev_offset, $first_offset, $second_offset, $reg1, $reg2
+    ; address calculation with proper stride and storing
+    add r1, #($first_offset  - $prev_offset )*32*2
+    vst1.16 {$reg1}, [r1]
+    add r1, #($second_offset - $first_offset)*32*2
+    vst1.16 {$reg2}, [r1]
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY $regC, $regD, $regA, $regB, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    ; TODO(cd): have special case to re-use constants when they are similar for
+    ;           consecutive butterflies
+    ; TODO(cd): have special case when both constants are the same, do the
+    ;           additions/subtractions before the multiplies.
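+    ; In C terms, with x = {$regC,$regD} and y = {$regA,$regB}, this computes:
+    ;   {$reg1,$reg2} = dct_const_round_shift(x * $first_constant  - y * $second_constant)
+    ;   {$reg3,$reg4} = dct_const_round_shift(x * $second_constant + y * $first_constant)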
+    ; generate the constants
+    ;   generate scalar constants
+    mov             r3,  #$first_constant  & 0xFF00
+    add             r3,  #$first_constant  & 0x00FF
+    mov             r12, #$second_constant & 0xFF00
+    add             r12, #$second_constant & 0x00FF
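+    ;   (a mov/add pair is used because ARM data-processing immediates are
+    ;    limited to a rotated 8-bit value, so each 16-bit constant is built
+    ;    as high byte plus low byte)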
+    ;   generate vector constants
+    vdup.16         d30, r3
+    vdup.16         d31, r12
+    ; (used) two for inputs (regA-regD), one for constants (q15)
+    ; do some multiplications (ordered for maximum latency hiding)
+    vmull.s16 q8,  $regC, d30
+    vmull.s16 q10, $regA, d31
+    vmull.s16 q9,  $regD, d30
+    vmull.s16 q11, $regB, d31
+    vmull.s16 q12, $regC, d31
+    ; (used) five for intermediate (q8-q12), one for constants (q15)
+    ; do some additions/subtractions (to get back to two registers)
+    vsub.s32  q8, q8, q10
+    vsub.s32  q9, q9, q11
+    ; do more multiplications (ordered for maximum latency hiding)
+    vmull.s16 q10, $regD, d31
+    vmull.s16 q11, $regA, d30
+    vmull.s16 q15, $regB, d30
+    ; (used) six for intermediate (q8-q12, q15)
+    ; do more additions/subtractions
+    vadd.s32  q11, q12, q11
+    vadd.s32  q10, q10, q15
+    ; (used) four for intermediate (q8-q11)
+    ; dct_const_round_shift
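+    ;   (vqrshrn.s32 #14 computes (x + (1 << 13)) >> 14 with saturating
+    ;    narrowing to 16 bits, matching dct_const_round_shift())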
+    vqrshrn.s32 $reg1, q8,  #14
+    vqrshrn.s32 $reg2, q9,  #14
+    vqrshrn.s32 $reg3, q11, #14
+    vqrshrn.s32 $reg4, q10, #14
+    ; (used) two q registers for results (i.e. four d registers)
+    MEND
+    ; --------------------------------------------------------------------------
+    ; Touches q8-q12, q15 (q13-q14 are preserved)
+    ; valid output registers are anything but q8-q11
+    MACRO
+    DO_BUTTERFLY_STD $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    DO_BUTTERFLY d28, d29, d26, d27, $first_constant, $second_constant, $reg1, $reg2, $reg3, $reg4
+    MEND
+    ; --------------------------------------------------------------------------
+
+;void idct32_transpose_and_transform(int16_t *transpose_buffer, int16_t *output, int16_t *input);
+;
+; r0  int16_t *transpose_buffer
+; r1  int16_t *output
+; r2  int16_t *input
+; TODO(cd): have more logical parameter ordering but this issue will disappear
+;           when functions are combined.
+
+|idct32_transpose_and_transform| PROC
+    ; This function does one pass of idct32x32 transform.
+    ;
+    ; This is done by transposing the input and then doing a 1d transform on
+    ; columns. In the first pass, the transposed columns are the original
+    ; rows. In the second pass, after the transposition, the columns are the
+    ; original columns.
+    ; The 1d transform is done by looping over bands of eight columns (the
+    ; idct32_bands loop). For each band, the transform input transposition
+    ; is done on demand, one band of four 8x8 matrices at a time. The four
+    ; matrices are transposed in pairs (the idct32_transpose_pair loop).
+    push {r4}
+    mov r4, #0          ; initialize bands loop counter
+idct32_bands_loop
+    ; TODO(cd) get rid of these push/pop by properly adjusting register
+    ;          content at end of loop
+    push {r0}
+    push {r1}
+    push {r2}
+    mov r3, #0          ; initialize transpose loop counter
+idct32_transpose_pair_loop
+    ; Load two horizontally consecutive 8x8 16-bit data matrices: the first
+    ; one into q8-q15 and the second one into q0-q7. There is a stride of 64
+    ; bytes, adjusted to 32 because of the two post-increments.
+    vld1.s16        {q8},  [r2]!
+    vld1.s16        {q0},  [r2]!
+    add r2, #32
+    vld1.s16        {q9},  [r2]!
+    vld1.s16        {q1},  [r2]!
+    add r2, #32
+    vld1.s16        {q10}, [r2]!
+    vld1.s16        {q2},  [r2]!
+    add r2, #32
+    vld1.s16        {q11}, [r2]!
+    vld1.s16        {q3},  [r2]!
+    add r2, #32
+    vld1.s16        {q12}, [r2]!
+    vld1.s16        {q4},  [r2]!
+    add r2, #32
+    vld1.s16        {q13}, [r2]!
+    vld1.s16        {q5},  [r2]!
+    add r2, #32
+    vld1.s16        {q14}, [r2]!
+    vld1.s16        {q6},  [r2]!
+    add r2, #32
+    vld1.s16        {q15}, [r2]!
+    vld1.s16        {q7},  [r2]!
+
+    ; Transpose the two 8x8 16-bit data matrices.
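+    ; Each 8x8 transpose is done in three steps: the vswp instructions
+    ; exchange the off-diagonal 4x4 quadrants, vtrn.32 transposes the 2x2
+    ; blocks of 32-bit pairs, and vtrn.16 transposes adjacent 16-bit elements.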
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vswp            d1,  d8
+    vswp            d7,  d14
+    vswp            d5,  d12
+    vswp            d3,  d10
+    vtrn.32         q8,  q10
+    vtrn.32         q9,  q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.32         q0,  q2
+    vtrn.32         q1,  q3
+    vtrn.32         q4,  q6
+    vtrn.32         q5,  q7
+    vtrn.16         q8,  q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    vtrn.16         q0,  q1
+    vtrn.16         q2,  q3
+    vtrn.16         q4,  q5
+    vtrn.16         q6,  q7
+
+    ; Store both matrices one after the other. There is a stride of 32 bytes,
+    ; which the post-increments already cover, so no extra adjustment is
+    ; needed.
+    vst1.16        {q8},  [r0]!
+    vst1.16        {q9},  [r0]!
+    vst1.16        {q10}, [r0]!
+    vst1.16        {q11}, [r0]!
+    vst1.16        {q12}, [r0]!
+    vst1.16        {q13}, [r0]!
+    vst1.16        {q14}, [r0]!
+    vst1.16        {q15}, [r0]!
+    vst1.16        {q0},  [r0]!
+    vst1.16        {q1},  [r0]!
+    vst1.16        {q2},  [r0]!
+    vst1.16        {q3},  [r0]!
+    vst1.16        {q4},  [r0]!
+    vst1.16        {q5},  [r0]!
+    vst1.16        {q6},  [r0]!
+    vst1.16        {q7},  [r0]!
+
+    ; increment pointers by adjusted stride (not necessary for r0/out)
+    sub r2,  r2,  #8*32*2-32-16*2
+    ; transpose pair loop processing
+    add r3, r3, #1
+    cmp r3, #1
+    BLE idct32_transpose_pair_loop
+
+    ; restore r0/transpose_buffer to its original value
+    sub r0, r0, #32*8*2
+
+    ; Instead of doing the transforms stage by stage, it is done by loading
+    ; some input values and doing as many stages as possible to minimize the
+    ; storing/loading of intermediate results. To fit within registers, the
+    ; final coefficients are cut into four blocks:
+    ; BLOCK A: 16-19,28-31
+    ; BLOCK B: 20-23,24-27
+    ; BLOCK C: 8-11,12-15
+    ; BLOCK D: 0-3,4-7
+    ; Blocks A and C are straight calculation through the various stages. In
+    ; block B, further calculations are performed using the results from
+    ; block A. In block D, further calculations are performed using the results
+    ; from block C and then the final calculations are done using results from
+    ; block A and B which have been combined at the end of block B.
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK A: 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; generate 16,17,30,31
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[1 * 32] * cospi_31_64 - input[31 * 32] *  cospi_1_64;
+    ;temp2 = input[1 * 32] *  cospi_1_64 + input[31 * 32] * cospi_31_64;
+    ;step1b[16][i] = dct_const_round_shift(temp1);
+    ;step1b[31][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 0, 1, 31
+    DO_BUTTERFLY_STD cospi_31_64, cospi_1_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[17 * 32] * cospi_15_64 - input[15 * 32] * cospi_17_64;
+    ;temp2 = input[17 * 32] * cospi_17_64 + input[15 * 32] * cospi_15_64;
+    ;step1b[17][i] = dct_const_round_shift(temp1);
+    ;step1b[30][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 31, 17, 15
+    DO_BUTTERFLY_STD cospi_15_64, cospi_17_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[16] =  step1b[16][i] + step1b[17][i];
+    ;step2[17] =  step1b[16][i] - step1b[17][i];
+    ;step2[30] = -step1b[30][i] + step1b[31][i];
+    ;step2[31] =  step1b[30][i] + step1b[31][i];
+    vadd.s16  q4, q0, q1
+    vsub.s16  q13, q0, q1
+    vadd.s16  q6, q2, q3
+    vsub.s16  q14, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[30][i] * cospi_28_64 - step1b[17][i] * cospi_4_64;
+    ;temp2 = step1b[30][i] * cospi_4_64  + step1b[17][i] * cospi_28_64;
+    ;step3[17] = dct_const_round_shift(temp1);
+    ;step3[30] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; generate 18,19,28,29
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[9 * 32] * cospi_23_64 - input[23 * 32] * cospi_9_64;
+    ;temp2 = input[9 * 32] *  cospi_9_64 + input[23 * 32] * cospi_23_64;
+    ;step1b[18][i] = dct_const_round_shift(temp1);
+    ;step1b[29][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 15, 9, 23
+    DO_BUTTERFLY_STD cospi_23_64, cospi_9_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[25 * 32] *  cospi_7_64 - input[7 * 32] * cospi_25_64;
+    ;temp2 = input[25 * 32] * cospi_25_64 + input[7 * 32] * cospi_7_64;
+    ;step1b[19][i] = dct_const_round_shift(temp1);
+    ;step1b[28][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 23, 25, 7
+    DO_BUTTERFLY_STD cospi_7_64, cospi_25_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[18] = -step1b[18][i] + step1b[19][i];
+    ;step2[19] =  step1b[18][i] + step1b[19][i];
+    ;step2[28] =  step1b[28][i] + step1b[29][i];
+    ;step2[29] =  step1b[28][i] - step1b[29][i];
+    vsub.s16  q13, q3, q2
+    vadd.s16  q3,  q3, q2
+    vsub.s16  q14, q1, q0
+    vadd.s16  q2,  q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[18][i] * (-cospi_4_64)  - step1b[29][i] * (-cospi_28_64);
+    ;temp2 = step1b[18][i] * (-cospi_28_64) + step1b[29][i] * (-cospi_4_64);
+    ;step3[29] = dct_const_round_shift(temp1);
+    ;step3[18] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_4_64), (-cospi_28_64), d2, d3, d0, d1
+    ; --------------------------------------------------------------------------
+    ; combine 16-19,28-31
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[16] = step1b[16][i] + step1b[19][i];
+    ;step1[17] = step1b[17][i] + step1b[18][i];
+    ;step1[18] = step1b[17][i] - step1b[18][i];
+    ;step1[29] = step1b[30][i] - step1b[29][i];
+    ;step1[30] = step1b[30][i] + step1b[29][i];
+    ;step1[31] = step1b[31][i] + step1b[28][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q0
+    vadd.s16  q10, q7, q1
+    vadd.s16  q15, q6, q3
+    vsub.s16  q13, q5, q0
+    vsub.s16  q14, q7, q1
+    STORE_IN_OUTPUT 0,  16, 31, q8,  q15
+    STORE_IN_OUTPUT 31, 17, 30, q9,  q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[29][i] * cospi_24_64 - step1b[18][i] * cospi_8_64;
+    ;temp2 = step1b[29][i] * cospi_8_64  + step1b[18][i] * cospi_24_64;
+    ;step2[18] = dct_const_round_shift(temp1);
+    ;step2[29] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d0, d1, d2, d3
+    STORE_IN_OUTPUT 30, 29, 18, q1, q0
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[19] = step1b[16][i] - step1b[19][i];
+    ;step1[28] = step1b[31][i] - step1b[28][i];
+    vsub.s16  q13, q4, q2
+    vsub.s16  q14, q6, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[28][i] * cospi_24_64 - step1b[19][i] * cospi_8_64;
+    ;temp2 = step1b[28][i] * cospi_8_64  + step1b[19][i] * cospi_24_64;
+    ;step2[19] = dct_const_round_shift(temp1);
+    ;step2[28] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d8, d9, d12, d13
+    STORE_IN_OUTPUT 18, 19, 28, q4, q6
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK B: 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; generate 20,21,26,27
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[5 * 32] * cospi_27_64 - input[27 * 32] * cospi_5_64;
+    ;temp2 = input[5 * 32] *  cospi_5_64 + input[27 * 32] * cospi_27_64;
+    ;step1b[20][i] = dct_const_round_shift(temp1);
+    ;step1b[27][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 7, 5, 27
+    DO_BUTTERFLY_STD cospi_27_64, cospi_5_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[21 * 32] * cospi_11_64 - input[11 * 32] * cospi_21_64;
+    ;temp2 = input[21 * 32] * cospi_21_64 + input[11 * 32] * cospi_11_64;
+    ;step1b[21][i] = dct_const_round_shift(temp1);
+    ;step1b[26][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 27, 21, 11
+    DO_BUTTERFLY_STD cospi_11_64, cospi_21_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[20] =  step1b[20][i] + step1b[21][i];
+    ;step2[21] =  step1b[20][i] - step1b[21][i];
+    ;step2[26] = -step1b[26][i] + step1b[27][i];
+    ;step2[27] =  step1b[26][i] + step1b[27][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[26][i] * cospi_12_64 - step1b[21][i] * cospi_20_64;
+    ;temp2 = step1b[26][i] * cospi_20_64 + step1b[21][i] * cospi_12_64;
+    ;step3[21] = dct_const_round_shift(temp1);
+    ;step3[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 22,23,24,25
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[13 * 32] * cospi_19_64 - input[19 * 32] * cospi_13_64;
+    ;temp2 = input[13 * 32] * cospi_13_64 + input[19 * 32] * cospi_19_64;
+    ;step1b[22][i] = dct_const_round_shift(temp1);
+    ;step1b[25][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 11, 13, 19
+    DO_BUTTERFLY_STD cospi_19_64, cospi_13_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 1
+    ;temp1 = input[29 * 32] *  cospi_3_64 - input[3 * 32] * cospi_29_64;
+    ;temp2 = input[29 * 32] * cospi_29_64 + input[3 * 32] * cospi_3_64;
+    ;step1b[23][i] = dct_const_round_shift(temp1);
+    ;step1b[24][i] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 19, 29, 3
+    DO_BUTTERFLY_STD cospi_3_64, cospi_29_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;step2[22] = -step1b[22][i] + step1b[23][i];
+    ;step2[23] =  step1b[22][i] + step1b[23][i];
+    ;step2[24] =  step1b[24][i] + step1b[25][i];
+    ;step2[25] =  step1b[24][i] - step1b[25][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = step1b[22][i] * (-cospi_20_64) - step1b[25][i] * (-cospi_12_64);
+    ;temp2 = step1b[22][i] * (-cospi_12_64) + step1b[25][i] * (-cospi_20_64);
+    ;step3[25] = dct_const_round_shift(temp1);
+    ;step3[22] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_20_64), (-cospi_12_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 20-23,24-27
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[22] = step1b[22][i] + step1b[21][i];
+    ;step1[23] = step1b[23][i] + step1b[20][i];
+    vadd.s16  q10, q7, q1
+    vadd.s16  q11, q5, q0
+    ;step1[24] = step1b[24][i] + step1b[27][i];
+    ;step1[25] = step1b[25][i] + step1b[26][i];
+    vadd.s16  q12, q6, q2
+    vadd.s16  q15, q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[16] = step1b[16][i] + step1b[23][i];
+    ;step3[17] = step1b[17][i] + step1b[22][i];
+    ;step3[22] = step1b[17][i] - step1b[22][i];
+    ;step3[23] = step1b[16][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 28, 16, 17, q14, q13
+    vadd.s16  q8,  q14, q11
+    vadd.s16  q9,  q13, q10
+    vsub.s16  q13, q13, q10
+    vsub.s16  q11, q14, q11
+    STORE_IN_OUTPUT 17, 17, 16, q9, q8
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[24] = step1b[31][i] - step1b[24][i];
+    ;step3[25] = step1b[30][i] - step1b[25][i];
+    ;step3[30] = step1b[30][i] + step1b[25][i];
+    ;step3[31] = step1b[31][i] + step1b[24][i];
+    LOAD_FROM_OUTPUT 16, 30, 31, q14, q9
+    vsub.s16  q8,  q9,  q12
+    vadd.s16  q10, q14, q15
+    vsub.s16  q14, q14, q15
+    vadd.s16  q12, q9,  q12
+    STORE_IN_OUTPUT 31, 30, 31, q10, q12
+    ; --------------------------------------------------------------------------
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpush {q8}  ; [24]
+    vpush {q11} ; [23]
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[25][i] - step1b[22][i]) * cospi_16_64;
+    ;temp2 = (step1b[25][i] + step1b[22][i]) * cospi_16_64;
+    ;step1[22] = dct_const_round_shift(temp1);
+    ;step1[25] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 31, 25, 22, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[24][i] - step1b[23][i]) * cospi_16_64;
+    ;temp2 = (step1b[24][i] + step1b[23][i]) * cospi_16_64;
+    ;step1[23] = dct_const_round_shift(temp1);
+    ;step1[24] = dct_const_round_shift(temp2);
+    ; TODO(cd) do some register allocation change to remove these push/pop
+    vpop  {q13} ; [23]
+    vpop  {q14} ; [24]
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 22, 24, 23, q14, q13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[20] = step1b[23][i] - step1b[20][i];
+    ;step1[27] = step1b[24][i] - step1b[27][i];
+    vsub.s16  q14, q5, q0
+    vsub.s16  q13, q6, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[20][i] * (-cospi_8_64)  - step1b[27][i] * (-cospi_24_64);
+    ;temp2 = step1b[20][i] * (-cospi_24_64) + step1b[27][i] * (-cospi_8_64);
+    ;step2[27] = dct_const_round_shift(temp1);
+    ;step2[20] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d10, d11, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[21] = step1b[22][i] - step1b[21][i];
+    ;step1[26] = step1b[25][i] - step1b[26][i];
+    vsub.s16  q14,  q7, q1
+    vsub.s16  q13,  q4, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = step1b[21][i] * (-cospi_8_64)  - step1b[26][i] * (-cospi_24_64);
+    ;temp2 = step1b[21][i] * (-cospi_24_64) + step1b[26][i] * (-cospi_8_64);
+    ;step2[26] = dct_const_round_shift(temp1);
+    ;step2[21] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d0, d1, d2, d3
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[18] = step1b[18][i] + step1b[21][i];
+    ;step3[19] = step1b[19][i] + step1b[20][i];
+    ;step3[20] = step1b[19][i] - step1b[20][i];
+    ;step3[21] = step1b[18][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 23, 18, 19, q14, q13
+    vadd.s16  q8,  q14, q1
+    vadd.s16  q9,  q13, q6
+    vsub.s16  q13, q13, q6
+    vsub.s16  q1,  q14, q1
+    STORE_IN_OUTPUT 19, 18, 19, q8, q9
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[27] = step1b[28][i] - step1b[27][i];
+    ;step3[28] = step1b[28][i] + step1b[27][i];
+    ;step3[29] = step1b[29][i] + step1b[26][i];
+    ;step3[26] = step1b[29][i] - step1b[26][i];
+    LOAD_FROM_OUTPUT 19, 28, 29, q8, q9
+    vsub.s16  q14, q8, q5
+    vadd.s16  q10, q8, q5
+    vadd.s16  q11, q9, q0
+    vsub.s16  q0, q9, q0
+    STORE_IN_OUTPUT 29, 28, 29, q10, q11
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[27][i] - step1b[20][i]) * cospi_16_64;
+    ;temp2 = (step1b[27][i] + step1b[20][i]) * cospi_16_64;
+    ;step1[20] = dct_const_round_shift(temp1);
+    ;step1[27] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d26, d27, d28, d29
+    STORE_IN_OUTPUT 29, 20, 27, q13, q14
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;temp1 = (step1b[26][i] - step1b[21][i]) * cospi_16_64;
+    ;temp2 = (step1b[26][i] + step1b[21][i]) * cospi_16_64;
+    ;step1[21] = dct_const_round_shift(temp1);
+    ;step1[26] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY d0, d1, d2, d3, cospi_16_64, cospi_16_64, d2, d3, d0, d1
+    STORE_IN_OUTPUT 27, 21, 26, q1, q0
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK C: 8-11,12-15
+    ; --------------------------------------------------------------------------
+    ; generate 8,9,14,15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[2 * 32] * cospi_30_64 - input[30 * 32] * cospi_2_64;
+    ;temp2 = input[2 * 32] * cospi_2_64 + input[30 * 32] * cospi_30_64;
+    ;step2[8] = dct_const_round_shift(temp1);
+    ;step2[15] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 3, 2, 30
+    DO_BUTTERFLY_STD cospi_30_64, cospi_2_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[18 * 32] * cospi_14_64 - input[14 * 32] * cospi_18_64;
+    ;temp2 = input[18 * 32] * cospi_18_64 + input[14 * 32] * cospi_14_64;
+    ;step2[9] = dct_const_round_shift(temp1);
+    ;step2[14] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 30, 18, 14
+    DO_BUTTERFLY_STD cospi_14_64, cospi_18_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[8] = step1b[8][i] + step1b[9][i];
+    ;step3[9] = step1b[8][i] - step1b[9][i];
+    ;step3[14] = step1b[15][i] - step1b[14][i];
+    ;step3[15] = step1b[15][i] + step1b[14][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[14][i] * cospi_24_64 - step1b[9][i] * cospi_8_64;
+    ;temp2 = step1b[14][i] * cospi_8_64  + step1b[9][i] * cospi_24_64;
+    ;step1[9]  = dct_const_round_shift(temp1);
+    ;step1[14] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 10,11,12,13
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[10 * 32] * cospi_22_64 - input[22 * 32] * cospi_10_64;
+    ;temp2 = input[10 * 32] * cospi_10_64 + input[22 * 32] * cospi_22_64;
+    ;step2[10] = dct_const_round_shift(temp1);
+    ;step2[13] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 14, 10, 22
+    DO_BUTTERFLY_STD cospi_22_64, cospi_10_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 2
+    ;temp1 = input[26 * 32] * cospi_6_64 - input[6 * 32] * cospi_26_64;
+    ;temp2 = input[26 * 32] * cospi_26_64 + input[6 * 32] * cospi_6_64;
+    ;step2[11] = dct_const_round_shift(temp1);
+    ;step2[12] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 22, 26, 6
+    DO_BUTTERFLY_STD cospi_6_64, cospi_26_64, d8, d9, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;step3[10] = step1b[11][i] - step1b[10][i];
+    ;step3[11] = step1b[11][i] + step1b[10][i];
+    ;step3[12] = step1b[12][i] + step1b[13][i];
+    ;step3[13] = step1b[12][i] - step1b[13][i];
+    vsub.s16  q14, q4, q5
+    vadd.s16  q5, q4, q5
+    vsub.s16  q13, q6, q7
+    vadd.s16  q6, q6, q7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = step1b[10][i] * (-cospi_8_64)  - step1b[13][i] * (-cospi_24_64);
+    ;temp2 = step1b[10][i] * (-cospi_24_64) + step1b[13][i] * (-cospi_8_64);
+    ;step1[13] = dct_const_round_shift(temp1);
+    ;step1[10] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD (-cospi_8_64), (-cospi_24_64), d8, d9, d14, d15
+    ; --------------------------------------------------------------------------
+    ; combine 8-11,12-15
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[8]  = step1b[8][i] + step1b[11][i];
+    ;step2[9]  = step1b[9][i] + step1b[10][i];
+    ;step2[10] = step1b[9][i] - step1b[10][i];
+    vadd.s16  q8,  q0, q5
+    vadd.s16  q9,  q1, q7
+    vsub.s16  q13, q1, q7
+    ;step2[13] = step1b[14][i] - step1b[13][i];
+    ;step2[14] = step1b[14][i] + step1b[13][i];
+    ;step2[15] = step1b[15][i] + step1b[12][i];
+    vsub.s16  q14, q3, q4
+    vadd.s16  q10, q3, q4
+    vadd.s16  q15, q2, q6
+    STORE_IN_OUTPUT 26, 8, 15, q8, q15
+    STORE_IN_OUTPUT 15, 9, 14, q9, q10
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[13][i] - step1b[10][i]) * cospi_16_64;
+    ;temp2 = (step1b[13][i] + step1b[10][i]) * cospi_16_64;
+    ;step3[10] = dct_const_round_shift(temp1);
+    ;step3[13] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 14, 13, 10, q3, q1
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[11] = step1b[8][i] - step1b[11][i];
+    ;step2[12] = step1b[15][i] - step1b[12][i];
+    vsub.s16  q13, q0, q5
+    vsub.s16  q14,  q2, q6
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;temp1 = (step1b[12][i] - step1b[11][i]) * cospi_16_64;
+    ;temp2 = (step1b[12][i] + step1b[11][i]) * cospi_16_64;
+    ;step3[11] = dct_const_round_shift(temp1);
+    ;step3[12] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    STORE_IN_OUTPUT 10, 11, 12, q1, q3
+    ; --------------------------------------------------------------------------
+
+
+    ; --------------------------------------------------------------------------
+    ; BLOCK D: 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; generate 4,5,6,7
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[4 * 32] * cospi_28_64 - input[28 * 32] * cospi_4_64;
+    ;temp2 = input[4 * 32] * cospi_4_64 + input[28 * 32] * cospi_28_64;
+    ;step3[4] = dct_const_round_shift(temp1);
+    ;step3[7] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 6, 4, 28
+    DO_BUTTERFLY_STD cospi_28_64, cospi_4_64, d0, d1, d4, d5
+    ; --------------------------------------------------------------------------
+    ; part of stage 3
+    ;temp1 = input[20 * 32] * cospi_12_64 - input[12 * 32] * cospi_20_64;
+    ;temp2 = input[20 * 32] * cospi_20_64 + input[12 * 32] * cospi_12_64;
+    ;step3[5] = dct_const_round_shift(temp1);
+    ;step3[6] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 28, 20, 12
+    DO_BUTTERFLY_STD cospi_12_64, cospi_20_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;step1[4] = step1b[4][i] + step1b[5][i];
+    ;step1[5] = step1b[4][i] - step1b[5][i];
+    ;step1[6] = step1b[7][i] - step1b[6][i];
+    ;step1[7] = step1b[7][i] + step1b[6][i];
+    vsub.s16  q13, q0, q1
+    vadd.s16  q0, q0, q1
+    vsub.s16  q14, q2, q3
+    vadd.s16  q2, q2, q3
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;temp1 = (step1b[6][i] - step1b[5][i]) * cospi_16_64;
+    ;temp2 = (step1b[5][i] + step1b[6][i]) * cospi_16_64;
+    ;step2[5] = dct_const_round_shift(temp1);
+    ;step2[6] = dct_const_round_shift(temp2);
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d2, d3, d6, d7
+    ; --------------------------------------------------------------------------
+    ; generate 0,1,2,3
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = (input[0 * 32] - input[16 * 32]) * cospi_16_64;
+    ;temp2 = (input[0 * 32] + input[16 * 32]) * cospi_16_64;
+    ;step1[1] = dct_const_round_shift(temp1);
+    ;step1[0] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 12, 0, 16
+    DO_BUTTERFLY_STD cospi_16_64, cospi_16_64, d10, d11, d14, d15
+    ; --------------------------------------------------------------------------
+    ; part of stage 4
+    ;temp1 = input[8 * 32] * cospi_24_64 - input[24 * 32] * cospi_8_64;
+    ;temp2 = input[8 * 32] * cospi_8_64 + input[24 * 32] * cospi_24_64;
+    ;step1[2] = dct_const_round_shift(temp1);
+    ;step1[3] = dct_const_round_shift(temp2);
+    LOAD_FROM_TRANSPOSED 16, 8, 24
+    DO_BUTTERFLY_STD cospi_24_64, cospi_8_64, d28, d29, d12, d13
+    ; --------------------------------------------------------------------------
+    ; part of stage 5
+    ;step2[0] = step1b[0][i] + step1b[3][i];
+    ;step2[1] = step1b[1][i] + step1b[2][i];
+    ;step2[2] = step1b[1][i] - step1b[2][i];
+    ;step2[3] = step1b[0][i] - step1b[3][i];
+    vadd.s16  q4, q7, q6
+    vsub.s16  q7, q7, q6
+    vsub.s16  q6, q5, q14
+    vadd.s16  q5, q5, q14
+    ; --------------------------------------------------------------------------
+    ; combine 0-3,4-7
+    ; --------------------------------------------------------------------------
+    ; part of stage 6
+    ;step3[0] = step1b[0][i] + step1b[7][i];
+    ;step3[1] = step1b[1][i] + step1b[6][i];
+    ;step3[2] = step1b[2][i] + step1b[5][i];
+    ;step3[3] = step1b[3][i] + step1b[4][i];
+    vadd.s16  q8,  q4, q2
+    vadd.s16  q9,  q5, q3
+    vadd.s16  q10, q6, q1
+    vadd.s16  q11, q7, q0
+    ;step3[4] = step1b[3][i] - step1b[4][i];
+    ;step3[5] = step1b[2][i] - step1b[5][i];
+    ;step3[6] = step1b[1][i] - step1b[6][i];
+    ;step3[7] = step1b[0][i] - step1b[7][i];
+    vsub.s16  q12, q7, q0
+    vsub.s16  q13, q6, q1
+    vsub.s16  q14, q5, q3
+    vsub.s16  q15, q4, q2
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[0] = step1b[0][i] + step1b[15][i];
+    ;step1[1] = step1b[1][i] + step1b[14][i];
+    ;step1[14] = step1b[1][i] - step1b[14][i];
+    ;step1[15] = step1b[0][i] - step1b[15][i];
+    LOAD_FROM_OUTPUT 12, 14, 15, q0, q1
+    vadd.s16  q2, q8, q1
+    vadd.s16  q3, q9, q0
+    vsub.s16  q4, q9, q0
+    vsub.s16  q5, q8, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[14 * 32] = step1b[14][i] + step1b[17][i];
+    ;output[15 * 32] = step1b[15][i] + step1b[16][i];
+    ;output[16 * 32] = step1b[15][i] - step1b[16][i];
+    ;output[17 * 32] = step1b[14][i] - step1b[17][i];
+    LOAD_FROM_OUTPUT 15, 16, 17, q0, q1
+    vadd.s16  q8, q4, q1
+    vadd.s16  q9, q5, q0
+    vsub.s16  q6, q5, q0
+    vsub.s16  q7, q4, q1
+    STORE_IN_OUTPUT 17, 17, 16, q7, q6
+    STORE_IN_OUTPUT 16, 15, 14, q9, q8
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 0 * 32] = step1b[0][i] + step1b[31][i];
+    ;output[ 1 * 32] = step1b[1][i] + step1b[30][i];
+    ;output[30 * 32] = step1b[1][i] - step1b[30][i];
+    ;output[31 * 32] = step1b[0][i] - step1b[31][i];
+    LOAD_FROM_OUTPUT 14, 30, 31, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 31, 31, 30, q7, q6
+    STORE_IN_OUTPUT 30,  0,  1, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[2] = step1b[2][i] + step1b[13][i];
+    ;step1[3] = step1b[3][i] + step1b[12][i];
+    ;step1[12] = step1b[3][i] - step1b[12][i];
+    ;step1[13] = step1b[2][i] - step1b[13][i];
+    LOAD_FROM_OUTPUT 1, 12, 13, q0, q1
+    vadd.s16  q2, q10, q1
+    vadd.s16  q3, q11, q0
+    vsub.s16  q4, q11, q0
+    vsub.s16  q5, q10, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[12 * 32] = step1b[12][i] + step1b[19][i];
+    ;output[13 * 32] = step1b[13][i] + step1b[18][i];
+    ;output[18 * 32] = step1b[13][i] - step1b[18][i];
+    ;output[19 * 32] = step1b[12][i] - step1b[19][i];
+    LOAD_FROM_OUTPUT 13, 18, 19, q0, q1
+    vadd.s16  q6, q4, q1
+    vadd.s16  q7, q5, q0
+    vsub.s16  q8, q5, q0
+    vsub.s16  q9, q4, q1
+    STORE_IN_OUTPUT 19, 19, 18, q9, q8
+    STORE_IN_OUTPUT 18, 13, 12, q7, q6
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 2 * 32] = step1b[2][i] + step1b[29][i];
+    ;output[ 3 * 32] = step1b[3][i] + step1b[28][i];
+    ;output[28 * 32] = step1b[3][i] - step1b[28][i];
+    ;output[29 * 32] = step1b[2][i] - step1b[29][i];
+    LOAD_FROM_OUTPUT 12, 28, 29, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 29, 29, 28, q7, q6
+    STORE_IN_OUTPUT 28,  2,  3, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[4] = step1b[4][i] + step1b[11][i];
+    ;step1[5] = step1b[5][i] + step1b[10][i];
+    ;step1[10] = step1b[5][i] - step1b[10][i];
+    ;step1[11] = step1b[4][i] - step1b[11][i];
+    LOAD_FROM_OUTPUT 3, 10, 11, q0, q1
+    vadd.s16  q2, q12, q1
+    vadd.s16  q3, q13, q0
+    vsub.s16  q4, q13, q0
+    vsub.s16  q5, q12, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[10 * 32] = step1b[10][i] + step1b[21][i];
+    ;output[11 * 32] = step1b[11][i] + step1b[20][i];
+    ;output[20 * 32] = step1b[11][i] - step1b[20][i];
+    ;output[21 * 32] = step1b[10][i] - step1b[21][i];
+    LOAD_FROM_OUTPUT 11, 20, 21, q0, q1
+    vadd.s16  q6, q4, q1
+    vadd.s16  q7, q5, q0
+    vsub.s16  q8, q5, q0
+    vsub.s16  q9, q4, q1
+    STORE_IN_OUTPUT 21, 21, 20, q9, q8
+    STORE_IN_OUTPUT 20, 11, 10, q7, q6
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 4 * 32] = step1b[4][i] + step1b[27][i];
+    ;output[ 5 * 32] = step1b[5][i] + step1b[26][i];
+    ;output[26 * 32] = step1b[5][i] - step1b[26][i];
+    ;output[27 * 32] = step1b[4][i] - step1b[27][i];
+    LOAD_FROM_OUTPUT 10, 26, 27, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 27, 27, 26, q7, q6
+    STORE_IN_OUTPUT 26,  4,  5, q4, q5
+    ; --------------------------------------------------------------------------
+    ; part of stage 7
+    ;step1[6] = step1b[6][i] + step1b[9][i];
+    ;step1[7] = step1b[7][i] + step1b[8][i];
+    ;step1[8] = step1b[7][i] - step1b[8][i];
+    ;step1[9] = step1b[6][i] - step1b[9][i];
+    LOAD_FROM_OUTPUT 5, 8, 9, q0, q1
+    vadd.s16  q2, q14, q1
+    vadd.s16  q3, q15, q0
+    vsub.s16  q4, q15, q0
+    vsub.s16  q5, q14, q1
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 8 * 32] = step1b[8][i] + step1b[23][i];
+    ;output[ 9 * 32] = step1b[9][i] + step1b[22][i];
+    ;output[22 * 32] = step1b[9][i] - step1b[22][i];
+    ;output[23 * 32] = step1b[8][i] - step1b[23][i];
+    LOAD_FROM_OUTPUT 9, 22, 23, q0, q1
+    vadd.s16  q6, q4, q1
+    vadd.s16  q7, q5, q0
+    vsub.s16  q8, q5, q0
+    vsub.s16  q9, q4, q1
+    STORE_IN_OUTPUT 23, 23, 22, q9, q8
+    STORE_IN_OUTPUT 22, 9, 8, q7, q6
+    ; --------------------------------------------------------------------------
+    ; part of final stage
+    ;output[ 6 * 32] = step1b[6][i] + step1b[25][i];
+    ;output[ 7 * 32] = step1b[7][i] + step1b[24][i];
+    ;output[24 * 32] = step1b[7][i] - step1b[24][i];
+    ;output[25 * 32] = step1b[6][i] - step1b[25][i];
+    LOAD_FROM_OUTPUT 8, 24, 25, q0, q1
+    vadd.s16  q4, q2, q1
+    vadd.s16  q5, q3, q0
+    vsub.s16  q6, q3, q0
+    vsub.s16  q7, q2, q1
+    STORE_IN_OUTPUT 25, 25, 24, q7, q6
+    STORE_IN_OUTPUT 24,  6,  7, q4, q5
+    ; --------------------------------------------------------------------------
+
+    ; TODO(cd) get rid of these push/pop by properly adjusting register
+    ;          content at end of loop
+    pop {r2}
+    pop {r1}
+    pop {r0}
+    add r1, r1, #8*2
+    add r2, r2, #8*32*2
+
+    ; bands loop processing
+    add r4, r4, #1
+    cmp r4, #3
+    BLE idct32_bands_loop
+
+    pop {r4}
+    bx              lr
+    ENDP  ; |idct32_transpose_and_transform|
+
+;void idct32_combine_add(uint8_t *dest, int16_t *out, int dest_stride);
+;
+; r0  uint8_t *dest
+; r1  int16_t *out
+; r2  int dest_stride
+
+|idct32_combine_add| PROC
+
+    mov r12, r0         ; dest pointer used for stores
+    sub r2, r2, #32     ; adjust the stride (remove the post-increments)
+    mov r3, #0          ; initialize loop counter
+
+idct32_combine_add_loop
+    ; load out[j * 32 + 0-31]
+    vld1.s16        {q12},  [r1]!
+    vld1.s16        {q13},  [r1]!
+    vld1.s16        {q14},  [r1]!
+    vld1.s16        {q15},  [r1]!
+    ; load dest[j * dest_stride + 0-31]
+    vld1.s16        {q6}, [r0]!
+    vld1.s16        {q7}, [r0]!
+    ; ROUND_POWER_OF_TWO
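+    ;   (vrshr.s16 #6 computes (x + 32) >> 6, the final IDCT output scaling)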
+    vrshr.s16       q12, q12, #6
+    vrshr.s16       q13, q13, #6
+    vrshr.s16       q14, q14, #6
+    vrshr.s16       q15, q15, #6
+    ; add to dest[j * dest_stride + 0-31]
+    vaddw.u8        q12, q12, d12
+    vaddw.u8        q13, q13, d13
+    vaddw.u8        q14, q14, d14
+    vaddw.u8        q15, q15, d15
+    ; clip pixel
+    vqmovun.s16     d12, q12
+    vqmovun.s16     d13, q13
+    vqmovun.s16     d14, q14
+    vqmovun.s16     d15, q15
+    ; store back into dest[j * dest_stride + 0-31]
+    vst1.16         {q6}, [r12]!
+    vst1.16         {q7}, [r12]!
+    ; increment pointers by adjusted stride (not necessary for r1/out)
+    add r0,  r0,  r2
+    add r12, r12, r2
+    ; loop processing
+    add r3, r3, #1
+    cmp r3, #31
+    BLE idct32_combine_add_loop
+
+    bx              lr
+    ENDP  ; |idct32_combine_add|
+
+    END
diff --git a/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
new file mode 100644
index 0000000..53ccee0
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_short_iht8x8_add_neon.asm
@@ -0,0 +1,696 @@
+;
+;  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+    EXPORT  |vp9_short_iht8x8_add_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+    ; Generate IADST constants in r0 - r12 for the IADST.
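+    ; Each 16-bit constant is built with a mov/add pair because ARM
+    ; data-processing immediates are limited to a rotated 8-bit value; the
+    ; hex pairs below are the high and low bytes of the decimal values.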
+    MACRO
+    GENERATE_IADST_CONSTANTS
+    ; generate  cospi_2_64  = 16305
+    mov             r0, #0x3f00
+    add             r0, #0xb1
+
+    ; generate cospi_30_64 = 1606
+    mov             r1, #0x600
+    add             r1, #0x46
+
+    ; generate cospi_10_64 = 14449
+    mov             r2, #0x3800
+    add             r2, #0x71
+
+    ; generate cospi_22_64 = 7723
+    mov             r3, #0x1e00
+    add             r3, #0x2b
+
+    ; generate cospi_18_64 = 10394
+    mov             r4, #0x2800
+    add             r4, #0x9a
+
+    ; generate cospi_14_64 = 12665
+    mov             r5, #0x3100
+    add             r5, #0x79
+
+    ; generate cospi_26_64 = 4756
+    mov             r6, #0x1200
+    add             r6, #0x94
+
+    ; generate cospi_6_64  = 15679
+    mov             r7, #0x3d00
+    add             r7, #0x3f
+
+    ; generate cospi_8_64  = 15137
+    mov             r8, #0x3b00
+    add             r8, #0x21
+
+    ; generate cospi_24_64 = 6270
+    mov             r9, #0x1800
+    add             r9, #0x7e
+
+    ; generate 0
+    mov             r10, #0
+
+    ; generate  cospi_16_64 = 11585
+    mov             r12, #0x2d00
+    add             r12, #0x41
+    MEND
+
+    ; Generate IDCT constants in r3 - r9 for the IDCT.
+    MACRO
+    GENERATE_IDCT_CONSTANTS
+    ; generate  cospi_28_64 = 3196
+    mov             r3, #0x0c00
+    add             r3, #0x7c
+
+    ; generate cospi_4_64  = 16069
+    mov             r4, #0x3e00
+    add             r4, #0xc5
+
+    ; generate cospi_12_64 = 13623
+    mov             r5, #0x3500
+    add             r5, #0x37
+
+    ; generate cospi_20_64 = 9102
+    mov             r6, #0x2300
+    add             r6, #0x8e
+
+    ; generate cospi_16_64 = 11585
+    mov             r7, #0x2d00
+    add             r7, #0x41
+
+    ; generate cospi_24_64 = 6270
+    mov             r8, #0x1800
+    add             r8, #0x7e
+
+    ; generate cospi_8_64 = 15137
+    mov             r9, #0x3b00
+    add             r9, #0x21
+    MEND
+
+    ; Transpose an 8x8 16-bit data matrix. Data is loaded in q8-q15.
+    MACRO
+    TRANSPOSE8X8
+    vswp            d17, d24
+    vswp            d23, d30
+    vswp            d21, d28
+    vswp            d19, d26
+    vtrn.32         q8, q10
+    vtrn.32         q9, q11
+    vtrn.32         q12, q14
+    vtrn.32         q13, q15
+    vtrn.16         q8, q9
+    vtrn.16         q10, q11
+    vtrn.16         q12, q13
+    vtrn.16         q14, q15
+    MEND
+
+    ; Parallel 1D IDCT on all the columns of an 8x8 16-bit data matrix that is
+    ; loaded in q8-q15. The IDCT constants are loaded in r3 - r9. The output
+    ; will be stored back into q8-q15 registers. This macro will touch q0-q7
+    ; registers and use them as a buffer during calculation.
+    MACRO
+    IDCT8x8_1D
+    ; stage 1
+    vdup.16         d0, r3                    ; duplicate cospi_28_64
+    vdup.16         d1, r4                    ; duplicate cospi_4_64
+    vdup.16         d2, r5                    ; duplicate cospi_12_64
+    vdup.16         d3, r6                    ; duplicate cospi_20_64
+
+    ; input[1] * cospi_28_64
+    vmull.s16       q2, d18, d0
+    vmull.s16       q3, d19, d0
+
+    ; input[5] * cospi_12_64
+    vmull.s16       q5, d26, d2
+    vmull.s16       q6, d27, d2
+
+    ; input[1]*cospi_28_64-input[7]*cospi_4_64
+    vmlsl.s16       q2, d30, d1
+    vmlsl.s16       q3, d31, d1
+
+    ; input[5] * cospi_12_64 - input[3] * cospi_20_64
+    vmlsl.s16       q5, d22, d3
+    vmlsl.s16       q6, d23, d3
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d8, q2, #14               ; >> 14
+    vqrshrn.s32     d9, q3, #14               ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; input[1] * cospi_4_64
+    vmull.s16       q2, d18, d1
+    vmull.s16       q3, d19, d1
+
+    ; input[5] * cospi_20_64
+    vmull.s16       q9, d26, d3
+    vmull.s16       q13, d27, d3
+
+    ; input[1]*cospi_4_64+input[7]*cospi_28_64
+    vmlal.s16       q2, d30, d0
+    vmlal.s16       q3, d31, d0
+
+    ; input[5] * cospi_20_64 + input[3] * cospi_12_64
+    vmlal.s16       q9, d22, d2
+    vmlal.s16       q13, d23, d2
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d14, q2, #14              ; >> 14
+    vqrshrn.s32     d15, q3, #14              ; >> 14
+
+    ; stage 2 & stage 3 - even half
+    vdup.16         d0, r7                    ; duplicate cospi_16_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d12, q9, #14              ; >> 14
+    vqrshrn.s32     d13, q13, #14             ; >> 14
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q2, d16, d0
+    vmull.s16       q3, d17, d0
+
+    ; input[0] * cospi_16_64
+    vmull.s16       q13, d16, d0
+    vmull.s16       q15, d17, d0
+
+    ; (input[0] + input[2]) * cospi_16_64
+    vmlal.s16       q2,  d24, d0
+    vmlal.s16       q3, d25, d0
+
+    ; (input[0] - input[2]) * cospi_16_64
+    vmlsl.s16       q13, d24, d0
+    vmlsl.s16       q15, d25, d0
+
+    vdup.16         d0, r8                    ; duplicate cospi_24_64
+    vdup.16         d1, r9                    ; duplicate cospi_8_64
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d18, q2, #14              ; >> 14
+    vqrshrn.s32     d19, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d22, q13, #14             ; >> 14
+    vqrshrn.s32     d23, q15, #14             ; >> 14
+
+    ; input[1] * cospi_24_64
+    vmull.s16       q2, d20, d0
+    vmull.s16       q3, d21, d0
+
+    ; input[1] * cospi_8_64
+    vmull.s16       q8, d20, d1
+    vmull.s16       q12, d21, d1
+
+    ; input[1] * cospi_24_64 - input[3] * cospi_8_64
+    vmlsl.s16       q2, d28, d1
+    vmlsl.s16       q3, d29, d1
+
+    ; input[1] * cospi_8_64 + input[3] * cospi_24_64
+    vmlal.s16       q8, d28, d0
+    vmlal.s16       q12, d29, d0
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d26, q2, #14              ; >> 14
+    vqrshrn.s32     d27, q3, #14              ; >> 14
+
+    ; dct_const_round_shift(input_dc * cospi_16_64)
+    vqrshrn.s32     d30, q8, #14              ; >> 14
+    vqrshrn.s32     d31, q12, #14             ; >> 14
+
+    vadd.s16        q0, q9, q15               ; output[0] = step2[0] + step2[3]
+    vadd.s16        q1, q11, q13              ; output[1] = step2[1] + step2[2]
+    vsub.s16        q2, q11, q13              ; output[2] = step2[1] - step2[2]
+    vsub.s16        q3, q9, q15               ; output[3] = step2[0] - step2[3]
+
+    ; stage 3 - odd half
+    vdup.16         d16, r7                   ; duplicate cospi_16_64
+
+    ; stage 2 - odd half
+    vsub.s16        q13, q4, q5               ; step2[5] = step1[4] - step1[5]
+    vadd.s16        q4, q4, q5                ; step2[4] = step1[4] + step1[5]
+    vsub.s16        q14, q7, q6               ; step2[6] = -step1[6] + step1[7]
+    vadd.s16        q7, q7, q6                ; step2[7] = step1[6] + step1[7]
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q9, d28, d16
+    vmull.s16       q10, d29, d16
+
+    ; step2[6] * cospi_16_64
+    vmull.s16       q11, d28, d16
+    vmull.s16       q12, d29, d16
+
+    ; (step2[6] - step2[5]) * cospi_16_64
+    vmlsl.s16       q9, d26, d16
+    vmlsl.s16       q10, d27, d16
+
+    ; (step2[5] + step2[6]) * cospi_16_64
+    vmlal.s16       q11, d26, d16
+    vmlal.s16       q12, d27, d16
+
+    ; dct_const_round_shift((step2[6] - step2[5]) * cospi_16_64)
+    vqrshrn.s32     d10, q9, #14              ; >> 14
+    vqrshrn.s32     d11, q10, #14             ; >> 14
+
+    ; dct_const_round_shift((step2[5] + step2[6]) * cospi_16_64)
+    vqrshrn.s32     d12, q11, #14             ; >> 14
+    vqrshrn.s32     d13, q12, #14             ; >> 14
+
+    ; stage 4
+    vadd.s16        q8, q0, q7                ; output[0] = step1[0] + step1[7];
+    vadd.s16        q9, q1, q6                ; output[1] = step1[1] + step1[6];
+    vadd.s16        q10, q2, q5               ; output[2] = step1[2] + step1[5];
+    vadd.s16        q11, q3, q4               ; output[3] = step1[3] + step1[4];
+    vsub.s16        q12, q3, q4               ; output[4] = step1[3] - step1[4];
+    vsub.s16        q13, q2, q5               ; output[5] = step1[2] - step1[5];
+    vsub.s16        q14, q1, q6               ; output[6] = step1[1] - step1[6];
+    vsub.s16        q15, q0, q7               ; output[7] = step1[0] - step1[7];
+    MEND
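+
+    ; A note on the rounding used throughout (a sketch, assuming the
+    ; definitions in vp9/common/vp9_idct.h):
+    ;   dct_const_round_shift(x) == ROUND_POWER_OF_TWO(x, DCT_CONST_BITS)
+    ;                            == (x + (1 << 13)) >> 14
+    ; Each vqrshrn.s32 #14 performs this rounding narrow in a single
+    ; instruction, additionally saturating to the int16_t range.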
+
+    ; Parallel 1D IADST on all the columns of an 8x8 16-bit data matrix
+    ; that is loaded in q8-q15. The IADST constants are loaded into
+    ; registers r0-r12. The output is stored back into q8-q15. This macro
+    ; uses q0-q7 as scratch registers during the calculation.
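+    ;
+    ; As a reading aid, a sketch of the matching scalar stage-1 code
+    ; (modeled on the iadst8 C reference in vp9/common/vp9_idct.c):
+    ;   s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+    ;   s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+    ;   s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+    ;   s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+    ;   s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+    ;   s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+    ;   s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+    ;   s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+    ;   x0 = dct_const_round_shift(s0 + s4);
+    ;   x4 = dct_const_round_shift(s0 - s4);
+    ; and so on, followed by a second butterfly stage and a final
+    ; cospi_16_64 stage with output negations.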
+    MACRO
+    IADST8X8_1D
+    vdup.16         d14, r0                   ; duplicate cospi_2_64
+    vdup.16         d15, r1                   ; duplicate cospi_30_64
+
+    ; cospi_2_64  * x0
+    vmull.s16       q1, d30, d14
+    vmull.s16       q2, d31, d14
+
+    ; s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+    vmlal.s16       q1, d16, d15
+    vmlal.s16       q2, d17, d15
+
+    ; cospi_30_64 * x0
+    vmull.s16       q3, d30, d15
+    vmull.s16       q4, d31, d15
+
+    ; s1 = cospi_30_64 * x0 - cospi_2_64  * x1
+    vmlsl.s16       q3, d16, d14
+    vmlsl.s16       q4, d17, d14
+
+    vdup.16         d30, r4                   ; duplicate cospi_18_64
+    vdup.16         d31, r5                   ; duplicate cospi_14_64
+
+    ; cospi_18_64 * x4
+    vmull.s16       q5, d22, d30
+    vmull.s16       q6, d23, d30
+
+    ; s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+    vmlal.s16       q5, d24, d31
+    vmlal.s16       q6, d25, d31
+
+    ; cospi_14_64 * x4
+    vmull.s16       q7, d22, d31
+    vmull.s16       q8, d23, d31
+
+    ; s5 = cospi_14_64 * x4 - cospi_18_64 * x5
+    vmlsl.s16       q7, d24, d30
+    vmlsl.s16       q8, d25, d30
+
+    ; (s0 + s4)
+    vadd.s32        q11, q1, q5
+    vadd.s32        q12, q2, q6
+
+    ; x0 = dct_const_round_shift(s0 + s4);
+    vqrshrn.s32     d22, q11, #14             ; >> 14
+    vqrshrn.s32     d23, q12, #14             ; >> 14
+
+    ; (s0 - s4)
+    vsub.s32        q1, q1, q5
+    vsub.s32        q2, q2, q6
+
+    ; x4 = dct_const_round_shift(s0 - s4);
+    vqrshrn.s32     d2, q1, #14               ; >> 14
+    vqrshrn.s32     d3, q2, #14               ; >> 14
+
+    ; (s1 + s5)
+    vadd.s32        q12, q3, q7
+    vadd.s32        q15, q4, q8
+
+    ; x1 = dct_const_round_shift(s1 + s5);
+    vqrshrn.s32     d24, q12, #14             ; >> 14
+    vqrshrn.s32     d25, q15, #14             ; >> 14
+
+    ; (s1 - s5)
+    vsub.s32        q3, q3, q7
+    vsub.s32        q4, q4, q8
+
+    ; x5 = dct_const_round_shift(s1 - s5);
+    vqrshrn.s32     d6, q3, #14               ; >> 14
+    vqrshrn.s32     d7, q4, #14               ; >> 14
+
+    vdup.16         d30, r2                   ; duplicate cospi_10_64
+    vdup.16         d31, r3                   ; duplicate cospi_22_64
+
+    ; cospi_10_64 * x2
+    vmull.s16       q4, d26, d30
+    vmull.s16       q5, d27, d30
+
+    ; s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+    vmlal.s16       q4, d20, d31
+    vmlal.s16       q5, d21, d31
+
+    ; cospi_22_64 * x2
+    vmull.s16       q2, d26, d31
+    vmull.s16       q6, d27, d31
+
+    ; s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+    vmlsl.s16       q2, d20, d30
+    vmlsl.s16       q6, d21, d30
+
+    vdup.16         d30, r6                   ; duplicate cospi_26_64
+    vdup.16         d31, r7                   ; duplicate cospi_6_64
+
+    ; cospi_26_64 * x6
+    vmull.s16       q0, d18, d30
+    vmull.s16       q13, d19, d30
+
+    ; s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+    vmlal.s16       q0, d28, d31
+    vmlal.s16       q13, d29, d31
+
+    ; cospi_6_64  * x6
+    vmull.s16       q10, d18, d31
+    vmull.s16       q9, d19, d31
+
+    ; s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+    vmlsl.s16       q10, d28, d30
+    vmlsl.s16       q9, d29, d30
+
+    ; (s3 + s7)
+    vadd.s32        q14, q2, q10
+    vadd.s32        q15, q6, q9
+
+    ; x3 = dct_const_round_shift(s3 + s7);
+    vqrshrn.s32     d28, q14, #14             ; >> 14
+    vqrshrn.s32     d29, q15, #14             ; >> 14
+
+    ; (s3 - s7)
+    vsub.s32        q2, q2, q10
+    vsub.s32        q6, q6, q9
+
+    ; x7 = dct_const_round_shift(s3 - s7);
+    vqrshrn.s32     d4, q2, #14               ; >> 14
+    vqrshrn.s32     d5, q6, #14               ; >> 14
+
+    ; (s2 + s6)
+    vadd.s32        q9, q4, q0
+    vadd.s32        q10, q5, q13
+
+    ; x2 = dct_const_round_shift(s2 + s6);
+    vqrshrn.s32     d18, q9, #14              ; >> 14
+    vqrshrn.s32     d19, q10, #14             ; >> 14
+
+    ; (s2 - s6)
+    vsub.s32        q4, q4, q0
+    vsub.s32        q5, q5, q13
+
+    ; x6 = dct_const_round_shift(s2 - s6);
+    vqrshrn.s32     d8, q4, #14               ; >> 14
+    vqrshrn.s32     d9, q5, #14               ; >> 14
+
+    vdup.16         d30, r8                   ; duplicate cospi_8_64
+    vdup.16         d31, r9                   ; duplicate cospi_24_64
+
+    ; cospi_8_64  * x4
+    vmull.s16       q5, d2, d30
+    vmull.s16       q6, d3, d30
+
+    ; s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+    vmlal.s16       q5, d6, d31
+    vmlal.s16       q6, d7, d31
+
+    ; cospi_24_64 * x4
+    vmull.s16       q7, d2, d31
+    vmull.s16       q0, d3, d31
+
+    ; s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+    vmlsl.s16       q7, d6, d30
+    vmlsl.s16       q0, d7, d30
+
+    ; cospi_8_64  * x7
+    vmull.s16       q1, d4, d30
+    vmull.s16       q3, d5, d30
+
+    ; s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+    vmlsl.s16       q1, d8, d31
+    vmlsl.s16       q3, d9, d31
+
+    ; cospi_24_64 * x7
+    vmull.s16       q10, d4, d31
+    vmull.s16       q2, d5, d31
+
+    ; s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+    vmlal.s16       q10, d8, d30
+    vmlal.s16       q2, d9, d30
+
+    vadd.s16        q8, q11, q9               ; x0 = s0 + s2;
+
+    vsub.s16        q11, q11, q9              ; x2 = s0 - s2;
+
+    vadd.s16        q4, q12, q14              ; x1 = s1 + s3;
+
+    vsub.s16        q12, q12, q14             ; x3 = s1 - s3;
+
+    ; (s4 + s6)
+    vadd.s32        q14, q5, q1
+    vadd.s32        q15, q6, q3
+
+    ; x4 = dct_const_round_shift(s4 + s6);
+    vqrshrn.s32     d18, q14, #14             ; >> 14
+    vqrshrn.s32     d19, q15, #14             ; >> 14
+
+    ; (s4 - s6)
+    vsub.s32        q5, q5, q1
+    vsub.s32        q6, q6, q3
+
+    ; x6 = dct_const_round_shift(s4 - s6);
+    vqrshrn.s32     d10, q5, #14              ; >> 14
+    vqrshrn.s32     d11, q6, #14              ; >> 14
+
+    ; (s5 + s7)
+    vadd.s32        q1, q7, q10
+    vadd.s32        q3, q0, q2
+
+    ; x5 = dct_const_round_shift(s5 + s7);
+    vqrshrn.s32     d28, q1, #14               ; >> 14
+    vqrshrn.s32     d29, q3, #14               ; >> 14
+
+    ; (s5 - s7)
+    vsub.s32        q7, q7, q10
+    vsub.s32        q0, q0, q2
+
+    ; x7 = dct_const_round_shift(s5 - s7);
+    vqrshrn.s32     d14, q7, #14              ; >> 14
+    vqrshrn.s32     d15, q0, #14              ; >> 14
+
+    vdup.16         d30, r12                  ; duplicate cospi_16_64
+
+    ; cospi_16_64 * x2
+    vmull.s16       q2, d22, d30
+    vmull.s16       q3, d23, d30
+
+    ; cospi_16_64 * x2 + cospi_16_64  * x3;
+    vmlal.s16       q2, d24, d30
+    vmlal.s16       q3, d25, d30
+
+    ; x2 = dct_const_round_shift(s2);
+    vqrshrn.s32     d4, q2, #14               ; >> 14
+    vqrshrn.s32     d5, q3, #14               ; >> 14
+
+    ; cospi_16_64 * x2
+    vmull.s16       q13, d22, d30
+    vmull.s16       q1, d23, d30
+
+    ; cospi_16_64 * x2 - cospi_16_64  * x3;
+    vmlsl.s16       q13, d24, d30
+    vmlsl.s16       q1, d25, d30
+
+    ; x3 = dct_const_round_shift(s3);
+    vqrshrn.s32     d24, q13, #14             ; >> 14
+    vqrshrn.s32     d25, q1, #14              ; >> 14
+
+    ; cospi_16_64 * x6
+    vmull.s16       q13, d10, d30
+    vmull.s16       q1, d11, d30
+
+    ; cospi_16_64 * x6 + cospi_16_64  * x7;
+    vmlal.s16       q13, d14, d30
+    vmlal.s16       q1, d15, d30
+
+    ; x6 = dct_const_round_shift(s6);
+    vqrshrn.s32     d20, q13, #14             ; >> 14
+    vqrshrn.s32     d21, q1, #14              ; >> 14
+
+    ; cospi_16_64 * x6
+    vmull.s16       q13, d10, d30
+    vmull.s16       q1, d11, d30
+
+    ; cospi_16_64 * x6 - cospi_16_64  * x7;
+    vmlsl.s16       q13, d14, d30
+    vmlsl.s16       q1, d15, d30
+
+    ; x7 = dct_const_round_shift(s7);
+    vqrshrn.s32     d12, q13, #14             ; >> 14
+    vqrshrn.s32     d13, q1, #14              ; >> 14
+
+    vdup.16         q5, r10                   ; duplicate 0
+
+    vsub.s16        q9, q5, q9                ; output[1] = -x4;
+    vsub.s16        q13, q5, q6               ; output[5] = -x7;
+    vsub.s16        q11, q5, q2               ; output[3] = -x2;
+    vsub.s16        q15, q5, q4               ; output[7] = -x1;
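+
+    ; The remaining outputs are already in place and need no negation:
+    ; q8 = output[0] = x0, q10 = output[2] = x6,
+    ; q12 = output[4] = x3, q14 = output[6] = x5.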
+    MEND
+
+
+    AREA     Block, CODE, READONLY ; name this block of code
+;void vp9_short_iht8x8_add_neon(int16_t *input, uint8_t *dest,
+;                               int dest_stride, int tx_type)
+;
+; r0  int16_t *input
+; r1  uint8_t *dest
+; r2  int dest_stride
+; r3  int tx_type
+; This function only handles tx_type values 1, 2 and 3.
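+;
+; Rough outline of the flow below (a reading aid, not part of the
+; interface): the 8x8 block is transposed, one 1D 8-point transform
+; (IDCT or IADST, selected by tx_type) is applied, the result is
+; transposed again, the second 1D transform is applied, and the rounded
+; result is added to dest with pixel clamping.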
+|vp9_short_iht8x8_add_neon| PROC
+
+    ; load the inputs into q8-q15
+    vld1.s16        {q8,q9}, [r0]!
+    vld1.s16        {q10,q11}, [r0]!
+    vld1.s16        {q12,q13}, [r0]!
+    vld1.s16        {q14,q15}, [r0]!
+
+    push            {r0-r10}
+
+    ; transpose the input data
+    TRANSPOSE8X8
+
+    ; decide the type of transform
+    cmp         r3, #2
+    beq         idct_iadst
+    cmp         r3, #3
+    beq         iadst_iadst
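+
+    ; tx_type == 1 falls through to iadst_idct below; tx_type == 0
+    ; (plain DCT_DCT) never reaches this function and is presumably
+    ; served by vp9_short_idct8x8_add_neon instead.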
+
+iadst_idct
+    ; generate IDCT constants
+    GENERATE_IDCT_CONSTANTS
+
+    ; first transform rows
+    IDCT8x8_1D
+
+    ; transpose the matrix
+    TRANSPOSE8X8
+
+    ; generate IADST constants
+    GENERATE_IADST_CONSTANTS
+
+    ; then transform columns
+    IADST8X8_1D
+
+    b end_vp9_short_iht8x8_add_neon
+
+idct_iadst
+    ; generate IADST constants
+    GENERATE_IADST_CONSTANTS
+
+    ; first transform rows
+    IADST8X8_1D
+
+    ; transpose the matrix
+    TRANSPOSE8X8
+
+    ; generate IDCT constants
+    GENERATE_IDCT_CONSTANTS
+
+    ; then transform columns
+    IDCT8x8_1D
+
+    b end_vp9_short_iht8x8_add_neon
+
+iadst_iadst
+    ; generate IADST constants
+    GENERATE_IADST_CONSTANTS
+
+    ; first transform rows
+    IADST8X8_1D
+
+    ; transpose the matrix
+    TRANSPOSE8X8
+
+    ; then transform columns
+    IADST8X8_1D
+
+end_vp9_short_iht8x8_add_neon
+    pop            {r0-r10}
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5)
+    vrshr.s16       q8, q8, #5
+    vrshr.s16       q9, q9, #5
+    vrshr.s16       q10, q10, #5
+    vrshr.s16       q11, q11, #5
+    vrshr.s16       q12, q12, #5
+    vrshr.s16       q13, q13, #5
+    vrshr.s16       q14, q14, #5
+    vrshr.s16       q15, q15, #5
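+
+    ; Each vrshr.s16 #5 above is the vector equivalent of the scalar
+    ; ROUND_POWER_OF_TWO(x, 5) == (x + 16) >> 5, applied to all 8 lanes.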
+
+    ; save dest pointer
+    mov             r0, r1
+
+    ; load destination data
+    vld1.64         {d0}, [r1], r2
+    vld1.64         {d1}, [r1], r2
+    vld1.64         {d2}, [r1], r2
+    vld1.64         {d3}, [r1], r2
+    vld1.64         {d4}, [r1], r2
+    vld1.64         {d5}, [r1], r2
+    vld1.64         {d6}, [r1], r2
+    vld1.64         {d7}, [r1]
+
+    ; ROUND_POWER_OF_TWO(temp_out[j], 5) + dest[j * dest_stride + i]
+    vaddw.u8        q8, q8, d0
+    vaddw.u8        q9, q9, d1
+    vaddw.u8        q10, q10, d2
+    vaddw.u8        q11, q11, d3
+    vaddw.u8        q12, q12, d4
+    vaddw.u8        q13, q13, d5
+    vaddw.u8        q14, q14, d6
+    vaddw.u8        q15, q15, d7
+
+    ; clip_pixel
+    vqmovun.s16     d0, q8
+    vqmovun.s16     d1, q9
+    vqmovun.s16     d2, q10
+    vqmovun.s16     d3, q11
+    vqmovun.s16     d4, q12
+    vqmovun.s16     d5, q13
+    vqmovun.s16     d6, q14
+    vqmovun.s16     d7, q15
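+
+    ; vqmovun.s16 narrows each 16-bit lane to u8 with unsigned
+    ; saturation, which implements the scalar clip_pixel() clamp to
+    ; [0, 255].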
+
+    ; store the data
+    vst1.64         {d0}, [r0], r2
+    vst1.64         {d1}, [r0], r2
+    vst1.64         {d2}, [r0], r2
+    vst1.64         {d3}, [r0], r2
+    vst1.64         {d4}, [r0], r2
+    vst1.64         {d5}, [r0], r2
+    vst1.64         {d6}, [r0], r2
+    vst1.64         {d7}, [r0], r2
+    bx          lr
+    ENDP  ; |vp9_short_iht8x8_add_neon|
+
+    END
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index f5eeb2c..042afbb 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -319,7 +319,7 @@
 specialize vp9_short_idct10_16x16_add sse2 neon
 
 prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_short_idct32x32_add sse2
+specialize vp9_short_idct32x32_add sse2 neon
 
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
@@ -328,7 +328,7 @@
 specialize vp9_short_iht4x4_add sse2 neon
 
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
-specialize vp9_short_iht8x8_add sse2
+specialize vp9_short_iht8x8_add sse2 neon
 
 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16_add sse2
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 2aa4188..54e69fd 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -16,8 +16,28 @@
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/common/vp9_onyxc_int.h"
 
+typedef enum {
+  RD_DC_PRED = DC_PRED,
+  RD_V_PRED = V_PRED,
+  RD_H_PRED = H_PRED,
+  RD_D45_PRED = D45_PRED,
+  RD_D135_PRED = D135_PRED,
+  RD_D117_PRED = D117_PRED,
+  RD_D153_PRED = D153_PRED,
+  RD_D207_PRED = D207_PRED,
+  RD_D63_PRED = D63_PRED,
+  RD_TM_PRED = TM_PRED,
+  RD_NEARESTMV = NEARESTMV,
+  RD_NEARMV = NEARMV,
+  RD_ZEROMV = ZEROMV,
+  RD_NEWMV = NEWMV,
+  RD_I4X4_PRED,
+  RD_SPLITMV,
+  RD_MODE_COUNT
+} RD_PREDICTION_MODE;
+
 typedef struct {
-  MB_PREDICTION_MODE mode;
+  RD_PREDICTION_MODE mode;
   MV_REFERENCE_FRAME ref_frame;
   MV_REFERENCE_FRAME second_ref_frame;
 } MODE_DEFINITION;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 07850d4..cbc8d46 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -48,58 +48,53 @@
 DECLARE_ALIGNED(16, extern const uint8_t,
                 vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
 
-#define I4X4_PRED 0x8000
-#define SPLITMV 0x10000
-
 #define LAST_FRAME_MODE_MASK    0xFFDADCD60
 #define GOLDEN_FRAME_MODE_MASK  0xFFB5A3BB0
 #define ALT_REF_MODE_MASK       0xFF8C648D0
 
 const MODE_DEFINITION vp9_mode_order[MAX_MODES] = {
-  {NEARESTMV, LAST_FRAME,   NONE},
-  {NEARESTMV, ALTREF_FRAME, NONE},
-  {NEARESTMV, GOLDEN_FRAME, NONE},
+  {RD_NEARESTMV, LAST_FRAME,   NONE},
+  {RD_NEARESTMV, ALTREF_FRAME, NONE},
+  {RD_NEARESTMV, GOLDEN_FRAME, NONE},
 
-  {DC_PRED,   INTRA_FRAME,  NONE},
+  {RD_DC_PRED,   INTRA_FRAME,  NONE},
 
-  {NEWMV,     LAST_FRAME,   NONE},
-  {NEWMV,     ALTREF_FRAME, NONE},
-  {NEWMV,     GOLDEN_FRAME, NONE},
+  {RD_NEWMV,     LAST_FRAME,   NONE},
+  {RD_NEWMV,     GOLDEN_FRAME, NONE},
 
-  {NEARMV,    LAST_FRAME,   NONE},
-  {NEARMV,    ALTREF_FRAME, NONE},
-  {NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
-  {NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
+  {RD_NEARMV,    LAST_FRAME,   NONE},
+  {RD_NEARESTMV, LAST_FRAME,   ALTREF_FRAME},
+  {RD_NEARESTMV, GOLDEN_FRAME, ALTREF_FRAME},
 
-  {TM_PRED,   INTRA_FRAME,  NONE},
+  {RD_TM_PRED,   INTRA_FRAME,  NONE},
 
-  {NEARMV,    LAST_FRAME,   ALTREF_FRAME},
-  {NEWMV,     LAST_FRAME,   ALTREF_FRAME},
-  {NEARMV,    GOLDEN_FRAME, NONE},
-  {NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
-  {NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
+  {RD_NEARMV,    LAST_FRAME,   ALTREF_FRAME},
+  {RD_NEWMV,     LAST_FRAME,   ALTREF_FRAME},
+  {RD_NEARMV,    GOLDEN_FRAME, NONE},
+  {RD_NEARMV,    GOLDEN_FRAME, ALTREF_FRAME},
+  {RD_NEWMV,     GOLDEN_FRAME, ALTREF_FRAME},
 
-  {SPLITMV,   LAST_FRAME,   NONE},
-  {SPLITMV,   GOLDEN_FRAME, NONE},
-  {SPLITMV,   ALTREF_FRAME, NONE},
-  {SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
-  {SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
+  {RD_SPLITMV,   LAST_FRAME,   NONE},
+  {RD_SPLITMV,   GOLDEN_FRAME, NONE},
+  {RD_SPLITMV,   ALTREF_FRAME, NONE},
+  {RD_SPLITMV,   LAST_FRAME,   ALTREF_FRAME},
+  {RD_SPLITMV,   GOLDEN_FRAME, ALTREF_FRAME},
 
-  {ZEROMV,    LAST_FRAME,   NONE},
-  {ZEROMV,    GOLDEN_FRAME, NONE},
-  {ZEROMV,    ALTREF_FRAME, NONE},
-  {ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
-  {ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
+  {RD_ZEROMV,    LAST_FRAME,   NONE},
+  {RD_ZEROMV,    GOLDEN_FRAME, NONE},
+  {RD_ZEROMV,    ALTREF_FRAME, NONE},
+  {RD_ZEROMV,    LAST_FRAME,   ALTREF_FRAME},
+  {RD_ZEROMV,    GOLDEN_FRAME, ALTREF_FRAME},
 
-  {I4X4_PRED, INTRA_FRAME,  NONE},
-  {H_PRED,    INTRA_FRAME,  NONE},
-  {V_PRED,    INTRA_FRAME,  NONE},
-  {D135_PRED, INTRA_FRAME,  NONE},
-  {D207_PRED,  INTRA_FRAME,  NONE},
-  {D153_PRED, INTRA_FRAME,  NONE},
-  {D63_PRED,  INTRA_FRAME,  NONE},
-  {D117_PRED, INTRA_FRAME,  NONE},
-  {D45_PRED,  INTRA_FRAME,  NONE},
+  {RD_I4X4_PRED, INTRA_FRAME,  NONE},
+  {RD_H_PRED,    INTRA_FRAME,  NONE},
+  {RD_V_PRED,    INTRA_FRAME,  NONE},
+  {RD_D135_PRED, INTRA_FRAME,  NONE},
+  {RD_D207_PRED, INTRA_FRAME,  NONE},
+  {RD_D153_PRED, INTRA_FRAME,  NONE},
+  {RD_D63_PRED,  INTRA_FRAME,  NONE},
+  {RD_D117_PRED, INTRA_FRAME,  NONE},
+  {RD_D45_PRED,  INTRA_FRAME,  NONE},
 };
 
 // The baseline rd thresholds for breaking out of the rd loop for
@@ -163,6 +158,15 @@
   return (11 * q * q) >> 2;
 }
 
+static MB_PREDICTION_MODE rd_mode_to_mode(RD_PREDICTION_MODE rd_mode) {
+  if (rd_mode == RD_SPLITMV || rd_mode == RD_I4X4_PRED) {
+    assert(!"Invalid rd_mode");
+    return MB_MODE_COUNT;
+  }
+  assert((int)rd_mode < (int)MB_MODE_COUNT);
+  return (MB_PREDICTION_MODE)rd_mode;
+}
+
 void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) {
   cpi->mb.sadperbit16 = sad_per_bit16lut[qindex];
   cpi->mb.sadperbit4 = sad_per_bit4lut[qindex];
@@ -3057,7 +3061,7 @@
   MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi;
   const struct segmentation *seg = &cm->seg;
   const BLOCK_SIZE block_size = get_plane_block_size(bsize, &xd->plane[0]);
-  MB_PREDICTION_MODE this_mode;
+  RD_PREDICTION_MODE this_mode;
   MV_REFERENCE_FRAME ref_frame, second_ref_frame;
   unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
   int comp_pred, i;
@@ -3285,16 +3289,23 @@
     // SPLITMV.
     if (ref_frame > 0 &&
         vp9_is_scaled(&scale_factor[ref_frame]) &&
-        this_mode == SPLITMV)
+        this_mode == RD_SPLITMV)
       continue;
 
     if (second_ref_frame > 0 &&
         vp9_is_scaled(&scale_factor[second_ref_frame]) &&
-        this_mode == SPLITMV)
+        this_mode == RD_SPLITMV)
+      continue;
+
+    if (bsize >= BLOCK_8X8 &&
+        (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
+      continue;
+
+    if (bsize < BLOCK_8X8 &&
+        !(this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV))
       continue;
 
     set_scale_factors(xd, ref_frame, second_ref_frame, scale_factor);
-    mbmi->mode = this_mode;
     mbmi->uv_mode = DC_PRED;
 
     // Evaluate all sub-pel filters irrespective of whether we can use
@@ -3302,13 +3313,6 @@
     mbmi->interp_filter = cm->mcomp_filter_type;
     vp9_setup_interp_filters(xd, mbmi->interp_filter, &cpi->common);
 
-    if (bsize >= BLOCK_8X8 &&
-        (this_mode == I4X4_PRED || this_mode == SPLITMV))
-      continue;
-    if (bsize < BLOCK_8X8 &&
-        !(this_mode == I4X4_PRED || this_mode == SPLITMV))
-      continue;
-
     if (comp_pred) {
       if (!(cpi->ref_frame_flags & flag_list[second_ref_frame]))
         continue;
@@ -3341,7 +3345,7 @@
     // If the segment skip feature is enabled....
     // then do nothing if the current mode is not allowed..
     } else if (vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP) &&
-               (this_mode != ZEROMV && ref_frame != INTRA_FRAME)) {
+               (this_mode != RD_ZEROMV && ref_frame != INTRA_FRAME)) {
       continue;
     // Disable this drop out case if the ref frame
     // segment level feature is enabled for this segment. This is to
@@ -3353,11 +3357,11 @@
       // an unfiltered alternative. We allow near/nearest as well
       // because they may result in zero-zero MVs but be cheaper.
       if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-        if ((this_mode != ZEROMV &&
-             !(this_mode == NEARMV &&
-               frame_mv[NEARMV][ALTREF_FRAME].as_int == 0) &&
-             !(this_mode == NEARESTMV &&
-               frame_mv[NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
+        if ((this_mode != RD_ZEROMV &&
+             !(this_mode == RD_NEARMV &&
+               frame_mv[RD_NEARMV][ALTREF_FRAME].as_int == 0) &&
+             !(this_mode == RD_NEARESTMV &&
+               frame_mv[RD_NEARESTMV][ALTREF_FRAME].as_int == 0)) ||
             ref_frame != ALTREF_FRAME) {
           continue;
         }
@@ -3369,7 +3373,7 @@
     // a representative block in the boundary ( first ) and then implement a
     // function that does sads when inside the border..
     if (((mi_row + bhs) > cm->mi_rows || (mi_col + bws) > cm->mi_cols) &&
-        this_mode == NEWMV) {
+        this_mode == RD_NEWMV) {
       continue;
     }
 
@@ -3379,7 +3383,7 @@
     cpi->mode_test_hits[bsize]++;
 #endif
 
-    if (this_mode == I4X4_PRED) {
+    if (this_mode == RD_I4X4_PRED) {
       int rate;
 
       /*
@@ -3388,7 +3392,7 @@
         continue;
         */
 
-      // I4X4_PRED is only considered for block sizes less than 8x8.
+      // RD_I4X4_PRED is only considered for block sizes less than 8x8.
       mbmi->tx_size = TX_4X4;
       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
                                        &distortion_y, best_rd) >= best_rd)
@@ -3420,20 +3424,22 @@
         64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
       };
       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_LOWVAR) &&
-          this_mode != DC_PRED &&
+          this_mode != RD_DC_PRED &&
           x->source_variance < skip_intra_var_thresh[mbmi->sb_type])
         continue;
       // Only search the oblique modes if the best so far is
       // one of the neighboring directional modes
       if ((cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_BESTINTER) &&
-          (this_mode >= D45_PRED && this_mode <= TM_PRED)) {
+          (this_mode >= RD_D45_PRED && this_mode <= RD_TM_PRED)) {
         if (vp9_mode_order[best_mode_index].ref_frame > INTRA_FRAME)
           continue;
       }
+      mbmi->mode = rd_mode_to_mode(this_mode);
       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
         if (conditional_skipintra(mbmi->mode, best_intra_mode))
             continue;
       }
+
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable, NULL,
                       bsize, tx_cache, best_rd);
 
@@ -3454,10 +3460,10 @@
       mbmi->uv_mode = mode_uv[uv_tx];
 
       rate2 = rate_y + x->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
-      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED)
+      if (this_mode != RD_DC_PRED && this_mode != RD_TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
-    } else if (this_mode == SPLITMV) {
+    } else if (this_mode == RD_SPLITMV) {
       const int is_comp_pred = second_ref_frame > 0;
       int rate;
       int64_t distortion;
@@ -3638,6 +3644,7 @@
           tx_cache[i] = tx_cache[ONLY_4X4];
       }
     } else {
+      mbmi->mode = rd_mode_to_mode(this_mode);
       compmode_cost = vp9_cost_bit(comp_mode_p, second_ref_frame > INTRA_FRAME);
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   tx_cache,
@@ -3745,7 +3752,7 @@
         best_filter_rd[i] = MIN(best_filter_rd[i], this_rd);
     }
 
-    if (this_mode != I4X4_PRED && this_mode != SPLITMV) {
+    if (this_mode != RD_I4X4_PRED && this_mode != RD_SPLITMV) {
       // Store the respective mode distortions for later use.
       if (mode_distortions[this_mode] == -1
           || distortion2 < mode_distortions[this_mode]) {
@@ -3777,7 +3784,7 @@
         best_skip2 = this_skip2;
         best_partition = *x->partition_info;
 
-        if (this_mode == I4X4_PRED || this_mode == SPLITMV)
+        if (this_mode == RD_I4X4_PRED || this_mode == RD_SPLITMV)
           for (i = 0; i < 4; i++)
             best_bmodes[i] = xd->mode_info_context->bmi[i];
 
@@ -3847,7 +3854,7 @@
     /* keep record of best txfm size */
     if (bsize < BLOCK_32X32) {
       if (bsize < BLOCK_16X16) {
-        if (this_mode == SPLITMV || this_mode == I4X4_PRED)
+        if (this_mode == RD_SPLITMV || this_mode == RD_I4X4_PRED)
           tx_cache[ALLOW_8X8] = tx_cache[ONLY_4X4];
         tx_cache[ALLOW_16X16] = tx_cache[ALLOW_8X8];
       }
@@ -3856,7 +3863,7 @@
     if (!mode_excluded && this_rd != INT64_MAX) {
       for (i = 0; i < TX_MODES && tx_cache[i] < INT64_MAX; i++) {
         int64_t adj_rd = INT64_MAX;
-        if (this_mode != I4X4_PRED) {
+        if (this_mode != RD_I4X4_PRED) {
           adj_rd = this_rd + tx_cache[i] - tx_cache[cm->tx_mode];
         } else {
           adj_rd = this_rd;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 866d397..7f2523f 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -93,6 +93,7 @@
 
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct32x32_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)
@@ -103,7 +104,9 @@
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_1_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct16x16_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_idct32x32_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht4x4_add_neon$(ASM)
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_short_iht8x8_add_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_copy_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_avg_neon$(ASM)
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f44cd27..48866d2 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -89,6 +89,18 @@
   unsigned int                fixed_kf_cntr;
 };
 
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+  switch (frame) {
+    case VP8_LAST_FRAME:
+      return VP9_LAST_FLAG;
+    case VP8_GOLD_FRAME:
+      return VP9_GOLD_FLAG;
+    case VP8_ALTR_FRAME:
+      return VP9_ALT_FLAG;
+  }
+  assert(!"Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
 
 static vpx_codec_err_t
 update_error_state(vpx_codec_alg_priv_t                 *ctx,
@@ -853,7 +865,8 @@
     YV12_BUFFER_CONFIG sd;
 
     image2yuvconfig(&frame->img, &sd);
-    vp9_set_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
+                          &sd);
     return VPX_CODEC_OK;
   } else
     return VPX_CODEC_INVALID_PARAM;
@@ -871,7 +884,8 @@
     YV12_BUFFER_CONFIG sd;
 
     image2yuvconfig(&frame->img, &sd);
-    vp9_copy_reference_enc(ctx->cpi, frame->frame_type, &sd);
+    vp9_copy_reference_enc(ctx->cpi,
+                           ref_frame_to_vp9_reframe(frame->frame_type), &sd);
     return VPX_CODEC_OK;
   } else
     return VPX_CODEC_INVALID_PARAM;