Merge "Adding search_site_config struct."
diff --git a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
index e3ea91f..a8730aa 100644
--- a/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
+++ b/vp8/common/arm/neon/buildintrapredictorsmby_neon.asm
@@ -26,6 +26,7 @@
 
 |vp8_build_intra_predictors_mby_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
 
     cmp             r3, #0
     beq             case_dc_pred
@@ -37,8 +38,8 @@
     beq             case_tm_pred
 
 case_dc_pred
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
+    ldr             r4, [sp, #88]       ; Up
+    ldr             r5, [sp, #92]       ; Left
 
     ; Default the DC average to 128
     mov             r12, #128
@@ -143,6 +144,7 @@
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred
     ; Copy down above row
@@ -165,6 +167,7 @@
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
     vst1.u8         {q0}, [r1]!
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_h_pred
@@ -224,6 +227,7 @@
     vst1.u8         {q2}, [r1]!
     vst1.u8         {q3}, [r1]!
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_tm_pred
@@ -293,6 +297,7 @@
     subs            r12, r12, #1
     bne             case_tm_pred_loop
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
     ENDP
@@ -307,6 +312,7 @@
 
 |vp8_build_intra_predictors_mby_s_neon_func| PROC
     push            {r4-r8, lr}
+    vpush           {d8-d15}
 
     mov             r1, r0      ;   unsigned char *ypred_ptr = x->dst.y_buffer; //x->Predictor;
 
@@ -320,8 +326,8 @@
     beq             case_tm_pred_s
 
 case_dc_pred_s
-    ldr             r4, [sp, #24]       ; Up
-    ldr             r5, [sp, #28]       ; Left
+    ldr             r4, [sp, #88]       ; Up
+    ldr             r5, [sp, #92]       ; Left
 
     ; Default the DC average to 128
     mov             r12, #128
@@ -426,6 +432,7 @@
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 case_v_pred_s
     ; Copy down above row
@@ -448,6 +455,8 @@
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
     vst1.u8         {q0}, [r1], r2
+
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_h_pred_s
@@ -507,6 +516,7 @@
     vst1.u8         {q2}, [r1], r2
     vst1.u8         {q3}, [r1], r2
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
 case_tm_pred_s
@@ -576,6 +586,7 @@
     subs            r12, r12, #1
     bne             case_tm_pred_loop_s
 
+    vpop            {d8-d15}
     pop             {r4-r8,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
index 6c29c55..3a39210 100644
--- a/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_0_2x_neon.asm
@@ -22,6 +22,7 @@
 ; r3   stride
 |idct_dequant_0_2x_neon| PROC
     push            {r4, r5}
+    vpush           {d8-d15}
 
     add             r12, r2, #4
     vld1.32         {d2[0]}, [r2], r3
@@ -72,6 +73,7 @@
     vst1.32         {d4[1]}, [r2]
     vst1.32         {d10[1]}, [r0]
 
+    vpop            {d8-d15}
     pop             {r4, r5}
     bx              lr
 
diff --git a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
index d5dce63..8da0fa0 100644
--- a/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
+++ b/vp8/common/arm/neon/idct_dequant_full_2x_neon.asm
@@ -22,6 +22,8 @@
 ; r2    *dst
 ; r3    stride
 |idct_dequant_full_2x_neon| PROC
+    vpush           {d8-d15}
+
     vld1.16         {q0, q1}, [r1]          ; dq (same l/r)
     vld1.16         {q2, q3}, [r0]          ; l q
     add             r0, r0, #32
@@ -184,6 +186,7 @@
     vst1.32         {d3[0]}, [r2]
     vst1.32         {d3[1]}, [r1]
 
+    vpop            {d8-d15}
     bx             lr
 
     ENDP           ; |idct_dequant_full_2x_neon|
diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm
index e44be0a..c4f09c7 100644
--- a/vp8/common/arm/neon/loopfilter_neon.asm
+++ b/vp8/common/arm/neon/loopfilter_neon.asm
@@ -24,10 +24,12 @@
 ; sp    unsigned char thresh,
 |vp8_loop_filter_horizontal_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                     ; duplicate blimit
     vdup.u8     q1, r3                     ; duplicate limit
     sub         r2, r0, r1, lsl #2         ; move src pointer down by 4 lines
-    ldr         r3, [sp, #4]               ; load thresh
+    ldr         r3, [sp, #68]              ; load thresh
     add         r12, r2, r1
     add         r1, r1, r1
 
@@ -52,6 +54,7 @@
     vst1.u8     {q7}, [r2@128], r1              ; store oq0
     vst1.u8     {q8}, [r12@128], r1             ; store oq1
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_y_neon|
 
@@ -64,10 +67,12 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_horizontal_edge_uv_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                      ; duplicate blimit
     vdup.u8     q1, r3                      ; duplicate limit
-    ldr         r12, [sp, #4]               ; load thresh
-    ldr         r2, [sp, #8]                ; load v ptr
+    ldr         r12, [sp, #68]              ; load thresh
+    ldr         r2, [sp, #72]               ; load v ptr
     vdup.u8     q2, r12                     ; duplicate thresh
 
     sub         r3, r0, r1, lsl #2          ; move u pointer down by 4 lines
@@ -104,6 +109,7 @@
     vst1.u8     {d16}, [r0@64]                 ; store u oq1
     vst1.u8     {d17}, [r2@64]                 ; store v oq1
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_horizontal_edge_uv_neon|
 
@@ -120,11 +126,13 @@
 
 |vp8_loop_filter_vertical_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                     ; duplicate blimit
     vdup.u8     q1, r3                     ; duplicate limit
     sub         r2, r0, #4                 ; src ptr down by 4 columns
     add         r1, r1, r1
-    ldr         r3, [sp, #4]               ; load thresh
+    ldr         r3, [sp, #68]              ; load thresh
     add         r12, r2, r1, asr #1
 
     vld1.u8     {d6}, [r2], r1
@@ -194,6 +202,7 @@
     vst4.8      {d14[6], d15[6], d16[6], d17[6]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_y_neon|
 
@@ -210,9 +219,11 @@
 ; sp+4  unsigned char *v
 |vp8_loop_filter_vertical_edge_uv_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     vdup.u8     q0, r2                      ; duplicate blimit
     sub         r12, r0, #4                 ; move u pointer down by 4 columns
-    ldr         r2, [sp, #8]                ; load v ptr
+    ldr         r2, [sp, #72]               ; load v ptr
     vdup.u8     q1, r3                      ; duplicate limit
     sub         r3, r2, #4                  ; move v pointer down by 4 columns
 
@@ -233,7 +244,7 @@
     vld1.u8     {d20}, [r12]
     vld1.u8     {d21}, [r3]
 
-    ldr        r12, [sp, #4]               ; load thresh
+    ldr        r12, [sp, #68]              ; load thresh
 
     ;transpose to 8x16 matrix
     vtrn.32     q3, q7
@@ -281,6 +292,7 @@
     vst4.8      {d10[7], d11[7], d12[7], d13[7]}, [r0]
     vst4.8      {d14[7], d15[7], d16[7], d17[7]}, [r2]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_loop_filter_vertical_edge_uv_neon|
 
diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
index adf848b..6eb0651 100644
--- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
 
 
-    ;EXPORT  |vp8_loop_filter_simple_horizontal_edge_neon|
     EXPORT  |vp8_loop_filter_bhs_neon|
     EXPORT  |vp8_loop_filter_mbhs_neon|
     ARM
@@ -22,7 +21,7 @@
 ; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_horizontal_edge_neon| PROC
-
+    vpush       {d8-d15}
     sub         r3, r0, r1, lsl #1          ; move src pointer down by 2 lines
 
     vld1.u8     {q7}, [r0@128], r1          ; q0
@@ -82,6 +81,7 @@
     vst1.u8     {q6}, [r3@128]              ; store op0
     vst1.u8     {q7}, [r0@128]              ; store oq0
 
+    vpop        {d8-d15}
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_horizontal_edge_neon|
 
diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
index e690df2..78d13c8 100644
--- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
+++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm
@@ -9,7 +9,6 @@
 ;
 
 
-    ;EXPORT  |vp8_loop_filter_simple_vertical_edge_neon|
     EXPORT |vp8_loop_filter_bvs_neon|
     EXPORT |vp8_loop_filter_mbvs_neon|
     ARM
@@ -22,6 +21,8 @@
 ; q1    limit, PRESERVE
 
 |vp8_loop_filter_simple_vertical_edge_neon| PROC
+    vpush       {d8-d15}
+
     sub         r0, r0, #2                  ; move src pointer down by 2 columns
     add         r12, r1, r1
     add         r3, r0, r1
@@ -120,6 +121,7 @@
     vst2.8      {d14[6], d15[6]}, [r0], r12
     vst2.8      {d14[7], d15[7]}, [r3]
 
+    vpop        {d8-d15}
     bx          lr
     ENDP        ; |vp8_loop_filter_simple_vertical_edge_neon|
 
diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm
index f41c156..d200c30 100644
--- a/vp8/common/arm/neon/mbloopfilter_neon.asm
+++ b/vp8/common/arm/neon/mbloopfilter_neon.asm
@@ -28,8 +28,10 @@
 ; sp    unsigned char thresh,
 |vp8_mbloop_filter_horizontal_edge_y_neon| PROC
     push        {lr}
+    vpush       {d8-d15}
+
     add         r1, r1, r1                  ; double stride
-    ldr         r12, [sp, #4]               ; load thresh
+    ldr         r12, [sp, #68]              ; load thresh
     sub         r0, r0, r1, lsl #1          ; move src pointer down by 4 lines
     vdup.u8     q2, r12                     ; thresh
     add         r12, r0, r1,  lsr #1        ; move src pointer up by 1 line
@@ -55,6 +57,7 @@
     vst1.u8     {q8}, [r12@128]            ; store oq1
     vst1.u8     {q9}, [r0@128]             ; store oq2
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_y_neon|
 
@@ -72,10 +75,12 @@
 
 |vp8_mbloop_filter_horizontal_edge_uv_neon| PROC
     push        {lr}
-    ldr         r12, [sp, #4]                 ; load thresh
+    vpush       {d8-d15}
+
+    ldr         r12, [sp, #68]                ; load thresh
     sub         r0, r0, r1, lsl #2            ; move u pointer down by 4 lines
     vdup.u8     q2, r12                       ; thresh
-    ldr         r12, [sp, #8]                 ; load v ptr
+    ldr         r12, [sp, #72]                ; load v ptr
     sub         r12, r12, r1, lsl #2          ; move v pointer down by 4 lines
 
     vld1.u8     {d6}, [r0@64], r1              ; p3
@@ -116,6 +121,7 @@
     vst1.u8     {d18}, [r0@64], r1             ; store u oq2
     vst1.u8     {d19}, [r12@64], r1             ; store v oq2
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_horizontal_edge_uv_neon|
 
@@ -130,7 +136,9 @@
 ; sp    unsigned char thresh,
 |vp8_mbloop_filter_vertical_edge_y_neon| PROC
     push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
+    vpush       {d8-d15}
+
+    ldr         r12, [sp, #68]              ; load thresh
     sub         r0, r0, #4                  ; move src pointer down by 4 columns
     vdup.s8     q2, r12                     ; thresh
     add         r12, r0, r1, lsl #3         ; move src pointer down by 8 lines
@@ -208,6 +216,7 @@
     vst1.8      {d20}, [r0]
     vst1.8      {d21}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_y_neon|
 
@@ -224,10 +233,12 @@
 ; sp+4  unsigned char *v
 |vp8_mbloop_filter_vertical_edge_uv_neon| PROC
     push        {lr}
-    ldr         r12, [sp, #4]               ; load thresh
+    vpush       {d8-d15}
+
+    ldr         r12, [sp, #68]              ; load thresh
     sub         r0, r0, #4                  ; move u pointer down by 4 columns
     vdup.u8     q2, r12                     ; thresh
-    ldr         r12, [sp, #8]               ; load v ptr
+    ldr         r12, [sp, #72]              ; load v ptr
     sub         r12, r12, #4                ; move v pointer down by 4 columns
 
     vld1.u8     {d6}, [r0], r1              ;load u data
@@ -303,6 +314,7 @@
     vst1.8      {d20}, [r0]
     vst1.8      {d21}, [r12]
 
+    vpop        {d8-d15}
     pop         {pc}
     ENDP        ; |vp8_mbloop_filter_vertical_edge_uv_neon|
 
diff --git a/vp8/common/arm/neon/sad16_neon.asm b/vp8/common/arm/neon/sad16_neon.asm
index d7c590e..7197e56 100644
--- a/vp8/common/arm/neon/sad16_neon.asm
+++ b/vp8/common/arm/neon/sad16_neon.asm
@@ -24,6 +24,7 @@
 ; r3    int  ref_stride
 |vp8_sad16x16_neon| PROC
 ;;
+    vpush           {d8-d15}
     vld1.8          {q0}, [r0], r1
     vld1.8          {q4}, [r2], r3
 
@@ -132,6 +133,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
@@ -143,6 +145,8 @@
 ;    unsigned char *ref_ptr,
 ;    int  ref_stride)
 |vp8_sad16x8_neon| PROC
+    vpush           {d8-d15}
+
     vld1.8          {q0}, [r0], r1
     vld1.8          {q4}, [r2], r3
 
@@ -200,6 +204,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/sad8_neon.asm b/vp8/common/arm/neon/sad8_neon.asm
index 23ba6df..6b849d9 100644
--- a/vp8/common/arm/neon/sad8_neon.asm
+++ b/vp8/common/arm/neon/sad8_neon.asm
@@ -25,6 +25,7 @@
 ;    int  ref_stride)
 
 |vp8_sad8x8_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
 
@@ -70,6 +71,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
@@ -82,6 +84,7 @@
 ;    int  ref_stride)
 
 |vp8_sad8x16_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
 
@@ -167,6 +170,7 @@
 
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
@@ -179,6 +183,7 @@
 ;    int  ref_stride)
 
 |vp8_sad4x4_neon| PROC
+    vpush           {d8-d15}
     vld1.8          {d0}, [r0], r1
     vld1.8          {d8}, [r2], r3
 
@@ -202,6 +207,7 @@
     vpaddl.u32      d0, d1
     vmov.32         r0, d0[0]
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.asm b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
index 67d2ab0..87ca887 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.asm
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.asm
@@ -37,12 +37,14 @@
 ; result of the multiplication that is needed in IDCT.
 
 |vp8_short_idct4x4llm_neon| PROC
+    vpush           {d8-d15}
+
     adr             r12, idct_coeff
     vld1.16         {q1, q2}, [r0]
     vld1.16         {d0}, [r12]
 
     vswp            d3, d4                  ;q2(vp[4] vp[12])
-    ldr             r0, [sp]                ; stride
+    ldr             r0, [sp, #64]           ; stride
 
     vqdmulh.s16     q3, q2, d0[2]
     vqdmulh.s16     q4, q2, d0[0]
@@ -125,6 +127,7 @@
     vst1.32         d2[0], [r3], r0
     vst1.32         d2[1], [r3], r0
 
+    vpop            {d8-d15}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict16x16_neon.asm b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
index 9fdafd3..dd27719 100644
--- a/vp8/common/arm/neon/sixtappredict16x16_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict16x16_neon.asm
@@ -43,10 +43,11 @@
 
 |vp8_sixtap_predict16x16_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
 
     adr             r12, filter16_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack
+    ldr             r5, [sp, #80]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter16x16_only
@@ -291,6 +292,8 @@
     bne filt_blk2d_sp16x16_outloop_neon
 
     add             sp, sp, #336
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;--------------------
@@ -384,6 +387,7 @@
 
     bne             filt_blk2d_fpo16x16_loop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;--------------------
@@ -482,6 +486,7 @@
 
     bne filt_blk2d_spo16x16_outloop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict4x4_neon.asm b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
index a4222bc..e32e713 100644
--- a/vp8/common/arm/neon/sixtappredict4x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict4x4_neon.asm
@@ -35,10 +35,11 @@
 
 |vp8_sixtap_predict4x4_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}
 
     adr             r12, filter4_coeff
-    ldr             r4, [sp, #8]            ;load parameters from stack
-    ldr             lr, [sp, #12]           ;load parameters from stack
+    ldr             r4, [sp, #72]            ;load parameters from stack
+    ldr             lr, [sp, #76]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter4x4_only
@@ -261,6 +262,7 @@
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
 
+    vpop            {d8-d15}
     pop             {r4, pc}
 
 
@@ -348,6 +350,7 @@
     vst1.32         {d28[0]}, [r1]
     vst1.32         {d28[1]}, [r2]
 
+    vpop            {d8-d15}
     pop             {r4, pc}
 
 
@@ -413,6 +416,7 @@
     vst1.32         {d4[0]}, [r1]
     vst1.32         {d4[1]}, [r2]
 
+    vpop            {d8-d15}
     pop             {r4, pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict8x4_neon.asm b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
index a57ec01..d19bf89 100644
--- a/vp8/common/arm/neon/sixtappredict8x4_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x4_neon.asm
@@ -35,10 +35,11 @@
 
 |vp8_sixtap_predict8x4_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
 
     adr             r12, filter8_coeff
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack
+    ldr             r5, [sp, #80]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x4_only
@@ -297,6 +298,8 @@
     vst1.u8         {d9}, [r4], r5
 
     add             sp, sp, #32
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;--------------------
@@ -392,6 +395,7 @@
     vst1.u8         {d24}, [r4], r5
     vst1.u8         {d25}, [r4], r5
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;---------------------
@@ -464,6 +468,7 @@
     vst1.u8         {d8}, [r4], r5
     vst1.u8         {d9}, [r4], r5
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/sixtappredict8x8_neon.asm b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
index 00ed5ae..4b04925 100644
--- a/vp8/common/arm/neon/sixtappredict8x8_neon.asm
+++ b/vp8/common/arm/neon/sixtappredict8x8_neon.asm
@@ -35,11 +35,11 @@
 
 |vp8_sixtap_predict8x8_neon| PROC
     push            {r4-r5, lr}
-
+    vpush           {d8-d15}
     adr             r12, filter8_coeff
 
-    ldr             r4, [sp, #12]           ;load parameters from stack
-    ldr             r5, [sp, #16]           ;load parameters from stack
+    ldr             r4, [sp, #76]           ;load parameters from stack
+    ldr             r5, [sp, #80]           ;load parameters from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_filter8x8_only
@@ -324,6 +324,8 @@
     bne filt_blk2d_sp8x8_loop_neon
 
     add             sp, sp, #64
+
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;---------------------
@@ -428,6 +430,7 @@
 
     bne             filt_blk2d_fpo8x8_loop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
 ;---------------------
@@ -515,6 +518,7 @@
 
     bne filt_blk2d_spo8x8_loop_neon
 
+    vpop            {d8-d15}
     pop             {r4-r5,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/variance_neon.asm b/vp8/common/arm/neon/variance_neon.asm
index e3b4832..8ecad72 100644
--- a/vp8/common/arm/neon/variance_neon.asm
+++ b/vp8/common/arm/neon/variance_neon.asm
@@ -26,6 +26,7 @@
 ; r3    int  recon_stride
 ; stack unsigned int *sse
 |vp8_variance16x16_neon| PROC
+    vpush           {q5}
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -67,7 +68,7 @@
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -87,6 +88,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
@@ -99,6 +102,8 @@
 ;    int  recon_stride,
 ;   unsigned int *sse)
 |vp8_variance16x8_neon| PROC
+    vpush           {q5}
+
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -137,7 +142,7 @@
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -149,6 +154,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
@@ -162,6 +169,8 @@
 ;   unsigned int *sse)
 
 |vp8_variance8x16_neon| PROC
+    vpush           {q5}
+
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -192,7 +201,7 @@
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -204,6 +213,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
@@ -215,6 +226,8 @@
 ; r3    int  recon_stride
 ; stack unsigned int *sse
 |vp8_variance8x8_neon| PROC
+    vpush           {q5}
+
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -257,7 +270,7 @@
     vadd.u32        q10, q9, q10                ;accumulate sse
     vpaddl.s32      q0, q8                      ;accumulate sum
 
-    ldr             r12, [sp]                   ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vpaddl.u32      q1, q10
     vadd.s64        d0, d0, d1
@@ -269,6 +282,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {q5}
     bx              lr
 
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
index 9d22c52..adc5b7e 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -31,11 +31,12 @@
 
 |vp8_sub_pixel_variance16x16_neon_func| PROC
     push            {r4-r6, lr}
+    vpush           {d8-d15}
 
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
-    ldr             r6, [sp, #24]           ;load *sse from stack
+    ldr             r4, [sp, #80]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #84]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #88]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16_only
@@ -416,6 +417,7 @@
     add             sp, sp, #528
     vmov.32         r0, d0[0]                   ;return
 
+    vpop            {d8-d15}
     pop             {r4-r6,pc}
 
     ENDP
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 155be4f..b0829af 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -31,9 +31,10 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_h_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
 
     mov             r12, #4                  ;loop counter
-    ldr             lr, [sp, #4]           ;load *sse from stack
+    ldr             lr, [sp, #68]            ;load *sse from stack
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
     vmov.i8         q10, #0
@@ -116,6 +117,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -131,11 +134,12 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_v_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
 
     mov             r12, #4                     ;loop counter
 
     vld1.u8         {q0}, [r0], r1              ;load src data
-    ldr             lr, [sp, #4]                ;load *sse from stack
+    ldr             lr, [sp, #68]               ;load *sse from stack
 
     vmov.i8         q8, #0                      ;q8 - sum
     vmov.i8         q9, #0                      ;q9, q10 - sse
@@ -212,6 +216,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -227,10 +233,11 @@
 ;================================================
 |vp8_variance_halfpixvar16x16_hv_neon| PROC
     push            {lr}
+    vpush           {d8-d15}
 
     vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
 
-    ldr             lr, [sp, #4]           ;load *sse from stack
+    ldr             lr, [sp, #68]           ;load *sse from stack
     vmov.i8         q13, #0                      ;q8 - sum
     vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
 
@@ -331,6 +338,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {pc}
     ENDP
 
@@ -349,10 +358,11 @@
 
 |vp8_sub_pixel_variance16x16s_neon| PROC
     push            {r4, lr}
+    vpush           {d8-d15}
 
-    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
-    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #16]           ;load *sse from stack
+    ldr             r4, [sp, #72]           ;load *dst_ptr from stack
+    ldr             r12, [sp, #76]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #80]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             secondpass_bfilter16x16s_only
@@ -566,6 +576,7 @@
     add             sp, sp, #256
     vmov.32         r0, d0[0]                   ;return
 
+    vpop            {d8-d15}
     pop             {r4, pc}
     ENDP
 
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
index f6b6847..9d9f9e0 100644
--- a/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
+++ b/vp8/common/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -26,11 +26,12 @@
 
 |vp8_sub_pixel_variance8x8_neon| PROC
     push            {r4-r5, lr}
+    vpush           {d8-d15}
 
     adr             r12, bilinear_taps_coeff
-    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
-    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
-    ldr             lr, [sp, #20]           ;load *sse from stack
+    ldr             r4, [sp, #76]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #80]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #84]           ;load *sse from stack
 
     cmp             r2, #0                  ;skip first_pass filter if xoffset=0
     beq             skip_firstpass_filter
@@ -210,6 +211,8 @@
     vsub.u32        d0, d1, d10
 
     vmov.32         r0, d0[0]                   ;return
+
+    vpop            {d8-d15}
     pop             {r4-r5, pc}
 
     ENDP
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 5bda786..840cb33 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -65,8 +65,10 @@
 ;                           unsigned char *pred, int pred_stride)
 |vp8_subtract_mby_neon| PROC
     push            {r4-r7}
+    vpush           {d8-d15}
+
     mov             r12, #4
-    ldr             r4, [sp, #16]           ; pred_stride
+    ldr             r4, [sp, #80]           ; pred_stride
     mov             r6, #32                 ; "diff" stride x2
     add             r5, r0, #16             ; second diff pointer
 
@@ -101,6 +103,7 @@
     subs            r12, r12, #1
     bne             subtract_mby_loop
 
+    vpop            {d8-d15}
     pop             {r4-r7}
     bx              lr
     ENDP
@@ -112,9 +115,11 @@
 
 |vp8_subtract_mbuv_neon| PROC
     push            {r4-r7}
-    ldr             r4, [sp, #16]       ; upred
-    ldr             r5, [sp, #20]       ; vpred
-    ldr             r6, [sp, #24]       ; pred_stride
+    vpush           {d8-d15}
+
+    ldr             r4, [sp, #80]       ; upred
+    ldr             r5, [sp, #84]       ; vpred
+    ldr             r6, [sp, #88]       ; pred_stride
     add             r0, r0, #512        ; short *udiff = diff + 256;
     mov             r12, #32            ; "diff" stride x2
     add             r7, r0, #16         ; second diff pointer
@@ -191,6 +196,7 @@
     vst1.16         {q14}, [r0], r12
     vst1.16         {q15}, [r7], r12
 
+    vpop            {d8-d15}
     pop             {r4-r7}
     bx              lr
 
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
index 5b9f11e..d219e2d 100644
--- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -21,6 +21,7 @@
 ;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr,
 ;                             int sz);
 |vp8_memcpy_partial_neon| PROC
+    vpush               {d8-d15}
     ;pld                [r1]                        ;preload pred data
     ;pld                [r1, #128]
     ;pld                [r1, #256]
@@ -64,6 +65,7 @@
     bne             extra_copy_neon_loop
 
 done_copy_neon_loop
+    vpop            {d8-d15}
     bx              lr
     ENDP
 
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
index 55edbf5..f82af3e 100644
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -27,6 +27,8 @@
 ;from vp8_variance().
 
 |vp8_mse16x16_neon| PROC
+    vpush           {q7}
+
     vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
     vmov.i8         q8, #0
     vmov.i8         q9, #0
@@ -62,7 +64,7 @@
     vadd.u32        q7, q7, q8
     vadd.u32        q9, q9, q10
 
-    ldr             r12, [sp]               ;load *sse from stack
+    ldr             r12, [sp, #16]              ;load *sse from stack
 
     vadd.u32        q10, q7, q9
     vpaddl.u32      q1, q10
@@ -71,6 +73,7 @@
     vst1.32         {d0[0]}, [r12]
     vmov.32         r0, d0[0]
 
+    vpop            {q7}
     bx              lr
 
     ENDP
@@ -82,6 +85,8 @@
 ; r2    unsigned char *ref_ptr,
 ; r3    int  recon_stride
 |vp8_get4x4sse_cs_neon| PROC
+    vpush           {q7}
+
     vld1.8          {d0}, [r0], r1              ;Load up source and reference
     vld1.8          {d4}, [r2], r3
     vld1.8          {d1}, [r0], r1
@@ -109,6 +114,8 @@
     vadd.u64        d0, d2, d3
 
     vmov.32         r0, d0[0]
+
+    vpop            {q7}
     bx              lr
 
     ENDP
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index a0666f4..99fd6ca 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -563,33 +563,6 @@
 add_proto qw/unsigned int vp9_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred, unsigned int max_sad";
 specialize qw/vp9_sad4x4_avg/, "$sse_x86inc";
 
-add_proto qw/unsigned int vp9_variance_halfpixvar16x16_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar16x16_h/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance_halfpixvar16x16_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar16x16_v/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance_halfpixvar16x16_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar16x16_hv/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance_halfpixvar64x64_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar64x64_h/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar64x64_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar64x64_v/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar64x64_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar64x64_hv/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar32x32_h/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar32x32_h/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar32x32_v/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar32x32_v/;
-
-add_proto qw/unsigned int vp9_variance_halfpixvar32x32_hv/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance_halfpixvar32x32_hv/;
-
 add_proto qw/void vp9_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
 specialize qw/vp9_sad64x64x3/;
 
diff --git a/vp9/common/vp9_tapify.py b/vp9/common/vp9_tapify.py
deleted file mode 100644
index 99529cf..0000000
--- a/vp9/common/vp9_tapify.py
+++ /dev/null
@@ -1,106 +0,0 @@
-"""
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
-"""
-#!/usr/bin/env python
-import sys,string,os,re,math,numpy
-scale = 2**16
-def dist(p1,p2):
-  x1,y1 = p1
-  x2,y2 = p2
-  if x1==x2 and y1==y2 :
-    return 1.0 
-  return 1/ math.sqrt((x1-x2)*(x1-x2)+(y1-y2)*(y1-y2))
-
-def gettaps(p):
-  def l(b):
-    return int(math.floor(b))
-  def h(b):
-    return int(math.ceil(b))
-  def t(b,p,s):
-    return int((scale*dist(b,p)+s/2)/s)
-  r,c = p
-  ul=[l(r),l(c)]
-  ur=[l(r),h(c)]
-  ll=[h(r),l(c)]
-  lr=[h(r),h(c)]
-  sum = dist(ul,p)+dist(ur,p)+dist(ll,p)+dist(lr,p)
-  t4 = scale - t(ul,p,sum) - t(ur,p,sum) - t(ll,p,sum);
-  return [[ul,t(ul,p,sum)],[ur,t(ur,p,sum)],
-          [ll,t(ll,p,sum)],[lr,t4]]
-
-def print_mb_taps(angle,blocksize):
-  theta = angle / 57.2957795;
-  affine = [[math.cos(theta),-math.sin(theta)],
-            [math.sin(theta),math.cos(theta)]]
-  radius = (float(blocksize)-1)/2
-  print " // angle of",angle,"degrees"
-  for y in range(blocksize) :
-    for x in range(blocksize) :
-      r,c = numpy.dot(affine,[y-radius, x-radius])
-      tps = gettaps([r+radius,c+radius])
-      for t in tps :
-        p,t = t
-        tr,tc = p
-        print " %2d, %2d, %5d, " % (tr,tc,t,),
-      print " // %2d,%2d " % (y,x)
-
-i=float(sys.argv[1])
-while  i <= float(sys.argv[2]) :
-  print_mb_taps(i,float(sys.argv[4]))
-  i=i+float(sys.argv[3])
-"""
-
-taps = []
-pt=dict()
-ptr=dict()
-for y in range(16) :
-  for x in range(16) :
-    r,c = numpy.dot(affine,[y-7.5, x-7.5])
-    tps = gettaps([r+7.5,c+7.5])
-    j=0
-    for tp in tps : 
-      p,i = tp
-      r,c = p
-      pt[y,x,j]= [p,i]
-      try: 
-        ptr[r,j,c].append([y,x])
-      except:
-        ptr[r,j,c]=[[y,x]]
-      j = j+1 
-
-for key in sorted(pt.keys()) :
-  print key,pt[key]
-
-lr = -99
-lj = -99 
-lc = 0
-
-shuf=""
-mask=""
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    if lr != r or lj != j :
-      print "shuf_"+str(lr)+"_"+str(lj)+"_"+shuf.ljust(16,"0"), lc
-      shuf=""
-      lc = 0
-    for i in range(lc,c-1) :
-      shuf = shuf +"0"
-    shuf = shuf + hex(x)[2]
-    lc =c
-    break
-  lr = r
-  lj = j
-#  print r,j,c,ptr[r,j,c]    
-#  print 
-
-for r,j,c in sorted(ptr.keys()) :
-  for y,x in ptr[r,j,c] :
-    print r,j,c,y,x 
-    break
-"""
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index faf710c..385b2eb 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -32,74 +32,6 @@
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_dthread.h"
 
-#define WRITE_RECON_BUFFER 0
-#if WRITE_RECON_BUFFER == 1
-static void recon_write_yuv_frame(const char *name,
-                                  const YV12_BUFFER_CONFIG *s,
-                                  int w, int _h) {
-  FILE *yuv_file = fopen(name, "ab");
-  const uint8_t *src = s->y_buffer;
-  int h = _h;
-
-  do {
-    fwrite(src, w, 1,  yuv_file);
-    src += s->y_stride;
-  } while (--h);
-
-  src = s->u_buffer;
-  h = (_h + 1) >> 1;
-  w = (w + 1) >> 1;
-
-  do {
-    fwrite(src, w, 1,  yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  src = s->v_buffer;
-  h = (_h + 1) >> 1;
-
-  do {
-    fwrite(src, w, 1, yuv_file);
-    src += s->uv_stride;
-  } while (--h);
-
-  fclose(yuv_file);
-}
-#endif
-#if WRITE_RECON_BUFFER == 2
-void write_dx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame) {
-  // write the frame
-  FILE *yframe;
-  int i;
-  char filename[255];
-
-  snprintf(filename, sizeof(filename)-1, "dx\\y%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->y_height; i++)
-    fwrite(frame->y_buffer + i * frame->y_stride,
-           frame->y_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename)-1, "dx\\u%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->u_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-  snprintf(filename, sizeof(filename)-1, "dx\\v%04d.raw", this_frame);
-  yframe = fopen(filename, "wb");
-
-  for (i = 0; i < frame->uv_height; i++)
-    fwrite(frame->v_buffer + i * frame->uv_stride,
-           frame->uv_width, 1, yframe);
-
-  fclose(yframe);
-}
-#endif
-
 void vp9_initialize_dec() {
   static int init_done = 0;
 
@@ -348,15 +280,6 @@
 
   swap_frame_buffers(pbi);
 
-#if WRITE_RECON_BUFFER == 2
-  if (cm->show_frame)
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame);
-  else
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 1000);
-#endif
-
   if (!pbi->do_loopfilter_inline) {
     // If multiple threads are used to decode tiles, then we use those threads
     // to do parallel loopfiltering.
@@ -367,21 +290,6 @@
     }
   }
 
-#if WRITE_RECON_BUFFER == 2
-  if (cm->show_frame)
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 2000);
-  else
-    write_dx_frame_to_file(cm->frame_to_show,
-                           cm->current_video_frame + 3000);
-#endif
-
-#if WRITE_RECON_BUFFER == 1
-  if (cm->show_frame)
-    recon_write_yuv_frame("recon.yuv", cm->frame_to_show,
-                          cm->width, cm->height);
-#endif
-
   vp9_clear_system_state();
 
   cm->last_width = cm->width;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index f04b515..395d26a 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -970,95 +970,73 @@
       cpi->rd.thresh_freq_fact[i][j] = 32;
   }
 
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, \
-            SDX3F, SDX8F, SDX4DF)\
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX3F, SDX8F, SDX4DF)\
     cpi->fn_ptr[BT].sdf            = SDF; \
     cpi->fn_ptr[BT].sdaf           = SDAF; \
     cpi->fn_ptr[BT].vf             = VF; \
     cpi->fn_ptr[BT].svf            = SVF; \
     cpi->fn_ptr[BT].svaf           = SVAF; \
-    cpi->fn_ptr[BT].svf_halfpix_h  = SVFHH; \
-    cpi->fn_ptr[BT].svf_halfpix_v  = SVFHV; \
-    cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
     cpi->fn_ptr[BT].sdx3f          = SDX3F; \
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
       vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      vp9_sub_pixel_avg_variance32x16, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad32x16x4d)
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
 
   BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
       vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      vp9_sub_pixel_avg_variance16x32, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad16x32x4d)
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
 
   BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
       vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      vp9_sub_pixel_avg_variance64x32, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad64x32x4d)
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
 
   BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
       vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      vp9_sub_pixel_avg_variance32x64, NULL, NULL,
-      NULL, NULL, NULL,
-      vp9_sad32x64x4d)
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
 
   BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
       vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
-      vp9_variance_halfpixvar32x32_v,
-      vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
+      vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
       vp9_sad32x32x4d)
 
   BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
       vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
-      vp9_variance_halfpixvar64x64_v,
-      vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
+      vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
       vp9_sad64x64x4d)
 
   BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
       vp9_variance16x16, vp9_sub_pixel_variance16x16,
-      vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
-      vp9_variance_halfpixvar16x16_v,
-      vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+      vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
       vp9_sad16x16x4d)
 
   BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
       vp9_variance16x8, vp9_sub_pixel_variance16x8,
-      vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance16x8,
       vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
 
   BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
       vp9_variance8x16, vp9_sub_pixel_variance8x16,
-      vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance8x16,
       vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
 
   BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
       vp9_variance8x8, vp9_sub_pixel_variance8x8,
-      vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance8x8,
       vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
 
   BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
       vp9_variance8x4, vp9_sub_pixel_variance8x4,
-      vp9_sub_pixel_avg_variance8x4, NULL, NULL,
-      NULL, NULL, vp9_sad8x4x8,
-      vp9_sad8x4x4d)
+      vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
 
   BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
       vp9_variance4x8, vp9_sub_pixel_variance4x8,
-      vp9_sub_pixel_avg_variance4x8, NULL, NULL,
-      NULL, NULL, vp9_sad4x8x8,
-      vp9_sad4x8x4d)
+      vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
 
   BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
       vp9_variance4x4, vp9_sub_pixel_variance4x4,
-      vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+      vp9_sub_pixel_avg_variance4x4,
       vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
 
   cpi->full_search_sad = vp9_full_search_sad;
@@ -1440,77 +1418,67 @@
 }
 #endif
 
-static void scale_and_extend_frame_nonnormative(YV12_BUFFER_CONFIG *src_fb,
-                                                YV12_BUFFER_CONFIG *dst_fb) {
-  const int in_w = src_fb->y_crop_width;
-  const int in_h = src_fb->y_crop_height;
-  const int out_w = dst_fb->y_crop_width;
-  const int out_h = dst_fb->y_crop_height;
-  const int in_w_uv = src_fb->uv_crop_width;
-  const int in_h_uv = src_fb->uv_crop_height;
-  const int out_w_uv = dst_fb->uv_crop_width;
-  const int out_h_uv = dst_fb->uv_crop_height;
+static void scale_and_extend_frame_nonnormative(const YV12_BUFFER_CONFIG *src,
+                                                YV12_BUFFER_CONFIG *dst) {
+  // TODO(dkovalev): replace YV12_BUFFER_CONFIG with vpx_image_t
   int i;
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  const int src_widths[4] = {src->y_crop_width, src->uv_crop_width,
+                             src->uv_crop_width, src->y_crop_width};
+  const int src_heights[4] = {src->y_crop_height, src->uv_crop_height,
+                              src->uv_crop_height, src->y_crop_height};
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
+  const int dst_widths[4] = {dst->y_crop_width, dst->uv_crop_width,
+                             dst->uv_crop_width, dst->y_crop_width};
+  const int dst_heights[4] = {dst->y_crop_height, dst->uv_crop_height,
+                              dst->uv_crop_height, dst->y_crop_height};
 
-  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
-    src_fb->alpha_buffer};
-  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
-    src_fb->alpha_stride};
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    vp9_resize_plane(srcs[i], src_heights[i], src_widths[i], src_strides[i],
+                     dsts[i], dst_heights[i], dst_widths[i], dst_strides[i]);
 
-  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
-    dst_fb->alpha_buffer};
-  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
-    dst_fb->alpha_stride};
-
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    if (i == 0 || i == 3) {
-      // Y and alpha planes
-      vp9_resize_plane(srcs[i], in_h, in_w, src_strides[i],
-                       dsts[i], out_h, out_w, dst_strides[i]);
-    } else {
-      // Chroma planes
-      vp9_resize_plane(srcs[i], in_h_uv, in_w_uv, src_strides[i],
-                       dsts[i], out_h_uv, out_w_uv, dst_strides[i]);
-    }
-  }
   // TODO(hkuang): Call C version explicitly
   // as neon version only expand border size 32.
-  vp8_yv12_extend_frame_borders_c(dst_fb);
+  vp8_yv12_extend_frame_borders_c(dst);
 }
 
-static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb,
-                                   YV12_BUFFER_CONFIG *dst_fb) {
-  const int in_w = src_fb->y_crop_width;
-  const int in_h = src_fb->y_crop_height;
-  const int out_w = dst_fb->y_crop_width;
-  const int out_h = dst_fb->y_crop_height;
+static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
+                                   YV12_BUFFER_CONFIG *dst) {
+  const int src_w = src->y_crop_width;
+  const int src_h = src->y_crop_height;
+  const int dst_w = dst->y_crop_width;
+  const int dst_h = dst->y_crop_height;
+  const uint8_t *const srcs[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
+                                  src->alpha_buffer};
+  const int src_strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
+                              src->alpha_stride};
+  uint8_t *const dsts[4] = {dst->y_buffer, dst->u_buffer, dst->v_buffer,
+                            dst->alpha_buffer};
+  const int dst_strides[4] = {dst->y_stride, dst->uv_stride, dst->uv_stride,
+                              dst->alpha_stride};
   int x, y, i;
 
-  uint8_t *srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer,
-                      src_fb->alpha_buffer};
-  int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride,
-                        src_fb->alpha_stride};
-
-  uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer,
-                      dst_fb->alpha_buffer};
-  int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride,
-                        dst_fb->alpha_stride};
-
-  for (y = 0; y < out_h; y += 16) {
-    for (x = 0; x < out_w; x += 16) {
+  for (y = 0; y < dst_h; y += 16) {
+    for (x = 0; x < dst_w; x += 16) {
       for (i = 0; i < MAX_MB_PLANE; ++i) {
         const int factor = (i == 0 || i == 3 ? 1 : 2);
-        const int x_q4 = x * (16 / factor) * in_w / out_w;
-        const int y_q4 = y * (16 / factor) * in_h / out_h;
+        const int x_q4 = x * (16 / factor) * src_w / dst_w;
+        const int y_q4 = y * (16 / factor) * src_h / dst_h;
         const int src_stride = src_strides[i];
         const int dst_stride = dst_strides[i];
-        uint8_t *src = srcs[i] + y / factor * in_h / out_h * src_stride +
-                                 x / factor * in_w / out_w;
-        uint8_t *dst = dsts[i] + y / factor * dst_stride + x / factor;
+        const uint8_t *src_ptr = srcs[i] + (y / factor) * src_h / dst_h *
+                                     src_stride + (x / factor) * src_w / dst_w;
+        uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor);
 
-        vp9_convolve8(src, src_stride, dst, dst_stride,
-                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * in_w / out_w,
-                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * in_h / out_h,
+        vp9_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
+                      vp9_sub_pel_filters_8[x_q4 & 0xf], 16 * src_w / dst_w,
+                      vp9_sub_pel_filters_8[y_q4 & 0xf], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
       }
     }
@@ -1518,7 +1486,7 @@
 
   // TODO(hkuang): Call C version explicitly
   // as neon version only expand border size 32.
-  vp8_yv12_extend_frame_borders_c(dst_fb);
+  vp8_yv12_extend_frame_borders_c(dst);
 }
 
 static int find_fp_qindex() {
@@ -1697,7 +1665,7 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     const int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, ref_frame)];
-    YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
+    const YV12_BUFFER_CONFIG *const ref = &cm->frame_bufs[idx].buf;
 
     if (ref->y_crop_width != cm->width ||
         ref->y_crop_height != cm->height) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 7496cc0..bbec4da 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1359,108 +1359,89 @@
                           int sad_per_bit, int distance,
                           const vp9_variance_fn_ptr_t *fn_ptr,
                           const MV *center_mv, MV *best_mv) {
+  int r;
   const MACROBLOCKD *const xd = &x->e_mbd;
-  const uint8_t *const what = x->plane[0].src.buf;
-  const int what_stride = x->plane[0].src.stride;
-  const uint8_t *const in_what = xd->plane[0].pre[0].buf;
-  const int in_what_stride = xd->plane[0].pre[0].stride;
-  MV this_mv;
-  unsigned int bestsad = INT_MAX;
-  int r, c;
-  int ref_row = ref_mv->row;
-  int ref_col = ref_mv->col;
-
-  // Apply further limits to prevent us looking using vectors that stretch
-  // beyond the UMV border
-  const int row_min = MAX(ref_row - distance, x->mv_row_min);
-  const int row_max = MIN(ref_row + distance, x->mv_row_max);
-  const int col_min = MAX(ref_col - distance, x->mv_col_min);
-  const int col_max = MIN(ref_col + distance, x->mv_col_max);
-  DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
-  unsigned int sad_array[3];
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int row_min = MAX(ref_mv->row - distance, x->mv_row_min);
+  const int row_max = MIN(ref_mv->row + distance, x->mv_row_max);
+  const int col_min = MAX(ref_mv->col - distance, x->mv_col_min);
+  const int col_max = MIN(ref_mv->col + distance, x->mv_col_max);
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->sdf(what->buf, what->stride,
+      get_buf_from_mv(in_what, ref_mv), in_what->stride, 0x7fffffff) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  *best_mv = *ref_mv;
 
-  // Work out the mid point for the search
-  const uint8_t *bestaddress = &in_what[ref_row * in_what_stride + ref_col];
+  for (r = row_min; r < row_max; ++r) {
+    int c = col_min;
+    const uint8_t *check_here = &in_what->buf[r * in_what->stride + c];
 
-  best_mv->row = ref_row;
-  best_mv->col = ref_col;
+    if (fn_ptr->sdx8f != NULL) {
+      while ((c + 7) < col_max) {
+        int i;
+        unsigned int sads[8];
 
-  // Baseline value at the center
-  bestsad = fn_ptr->sdf(what, what_stride,
-                        bestaddress, in_what_stride, 0x7fffffff)
-            + mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+        fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
 
-  for (r = row_min; r < row_max; r++) {
-    const uint8_t *check_here = &in_what[r * in_what_stride + col_min];
-    this_mv.row = r;
-    c = col_min;
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; i++) {
-        unsigned int thissad = (unsigned int)sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.col = c;
-          thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->row = r;
-            best_mv->col = c;
+        for (i = 0; i < 8; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
-    while ((c + 2) < col_max && fn_ptr->sdx3f != NULL) {
-      int i;
+    if (fn_ptr->sdx3f != NULL) {
+      while ((c + 2) < col_max) {
+        int i;
+        unsigned int sads[3];
 
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+        fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
+                      sads);
 
-      for (i = 0; i < 3; i++) {
-        unsigned int thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.col = c;
-          thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->row = r;
-            best_mv->col = c;
+        for (i = 0; i < 3; ++i) {
+          unsigned int sad = sads[i];
+          if (sad < best_sad) {
+            const MV mv = {r, c};
+            sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              *best_mv = mv;
+            }
           }
+          ++check_here;
+          ++c;
         }
-
-        check_here++;
-        c++;
       }
     }
 
     while (c < col_max) {
-      unsigned int thissad = fn_ptr->sdf(what, what_stride,
-                                         check_here, in_what_stride, bestsad);
-
-      if (thissad < bestsad) {
-        this_mv.col = c;
-        thissad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->row = r;
-          best_mv->col = c;
+      unsigned int sad = fn_ptr->sdf(what->buf, what->stride,
+                                     check_here, in_what->stride, best_sad);
+      if (sad < best_sad) {
+        const MV mv = {r, c};
+        sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+        if (sad < best_sad) {
+          best_sad = sad;
+          *best_mv = mv;
         }
       }
-
-      check_here++;
-      c++;
+      ++check_here;
+      ++c;
     }
   }
-  return bestsad;
+
+  return best_sad;
 }
 
 int vp9_refining_search_sad_c(const MACROBLOCK *x,
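
The rewritten loop addresses reference pixels through the buf_2d struct plus a small helper instead of hand-maintained pointer/stride locals; in libvpx the helper is defined along these lines (reproduced here as a sketch for context, not part of this hunk):

    static INLINE const uint8_t *get_buf_from_mv(const struct buf_2d *buf,
                                                 const MV *mv) {
      return &buf->buf[mv->row * buf->stride + mv->col];
    }

Per row, the sdx8f and sdx3f pointers (when non-NULL) batch 8 and 3 SADs at consecutive columns in a single call, and the scalar sdf tail covers whatever columns remain before col_max.
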
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index 520ee44..ae3c86a 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -276,87 +276,6 @@
 SUBPIX_VAR(64, 64)
 SUBPIX_AVG_VAR(64, 64)
 
-unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_h_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 0,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_v_c(const uint8_t *src_ptr,
-                                              int  source_stride,
-                                              const uint8_t *ref_ptr,
-                                              int  recon_stride,
-                                              unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 0, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar16x16_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance16x16_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar32x32_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance32x32_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
-unsigned int vp9_variance_halfpixvar64x64_hv_c(const uint8_t *src_ptr,
-                                               int  source_stride,
-                                               const uint8_t *ref_ptr,
-                                               int  recon_stride,
-                                               unsigned int *sse) {
-  return vp9_sub_pixel_variance64x64_c(src_ptr, source_stride, 8, 8,
-                                       ref_ptr, recon_stride, sse);
-}
-
 void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                        int height, const uint8_t *ref, int ref_stride) {
   int i, j;
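
The deleted wrappers carried no logic of their own: each forwarded to the generic sub-pixel variance with a fixed half-pel offset, (8, 0) horizontal, (0, 8) vertical, (8, 8) diagonal. Any caller can therefore reach the same result through the generic function (or the svf pointer that remains in vp9_variance_fn_t below); a sketch of the equivalence, with illustrative src/ref names:

    unsigned int sse;
    /* Previously: vp9_variance_halfpixvar16x16_h_c(src, src_stride,
       ref, ref_stride, &sse).  Same half-pel horizontal variance: */
    unsigned int var = vp9_sub_pixel_variance16x16_c(src, src_stride,
                                                     8, 0,  /* half-pel x, integer y */
                                                     ref, ref_stride, &sse);
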
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 4c8be71..152c3d9 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -82,9 +82,6 @@
   vp9_variance_fn_t          vf;
   vp9_subpixvariance_fn_t    svf;
   vp9_subp_avg_variance_fn_t svaf;
-  vp9_variance_fn_t          svf_halfpix_h;
-  vp9_variance_fn_t          svf_halfpix_v;
-  vp9_variance_fn_t          svf_halfpix_hv;
   vp9_sad_multi_fn_t         sdx3f;
   vp9_sad_multi_fn_t         sdx8f;
   vp9_sad_multi_d_fn_t       sdx4df;
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 9e65694..25d5946 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -494,58 +494,3 @@
 
 #undef FNS
 #undef FN
-
-unsigned int vp9_variance_halfpixvar16x16_h_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-  vp9_half_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_sse2(
-  const unsigned char *src_ptr,
-  int  src_pixels_per_line,
-  const unsigned char *dst_ptr,
-  int  dst_pixels_per_line,
-  unsigned int *sse) {
-  int xsum0;
-  unsigned int xxsum0;
-
-  vp9_half_horiz_vert_variance16x_h_sse2(
-    src_ptr, src_pixels_per_line,
-    dst_ptr, dst_pixels_per_line, 16,
-    &xsum0, &xxsum0);
-
-  *sse = xxsum0;
-  return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
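
For reference, the >> 8 in the deleted SSE2 wrappers is the standard variance identity specialized to a 16x16 block: sum((d - mean)^2) = sum(d^2) - (sum(d))^2 / N, with N = 256 = 2^8 pixels. The same computation as a standalone sketch (hypothetical helper name):

    /* Sketch: variance of a 16x16 block from its difference sums.
       sse = sum of squared differences, sum = sum of differences;
       256 pixels make the division a shift by 8. */
    static unsigned int variance_16x16(unsigned int sse, int sum) {
      return sse - (((unsigned int)sum * sum) >> 8);
    }
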