Merge "Modified the inverse walsh to output directly"

diff --git a/build/make/ads2gas.pl b/build/make/ads2gas.pl
index 388133a..cea967f 100755
--- a/build/make/ads2gas.pl
+++ b/build/make/ads2gas.pl

@@ -126,15 +126,14 @@
     # ALIGN directive
     s/ALIGN/.balign/g;
 
-    # Strip ARM
-    s/\sARM/@ ARM/g;
+    # ARM code
+    s/\sARM/.arm/g;
 
-    # Strip REQUIRE8
-    #s/\sREQUIRE8/@ REQUIRE8/g;
-    s/\sREQUIRE8/@ /g;      #EQU cause problem
+    # REQUIRE8 Stack is required to be 8-byte aligned
+    s/\sREQUIRE8/.eabi_attribute Tag_ABI_align_needed, 1/g;
 
-    # Strip PRESERVE8
-    s/\sPRESERVE8/@ PRESERVE8/g;
+    # PRESERVE8 Stack 8-byte align is preserved
+    s/\sPRESERVE8/.eabi_attribute Tag_ABI_align_preserved, 1/g;
 
     # Use PROC and ENDP to give the symbols a .size directive.
     # This makes them show up properly in debugging tools like gdb and valgrind.

diff --git a/configure b/configure
index e6fcc87..cca94a2 100755
--- a/configure
+++ b/configure

@@ -121,6 +121,7 @@
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
+all_platforms="${all_platforms} x86_64-win64-gcc"
 all_platforms="${all_platforms} x86_64-win64-vs8"
 all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} universal-darwin8-gcc"

diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 28cbaed..3f04dab 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h

@@ -104,7 +104,7 @@
         int Version;            // 4 versions of bitstream defined 0 best quality/slowest decode, 3 lowest quality/fastest decode
         int Width;              // width of data passed to the compressor
         int Height;             // height of data passed to the compressor
-        double frame_rate;       // set to passed in framerate
+        struct vpx_rational  timebase;
         int target_bandwidth;    // bandwidth to be used in kilobits per second
 
         int noise_sensitivity;   // parameter used for applying pre processing blur: recommendation 0

diff --git a/vp8/common/x86/filter_x86.c b/vp8/common/x86/filter_x86.c
new file mode 100644
index 0000000..ebab814
--- /dev/null
+++ b/vp8/common/x86/filter_x86.c

@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_4[8][8]) =
+{
+    { 128, 128, 128, 128,   0,   0,   0,   0 },
+    { 112, 112, 112, 112,  16,  16,  16,  16 },
+    {  96,  96,  96,  96,  32,  32,  32,  32 },
+    {  80,  80,  80,  80,  48,  48,  48,  48 },
+    {  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  48,  48,  48,  48,  80,  80,  80,  80 },
+    {  32,  32,  32,  32,  96,  96,  96,  96 },
+    {  16,  16,  16,  16, 112, 112, 112, 112 }
+};
+
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_x86_8[8][16]) =
+{
+    { 128, 128, 128, 128, 128, 128, 128, 128,   0,   0,   0,   0,   0,   0,   0,   0 },
+    { 112, 112, 112, 112, 112, 112, 112, 112,  16,  16,  16,  16,  16,  16,  16,  16 },
+    {  96,  96,  96,  96,  96,  96,  96,  96,  32,  32,  32,  32,  32,  32,  32,  32 },
+    {  80,  80,  80,  80,  80,  80,  80,  80,  48,  48,  48,  48,  48,  48,  48,  48 },
+    {  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64,  64 },
+    {  48,  48,  48,  48,  48,  48,  48,  48,  80,  80,  80,  80,  80,  80,  80,  80 },
+    {  32,  32,  32,  32,  32,  32,  32,  32,  96,  96,  96,  96,  96,  96,  96,  96 },
+    {  16,  16,  16,  16,  16,  16,  16,  16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};

diff --git a/vp8/common/x86/filter_x86.h b/vp8/common/x86/filter_x86.h
new file mode 100644
index 0000000..efcc4dc
--- /dev/null
+++ b/vp8/common/x86/filter_x86.h

@@ -0,0 +1,19 @@
+/*
+ *  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef FILTER_X86_H
+#define FILTER_X86_H
+
+/* x86 assembly specific copy of vp8/common/filter.c:vp8_bilinear_filters with
+ * duplicated values */
+extern const short vp8_bilinear_filters_x86_4[8][8];  /* duplicated 4x */
+extern const short vp8_bilinear_filters_x86_8[8][16]; /* duplicated 8x */
+
+#endif /* FILTER_X86_H */

diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index e68d950..5528fd0 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm

@@ -10,6 +10,7 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
 
 
 %define BLOCK_HEIGHT_WIDTH 4
@@ -222,14 +223,14 @@
     push        rdi
     ; end prolog
 
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
 
         movsxd      rax,        dword ptr arg(2) ;xoffset
         mov         rdi,        arg(4) ;dst_ptr           ;
 
         shl         rax,        5 ; offset * 32
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
 
         add         rax,        rcx ; HFilter
         mov         rsi,        arg(0) ;src_ptr              ;
@@ -379,13 +380,13 @@
     push        rdi
     ; end prolog
 
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
 
         movsxd      rax,        dword ptr arg(2) ;xoffset
         mov         rdi,        arg(4) ;dst_ptr           ;
 
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
         shl         rax,        5
 
         mov         rsi,        arg(0) ;src_ptr              ;
@@ -534,13 +535,13 @@
     push        rdi
     ; end prolog
 
-    ;const short *HFilter = bilinear_filters_mmx[xoffset];
-    ;const short *VFilter = bilinear_filters_mmx[yoffset];
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset];
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset];
 
         movsxd      rax,        dword ptr arg(2) ;xoffset
         mov         rdi,        arg(4) ;dst_ptr           ;
 
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
         shl         rax,        5
 
         add         rax,        rcx ; HFilter
@@ -699,29 +700,3 @@
     times 8 dw 0
 
 
-align 16
-global HIDDEN_DATA(sym(vp8_bilinear_filters_mmx))
-sym(vp8_bilinear_filters_mmx):
-    times 8 dw 128
-    times 8 dw 0
-
-    times 8 dw 112
-    times 8 dw 16
-
-    times 8 dw 96
-    times 8 dw 32
-
-    times 8 dw 80
-    times 8 dw 48
-
-    times 8 dw 64
-    times 8 dw 64
-
-    times 8 dw 48
-    times 8 dw 80
-
-    times 8 dw 32
-    times 8 dw 96
-
-    times 8 dw 16
-    times 8 dw 112

diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index b62b5c6..cb550af 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm

@@ -10,6 +10,7 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
+extern sym(vp8_bilinear_filters_x86_8)
 
 %define BLOCK_HEIGHT_WIDTH 4
 %define VP8_FILTER_WEIGHT 128
@@ -961,7 +962,7 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-extern sym(vp8_bilinear_filters_mmx)
+extern sym(vp8_bilinear_filters_x86_8)
 global sym(vp8_bilinear_predict16x16_sse2)
 sym(vp8_bilinear_predict16x16_sse2):
     push        rbp
@@ -973,10 +974,10 @@
     push        rdi
     ; end prolog
 
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
 
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
         movsxd      rax,        dword ptr arg(2) ;xoffset
 
         cmp         rax,        0      ;skip first_pass filter if xoffset=0
@@ -1230,7 +1231,6 @@
 ;    unsigned char *dst_ptr,
 ;    int dst_pitch
 ;)
-extern sym(vp8_bilinear_filters_mmx)
 global sym(vp8_bilinear_predict8x8_sse2)
 sym(vp8_bilinear_predict8x8_sse2):
     push        rbp
@@ -1245,9 +1245,9 @@
     ALIGN_STACK 16, rax
     sub         rsp, 144                         ; reserve 144 bytes
 
-    ;const short *HFilter = bilinear_filters_mmx[xoffset]
-    ;const short *VFilter = bilinear_filters_mmx[yoffset]
-        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_mmx))]
+    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
+    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
+        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]
 
         mov         rsi,        arg(0) ;src_ptr
         movsxd      rdx,        dword ptr arg(1) ;src_pixels_per_line

diff --git a/vp8/common/x86/subpixel_x86.h b/vp8/common/x86/subpixel_x86.h
index 75991cc..01ec9e2 100644
--- a/vp8/common/x86/subpixel_x86.h
+++ b/vp8/common/x86/subpixel_x86.h

@@ -12,6 +12,8 @@
 #ifndef SUBPIXEL_X86_H
 #define SUBPIXEL_X86_H
 
+#include "filter_x86.h"
+
 /* Note:
  *
  * This platform is commonly built for runtime CPU detection. If you modify

diff --git a/vp8/common/x86/vp8_asm_stubs.c b/vp8/common/x86/vp8_asm_stubs.c
index bce7bc3..a623c69 100644
--- a/vp8/common/x86/vp8_asm_stubs.c
+++ b/vp8/common/x86/vp8_asm_stubs.c

@@ -12,9 +12,9 @@
 #include "vpx_config.h"
 #include "vpx_ports/mem.h"
 #include "vp8/common/subpixel.h"
+#include "filter_x86.h"
 
 extern const short vp8_six_tap_mmx[8][6*8];
-extern const short vp8_bilinear_filters_mmx[8][2*8];
 
 extern void vp8_filter_block1d_h6_mmx
 (

diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 1133efb..31eafcf 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c

@@ -942,16 +942,38 @@
         if (!pc->refresh_golden_frame)
             pc->copy_buffer_to_gf = vp8_read_literal(bc, 2);
 
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't copy to the golden if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->copy_buffer_to_gf = 0;
+#endif
+
         pc->copy_buffer_to_arf = 0;
 
         if (!pc->refresh_alt_ref_frame)
             pc->copy_buffer_to_arf = vp8_read_literal(bc, 2);
 
+#if CONFIG_ERROR_CONCEALMENT
+        /* Assume we shouldn't copy to the alt-ref if the bit is missing */
+        xd->corrupted |= vp8dx_bool_error(bc);
+        if (pbi->ec_active && xd->corrupted)
+            pc->copy_buffer_to_arf = 0;
+#endif
+
+
         pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp8_read_bit(bc);
         pc->ref_frame_sign_bias[ALTREF_FRAME] = vp8_read_bit(bc);
     }
 
     pc->refresh_entropy_probs = vp8_read_bit(bc);
+#if CONFIG_ERROR_CONCEALMENT
+    /* Assume we shouldn't refresh the probabilities if the bit is
+     * missing */
+    xd->corrupted |= vp8dx_bool_error(bc);
+    if (pbi->ec_active && xd->corrupted)
+        pc->refresh_entropy_probs = 0;
+#endif
     if (pc->refresh_entropy_probs == 0)
     {
         vpx_memcpy(&pc->lfc, &pc->fc, sizeof(pc->fc));

diff --git a/vp8/decoder/error_concealment.c b/vp8/decoder/error_concealment.c
index 86fa191..b77d743 100644
--- a/vp8/decoder/error_concealment.c
+++ b/vp8/decoder/error_concealment.c

@@ -491,33 +491,6 @@
     assert(i == 20);
 }
 
-/* Calculates which reference frame type is dominating among the neighbors */
-static MV_REFERENCE_FRAME dominant_ref_frame(EC_BLOCK *neighbors)
-{
-    /* Default to referring to "skip" */
-    MV_REFERENCE_FRAME dom_ref_frame = LAST_FRAME;
-    int max_ref_frame_cnt = 0;
-    int ref_frame_cnt[MAX_REF_FRAMES] = {0};
-    int i;
-    /* Count neighboring reference frames */
-    for (i = 0; i < NUM_NEIGHBORS; ++i)
-    {
-        if (neighbors[i].ref_frame < MAX_REF_FRAMES &&
-            neighbors[i].ref_frame != INTRA_FRAME)
-            ++ref_frame_cnt[neighbors[i].ref_frame];
-    }
-    /* Find maximum */
-    for (i = 0; i < MAX_REF_FRAMES; ++i)
-    {
-        if (ref_frame_cnt[i] > max_ref_frame_cnt)
-        {
-            dom_ref_frame = i;
-            max_ref_frame_cnt = ref_frame_cnt[i];
-        }
-    }
-    return dom_ref_frame;
-}
-
 /* Interpolates all motion vectors for a macroblock from the neighboring blocks'
  * motion vectors.
  */
@@ -591,7 +564,6 @@
 {
     /* Find relevant neighboring blocks */
     EC_BLOCK neighbors[NUM_NEIGHBORS];
-    MV_REFERENCE_FRAME dom_ref_frame;
     int i;
     /* Initialize the array. MAX_REF_FRAMES is interpreted as "doesn't exist" */
     for (i = 0; i < NUM_NEIGHBORS; ++i)
@@ -604,13 +576,11 @@
                                 mb_row, mb_col,
                                 mb_rows, mb_cols,
                                 mb->mode_info_stride);
-    /* Determine the dominant block type */
-    dom_ref_frame = dominant_ref_frame(neighbors);
-    /* Interpolate MVs for the missing blocks
-     * from the dominating MVs */
-    interpolate_mvs(mb, neighbors, dom_ref_frame);
+    /* Interpolate MVs for the missing blocks from the surrounding
+     * blocks which refer to the last frame. */
+    interpolate_mvs(mb, neighbors, LAST_FRAME);
 
-    mb->mode_info_context->mbmi.ref_frame = dom_ref_frame;
+    mb->mode_info_context->mbmi.ref_frame = LAST_FRAME;
     mb->mode_info_context->mbmi.mode = SPLITMV;
     mb->mode_info_context->mbmi.uv_mode = DC_PRED;
     mb->mode_info_context->mbmi.partitioning = 3;

diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c
index 0779549..cf525f4 100644
--- a/vp8/decoder/onyxd_if.c
+++ b/vp8/decoder/onyxd_if.c

@@ -359,28 +359,38 @@
         pbi->fragment_sizes[0] = 0;
     }
 
-    if (pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
+    if (!pbi->ec_active &&
+        pbi->num_fragments <= 1 && pbi->fragment_sizes[0] == 0)
     {
-       /* This is used to signal that we are missing frames.
-        * We do not know if the missing frame(s) was supposed to update
-        * any of the reference buffers, but we act conservative and
-        * mark only the last buffer as corrupted.
-        */
-        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
-
         /* If error concealment is disabled we won't signal missing frames
          * to the decoder.
          */
-        if (!pbi->ec_active)
+        if (cm->fb_idx_ref_cnt[cm->lst_fb_idx] > 1)
         {
-            /* Signal that we have no frame to show. */
-            cm->show_frame = 0;
-
-            pbi->num_fragments = 0;
-
-            /* Nothing more to do. */
-            return 0;
+            /* The last reference shares buffer with another reference
+             * buffer. Move it to its own buffer before setting it as
+             * corrupt, otherwise we will make multiple buffers corrupt.
+             */
+            const int prev_idx = cm->lst_fb_idx;
+            cm->fb_idx_ref_cnt[prev_idx]--;
+            cm->lst_fb_idx = get_free_fb(cm);
+            vp8_yv12_copy_frame_ptr(&cm->yv12_fb[prev_idx],
+                                    &cm->yv12_fb[cm->lst_fb_idx]);
         }
+        /* This is used to signal that we are missing frames.
+         * We do not know if the missing frame(s) was supposed to update
+         * any of the reference buffers, but we act conservative and
+         * mark only the last buffer as corrupted.
+         */
+        cm->yv12_fb[cm->lst_fb_idx].corrupted = 1;
+
+        /* Signal that we have no frame to show. */
+        cm->show_frame = 0;
+
+        pbi->num_fragments = 0;
+
+        /* Nothing more to do. */
+        return 0;
     }
 
 #if HAVE_ARMV7

diff --git a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
index 30513f91..5b7e8f6 100644
--- a/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm
+++ b/vp8/encoder/arm/armv5te/boolhuff_armv5te.asm

@@ -13,6 +13,7 @@
     EXPORT |vp8_encode_bool|
     EXPORT |vp8_stop_encode|
     EXPORT |vp8_encode_value|
+    IMPORT |vp8_validate_buffer_arm|
 
     INCLUDE asm_enc_offsets.asm
 
@@ -22,6 +23,20 @@
 
     AREA    |.text|, CODE, READONLY
 
+    ; macro for validating write buffer position
+    ; needs vp8_writer in r0
+    ; start shall not be in r1
+    MACRO
+    VALIDATE_POS $start, $pos
+    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
+    ldr  r2, [r0, #vp8_writer_buffer_end]
+    ldr  r3, [r0, #vp8_writer_error]
+    mov  r1, $pos
+    mov  r0, $start
+    bl   vp8_validate_buffer_arm
+    pop  {r0-r3, r12, lr}
+    MEND
+
 ; r0 BOOL_CODER *br
 ; r1 unsigned char *source
 ; r2 unsigned char *source_end
@@ -43,7 +58,7 @@
 ; r1 int bit
 ; r2 int probability
 |vp8_encode_bool| PROC
-    push    {r4-r9, lr}
+    push    {r4-r10, lr}
 
     mov     r4, r2
 
@@ -106,6 +121,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r1, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r9, r1                 ; validate_buffer at pos
+
     strb    r7, [r9, r4]                ; w->buffer[w->pos++]
 
 token_count_lt_zero
@@ -114,7 +132,7 @@
     str     r2, [r0, #vp8_writer_lowvalue]
     str     r5, [r0, #vp8_writer_range]
     str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r9, pc}
+    pop     {r4-r10, pc}
     ENDP
 
 ; r0 BOOL_CODER *br
@@ -179,6 +197,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r1, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r9, r1                 ; validate_buffer at pos
+
     strb    r7, [r9, r4]                ; w->buffer[w->pos++]
 
 token_count_lt_zero_se
@@ -198,7 +219,7 @@
 ; r1 int data
 ; r2 int bits
 |vp8_encode_value| PROC
-    push    {r4-r11, lr}
+    push    {r4-r12, lr}
 
     mov     r10, r2
 
@@ -270,6 +291,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r9, r11                ; validate_buffer at pos
+
     strb    r7, [r9, r4]                ; w->buffer[w->pos++]
 
 token_count_lt_zero_ev
@@ -281,7 +305,7 @@
     str     r2, [r0, #vp8_writer_lowvalue]
     str     r5, [r0, #vp8_writer_range]
     str     r3, [r0, #vp8_writer_count]
-    pop     {r4-r11, pc}
+    pop     {r4-r12, pc}
     ENDP
 
     END

diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
index 933717c..a1cd467 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_armv5.asm

@@ -10,6 +10,7 @@
 
 
     EXPORT |vp8cx_pack_tokens_armv5|
+    IMPORT |vp8_validate_buffer_arm|
 
     INCLUDE asm_enc_offsets.asm
 
@@ -19,6 +20,22 @@
 
     AREA    |.text|, CODE, READONLY
 
+
+    ; macro for validating write buffer position
+    ; needs vp8_writer in r0
+    ; start shall not be in r1
+    MACRO
+    VALIDATE_POS $start, $pos
+    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
+    ldr  r2, [r0, #vp8_writer_buffer_end]
+    ldr  r3, [r0, #vp8_writer_error]
+    mov  r1, $pos
+    mov  r0, $start
+    bl   vp8_validate_buffer_arm
+    pop  {r0-r3, r12, lr}
+    MEND
+
+
 ; r0 vp8_writer *w
 ; r1 const TOKENEXTRA *p
 ; r2 int xcount
@@ -26,11 +43,11 @@
 ; s0 vp8_extra_bits
 ; s1 vp8_coef_tree
 |vp8cx_pack_tokens_armv5| PROC
-    push    {r4-r11, lr}
+    push    {r4-r12, lr}
+    sub     sp, sp, #16
 
     ; Add size of xcount * sizeof (TOKENEXTRA) to get stop
     ;  sizeof (TOKENEXTRA) is 8
-    sub     sp, sp, #12
     add     r2, r1, r2, lsl #3          ; stop = p + xcount*sizeof(TOKENEXTRA)
     str     r2, [sp, #0]
     str     r3, [sp, #8]                ; save vp8_coef_encodings
@@ -57,7 +74,7 @@
     subne   r8, r8, #1                  ; --n
 
     rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
 
     ; v is kept in r12 during the token pack loop
     lsl     r12, r6, r4                ; r12 = v << 32 - n
@@ -128,12 +145,15 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]
 
     ; r10 is used earlier in the loop, but r10 is used as
     ; temp variable here.  So after r10 is used, reload
     ; vp8_coef_tree_dcd into r10
-    ldr     r10, [sp, #52]              ; vp8_coef_tree
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
 
 token_count_lt_zero
     lsl     r2, r2, r6                  ; lowvalue <<= shift
@@ -142,7 +162,7 @@
     bne     token_loop
 
     ldrb    r6, [r1, #tokenextra_token] ; t
-    ldr     r7, [sp, #48]               ; vp8_extra_bits
+    ldr     r7, [sp, #56]               ; vp8_extra_bits
     ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
     ;  element.  Here vp8_extra_bit_struct == 16
     add     r12, r7, r6, lsl #4         ; b = vp8_extra_bits + t
@@ -223,6 +243,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
     ldr     r10, [sp, #4]               ; b->tree
 extra_count_lt_zero
@@ -271,7 +294,10 @@
     lsr     r6, r2, #24                 ; lowvalue >> 24
     add     r12, r4, #1                 ; w->pos++
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]
+    str     r12, [r0, #vp8_writer_pos]
+
+    VALIDATE_POS r7, r12               ; validate_buffer at pos
+
     strb    r6, [r7, r4]
 end_count_zero
 skip_extra_bits
@@ -284,8 +310,8 @@
     str     r2, [r0, #vp8_writer_lowvalue]
     str     r5, [r0, #vp8_writer_range]
     str     r3, [r0, #vp8_writer_count]
-    add     sp, sp, #12
-    pop     {r4-r11, pc}
+    add     sp, sp, #16
+    pop     {r4-r12, pc}
     ENDP
 
     END

diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
index 82bf71f..1fa5e6c 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_mbrow_armv5.asm

@@ -10,6 +10,7 @@
 
 
     EXPORT |vp8cx_pack_mb_row_tokens_armv5|
+    IMPORT |vp8_validate_buffer_arm|
 
     INCLUDE asm_enc_offsets.asm
 
@@ -19,6 +20,21 @@
 
     AREA    |.text|, CODE, READONLY
 
+
+    ; macro for validating write buffer position
+    ; needs vp8_writer in r0
+    ; start shall not be in r1
+    MACRO
+    VALIDATE_POS $start, $pos
+    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
+    ldr  r2, [r0, #vp8_writer_buffer_end]
+    ldr  r3, [r0, #vp8_writer_error]
+    mov  r1, $pos
+    mov  r0, $start
+    bl   vp8_validate_buffer_arm
+    pop  {r0-r3, r12, lr}
+    MEND
+
 ; r0 VP8_COMP *cpi
 ; r1 vp8_writer *w
 ; r2 vp8_coef_encodings
@@ -26,7 +42,7 @@
 ; s0 vp8_coef_tree
 
 |vp8cx_pack_mb_row_tokens_armv5| PROC
-    push    {r4-r11, lr}
+    push    {r4-r12, lr}
     sub     sp, sp, #24
 
     ; Compute address of cpi->common.mb_rows
@@ -79,7 +95,7 @@
     subne   r8, r8, #1                  ; --n
 
     rsb     r4, r8, #32                 ; 32-n
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
+    ldr     r10, [sp, #64]              ; vp8_coef_tree
 
     ; v is kept in r12 during the token pack loop
     lsl     r12, r6, r4                 ; r12 = v << 32 - n
@@ -150,12 +166,15 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]
 
     ; r10 is used earlier in the loop, but r10 is used as
     ; temp variable here.  So after r10 is used, reload
     ; vp8_coef_tree_dcd into r10
-    ldr     r10, [sp, #60]              ; vp8_coef_tree
+    ldr     r10, [sp, #64]              ; vp8_coef_tree
 
 token_count_lt_zero
     lsl     r2, r2, r6                  ; lowvalue <<= shift
@@ -245,6 +264,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
     ldr     r10, [sp, #4]               ; b->tree
 extra_count_lt_zero
@@ -293,7 +315,10 @@
     lsr     r6, r2, #24                 ; lowvalue >> 24
     add     r12, r4, #1                 ; w->pos++
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]
+    str     r12, [r0, #vp8_writer_pos]
+
+    VALIDATE_POS r7, r12               ; validate_buffer at pos
+
     strb    r6, [r7, r4]
 end_count_zero
 skip_extra_bits
@@ -314,7 +339,7 @@
     str     r5, [r0, #vp8_writer_range]
     str     r3, [r0, #vp8_writer_count]
     add     sp, sp, #24
-    pop     {r4-r11, pc}
+    pop     {r4-r12, pc}
     ENDP
 
 _VP8_COMP_common_

diff --git a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
index c061b2f..3a183aa 100644
--- a/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm
+++ b/vp8/encoder/arm/armv5te/vp8_packtokens_partitions_armv5.asm

@@ -10,6 +10,7 @@
 
 
     EXPORT |vp8cx_pack_tokens_into_partitions_armv5|
+    IMPORT |vp8_validate_buffer_arm|
 
     INCLUDE asm_enc_offsets.asm
 
@@ -19,17 +20,31 @@
 
     AREA    |.text|, CODE, READONLY
 
+    ; macro for validating write buffer position
+    ; needs vp8_writer in r0
+    ; start shall not be in r1
+    MACRO
+    VALIDATE_POS $start, $pos
+    push {r0-r3, r12, lr}        ; rest of regs are preserved by subroutine call
+    ldr  r2, [r0, #vp8_writer_buffer_end]
+    ldr  r3, [r0, #vp8_writer_error]
+    mov  r1, $pos
+    mov  r0, $start
+    bl   vp8_validate_buffer_arm
+    pop  {r0-r3, r12, lr}
+    MEND
+
 ; r0 VP8_COMP *cpi
 ; r1 unsigned char *cx_data
-; r2 int num_part
-; r3 *size
+; r2 const unsigned char *cx_data_end
+; r3 int num_part
 ; s0 vp8_coef_encodings
 ; s1 vp8_extra_bits,
-; s2 const vp8_tree_index *,
+; s2 const vp8_tree_index *
 
 |vp8cx_pack_tokens_into_partitions_armv5| PROC
-    push    {r4-r11, lr}
-    sub     sp, sp, #44
+    push    {r4-r12, lr}
+    sub     sp, sp, #40
 
     ; Compute address of cpi->common.mb_rows
     ldr     r4, _VP8_COMP_common_
@@ -39,31 +54,26 @@
     ldr     r5, [r4, r6]                ; load up mb_rows
 
     str     r5, [sp, #36]               ; save mb_rows
-    str     r1, [sp, #24]               ; save cx_data
-    str     r2, [sp, #20]               ; save num_part
-    str     r3, [sp, #8]                ; save *size
-
-    ; *size = 3*(num_part -1 );
-    sub     r2, r2, #1                  ; num_part - 1
-    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
-    str     r2, [r3]
-
-    add     r2, r2, r1                  ; cx_data + *size
-    str     r2, [sp, #40]               ; ptr
+    str     r1, [sp, #24]               ; save ptr = cx_data
+    str     r3, [sp, #20]               ; save num_part
+    str     r2, [sp, #8]                ; save cx_data_end
 
     ldr     r4, _VP8_COMP_tplist_
     add     r4, r0, r4
     ldr     r7, [r4, #0]                ; dereference cpi->tp_list
     str     r7, [sp, #32]               ; store start of cpi->tp_list
 
-    ldr     r11, _VP8_COMP_bc2_         ; load up vp8_writer out of cpi
+    ldr     r11, _VP8_COMP_bc_          ; load up vp8_writer out of cpi
     add     r0, r0, r11
 
     mov     r11, #0
     str     r11, [sp, #28]              ; i
 
 numparts_loop
-    ldr     r10, [sp, #40]              ; ptr
+    ldr     r2, _vp8_writer_sz_         ; load up sizeof(vp8_writer)
+    add     r0, r2                      ; bc[i + 1]
+
+    ldr     r10, [sp, #24]              ; ptr
     ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
     subs    r5, r5, r11                 ; move start point with each partition
                                         ; mb_rows starts at i
@@ -72,6 +82,10 @@
     ; Reset all of the VP8 Writer data for each partition that
     ; is processed.
     ; start_encode
+
+    ldr     r3, [sp, #8]
+    str     r3, [r0, #vp8_writer_buffer_end]
+
     mov     r2, #0                      ; vp8_writer_lowvalue
     mov     r5, #255                    ; vp8_writer_range
     mvn     r3, #23                     ; vp8_writer_count
@@ -182,6 +196,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]
 
     ; r10 is used earlier in the loop, but r10 is used as
@@ -277,6 +294,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
     ldr     r10, [sp, #4]               ; b->tree
 extra_count_lt_zero
@@ -320,12 +340,15 @@
     bne     end_count_zero
 
     ldr     r4, [r0, #vp8_writer_pos]
-    mvn     r3, #7
+    mvn     r3, #7                      ; count = -8
     ldr     r7, [r0, #vp8_writer_buffer]
     lsr     r6, r2, #24                 ; lowvalue >> 24
     add     r12, r4, #1                 ; w->pos++
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
-    str     r12, [r0, #0x10]
+    str     r12, [r0, #vp8_writer_pos]
+
+    VALIDATE_POS r7, r12                ; validate_buffer at pos
+
     strb    r6, [r7, r4]
 end_count_zero
 skip_extra_bits
@@ -401,6 +424,9 @@
     bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
     str     r11, [r0, #vp8_writer_pos]
     sub     r3, r3, #8                  ; count -= 8
+
+    VALIDATE_POS r10, r11               ; validate_buffer at pos
+
     strb    r7, [r10, r4]               ; w->buffer[w->pos++]
 
 token_count_lt_zero_se
@@ -409,33 +435,10 @@
     subs    r12, r12, #1
     bne     stop_encode_loop
 
-    ldr     r10, [sp, #8]               ; *size
-    ldr     r11, [r10]
     ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
-    add     r11, r11, r4                ; *size += w->pos
-    str     r11, [r10]
-
-    ldr     r9, [sp, #20]               ; num_parts
-    sub     r9, r9, #1
-    ldr     r10, [sp, #28]              ; i
-    cmp     r10, r9                     ; if(i<(num_part - 1))
-    bge     skip_write_partition
-
-    ldr     r12, [sp, #40]              ; ptr
+    ldr     r12, [sp, #24]              ; ptr
     add     r12, r12, r4                ; ptr += w->pos
-    str     r12, [sp, #40]
-
-    ldr     r9, [sp, #24]               ; cx_data
-    mov     r8, r4, asr #8
-    strb    r4, [r9, #0]
-    strb    r8, [r9, #1]
-    mov     r4, r4, asr #16
-    strb    r4, [r9, #2]
-
-    add     r9, r9, #3                  ; cx_data += 3
-    str     r9, [sp, #24]
-
-skip_write_partition
+    str     r12, [sp, #24]
 
     ldr     r11, [sp, #28]              ; i
     ldr     r10, [sp, #20]              ; num_parts
@@ -451,9 +454,8 @@
     cmp     r10, r11
     bgt     numparts_loop
 
-
-    add     sp, sp, #44
-    pop     {r4-r11, pc}
+    add     sp, sp, #40
+    pop     {r4-r12, pc}
     ENDP
 
 _VP8_COMP_common_
@@ -462,7 +464,9 @@
     DCD     vp8_common_mb_rows
 _VP8_COMP_tplist_
     DCD     vp8_comp_tplist
-_VP8_COMP_bc2_
-    DCD     vp8_comp_bc2
+_VP8_COMP_bc_
+    DCD     vp8_comp_bc
+_vp8_writer_sz_
+    DCD     vp8_writer_sz
 
     END

diff --git a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
index 0ca7438..f329f8f 100644
--- a/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm
+++ b/vp8/encoder/arm/armv6/vp8_subtract_armv6.asm

@@ -72,22 +72,23 @@
 ; r0    short *diff
 ; r1    unsigned char *usrc
 ; r2    unsigned char *vsrc
-; r3    unsigned char *pred
-; stack int stride
+; r3    int src_stride
+; sp    unsigned char *upred
+; sp    unsigned char *vpred
+; sp    int pred_stride
 |vp8_subtract_mbuv_armv6| PROC
 
-    stmfd   sp!, {r4-r12, lr}
+    stmfd   sp!, {r4-r11}
 
     add     r0, r0, #512        ; set *diff point to Cb
-    add     r3, r3, #256        ; set *pred point to Cb
-
     mov     r4, #8              ; loop count
-    ldr     r5, [sp, #40]       ; stride
+    ldr     r5, [sp, #32]       ; upred
+    ldr     r12, [sp, #40]      ; pred_stride
 
     ; Subtract U block
 loop_u
-    ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r3], #4        ; pred      (A)
+    ldr     r6, [r1]            ; usrc      (A)
+    ldr     r7, [r5]            ; upred     (A)
 
     uxtb16  r8, r6              ; [s2 | s0] (A)
     uxtb16  r9, r7              ; [p2 | p0] (A)
@@ -97,8 +98,8 @@
     usub16  r6, r8, r9          ; [d2 | d0] (A)
     usub16  r7, r10, r11        ; [d3 | d1] (A)
 
-    ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r3], #4       ; pred      (B)
+    ldr     r10, [r1, #4]       ; usrc      (B)
+    ldr     r11, [r5, #4]       ; upred     (B)
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
@@ -114,7 +115,8 @@
     usub16  r6, r8, r9          ; [d2 | d0] (B)
     usub16  r7, r10, r11        ; [d3 | d1] (B)
 
-    add     r1, r1, r5          ; update usrc pointer
+    add     r1, r1, r3          ; update usrc pointer
+    add     r5, r5, r12         ; update upred pointer
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
@@ -125,12 +127,13 @@
 
     bne     loop_u
 
+    ldr     r5, [sp, #36]       ; vpred
     mov     r4, #8              ; loop count
 
     ; Subtract V block
 loop_v
-    ldr     r6, [r2]            ; src       (A)
-    ldr     r7, [r3], #4        ; pred      (A)
+    ldr     r6, [r2]            ; vsrc      (A)
+    ldr     r7, [r5]            ; vpred     (A)
 
     uxtb16  r8, r6              ; [s2 | s0] (A)
     uxtb16  r9, r7              ; [p2 | p0] (A)
@@ -140,8 +143,8 @@
     usub16  r6, r8, r9          ; [d2 | d0] (A)
     usub16  r7, r10, r11        ; [d3 | d1] (A)
 
-    ldr     r10, [r2, #4]       ; src       (B)
-    ldr     r11, [r3], #4       ; pred      (B)
+    ldr     r10, [r2, #4]       ; vsrc      (B)
+    ldr     r11, [r5, #4]       ; vpred     (B)
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
@@ -157,7 +160,8 @@
     usub16  r6, r8, r9          ; [d2 | d0] (B)
     usub16  r7, r10, r11        ; [d3 | d1] (B)
 
-    add     r2, r2, r5          ; update vsrc pointer
+    add     r2, r2, r3          ; update vsrc pointer
+    add     r5, r5, r12         ; update vpred pointer
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
@@ -168,23 +172,25 @@
 
     bne     loop_v
 
-    ldmfd   sp!, {r4-r12, pc}
+    ldmfd   sp!, {r4-r11}
+    bx      lr
 
     ENDP
 
 
 ; r0    short *diff
 ; r1    unsigned char *src
-; r2    unsigned char *pred
-; r3    int stride
+; r2    int src_stride
+; r3    unsigned char *pred
+; sp    int pred_stride
 |vp8_subtract_mby_armv6| PROC
 
     stmfd   sp!, {r4-r11}
-
+    ldr     r12, [sp, #32]      ; pred_stride
     mov     r4, #16
 loop
     ldr     r6, [r1]            ; src       (A)
-    ldr     r7, [r2], #4        ; pred      (A)
+    ldr     r7, [r3]            ; pred      (A)
 
     uxtb16  r8, r6              ; [s2 | s0] (A)
     uxtb16  r9, r7              ; [p2 | p0] (A)
@@ -195,7 +201,7 @@
     usub16  r7, r10, r11        ; [d3 | d1] (A)
 
     ldr     r10, [r1, #4]       ; src       (B)
-    ldr     r11, [r2], #4       ; pred      (B)
+    ldr     r11, [r3, #4]       ; pred      (B)
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (A)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (A)
@@ -212,7 +218,7 @@
     usub16  r7, r10, r11        ; [d3 | d1] (B)
 
     ldr     r10, [r1, #8]       ; src       (C)
-    ldr     r11, [r2], #4       ; pred      (C)
+    ldr     r11, [r3, #8]       ; pred      (C)
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (B)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (B)
@@ -229,10 +235,10 @@
     usub16  r7, r10, r11        ; [d3 | d1] (C)
 
     ldr     r10, [r1, #12]      ; src       (D)
-    ldr     r11, [r2], #4       ; pred      (D)
+    ldr     r11, [r3, #12]      ; pred      (D)
 
-    pkhbt   r8, r6, r7, lsl #16  ; [d1 | d0] (C)
-    pkhtb   r9, r7, r6, asr #16  ; [d3 | d2] (C)
+    pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (C)
+    pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (C)
 
     str     r8, [r0], #4        ; diff      (C)
     uxtb16  r8, r10             ; [s2 | s0] (D)
@@ -245,7 +251,8 @@
     usub16  r6, r8, r9          ; [d2 | d0] (D)
     usub16  r7, r10, r11        ; [d3 | d1] (D)
 
-    add     r1, r1, r3          ; update src pointer
+    add     r1, r1, r2          ; update src pointer
+    add     r3, r3, r12         ; update pred pointer
 
     pkhbt   r8, r6, r7, lsl #16 ; [d1 | d0] (D)
     pkhtb   r9, r7, r6, asr #16 ; [d3 | d2] (D)
@@ -257,7 +264,7 @@
     bne     loop
 
     ldmfd   sp!, {r4-r11}
-    mov     pc, lr
+    bx      lr
 
     ENDP
 

diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c
index 9089663..17a941b 100644
--- a/vp8/encoder/arm/boolhuff_arm.c
+++ b/vp8/encoder/arm/boolhuff_arm.c

@@ -10,7 +10,7 @@
 
 
 #include "vp8/encoder/boolhuff.h"
-#include "vp8/common/blockd.h"
+#include "vpx/internal/vpx_codec_internal.h"
 
 const unsigned int vp8_prob_cost[256] =
 {
@@ -32,3 +32,10 @@
     22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
 };
 
+int vp8_validate_buffer_arm(const unsigned char *start,
+                            size_t               len,
+                            const unsigned char *end,
+                            struct vpx_internal_error_info *error)
+{
+    return validate_buffer(start, len, end, error);
+}

diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
index 68c2950..91a328c 100644
--- a/vp8/encoder/arm/neon/subtract_neon.asm
+++ b/vp8/encoder/arm/neon/subtract_neon.asm

@@ -61,19 +61,24 @@
 
 
 ;==========================================
-;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, int src_stride
+;                           unsigned char *pred, int pred_stride)
 |vp8_subtract_mby_neon| PROC
+    push            {r4-r7}
     mov             r12, #4
+    ldr             r4, [sp, #16]           ; pred_stride
+    mov             r6, #32                 ; "diff" stride x2
+    add             r5, r0, #16             ; second diff pointer
 
 subtract_mby_loop
-    vld1.8          {q0}, [r1], r3          ;load src
-    vld1.8          {q1}, [r2]!             ;load pred
-    vld1.8          {q2}, [r1], r3
-    vld1.8          {q3}, [r2]!
-    vld1.8          {q4}, [r1], r3
-    vld1.8          {q5}, [r2]!
-    vld1.8          {q6}, [r1], r3
-    vld1.8          {q7}, [r2]!
+    vld1.8          {q0}, [r1], r2          ;load src
+    vld1.8          {q1}, [r3], r4          ;load pred
+    vld1.8          {q2}, [r1], r2
+    vld1.8          {q3}, [r3], r4
+    vld1.8          {q4}, [r1], r2
+    vld1.8          {q5}, [r3], r4
+    vld1.8          {q6}, [r1], r2
+    vld1.8          {q7}, [r3], r4
 
     vsubl.u8        q8, d0, d2
     vsubl.u8        q9, d1, d3
@@ -84,46 +89,53 @@
     vsubl.u8        q14, d12, d14
     vsubl.u8        q15, d13, d15
 
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
+    vst1.16         {q8}, [r0], r6          ;store diff
+    vst1.16         {q9}, [r5], r6
+    vst1.16         {q10}, [r0], r6
+    vst1.16         {q11}, [r5], r6
+    vst1.16         {q12}, [r0], r6
+    vst1.16         {q13}, [r5], r6
+    vst1.16         {q14}, [r0], r6
+    vst1.16         {q15}, [r5], r6
 
     subs            r12, r12, #1
     bne             subtract_mby_loop
 
+    pop             {r4-r7}
     bx              lr
     ENDP
 
 ;=================================
-;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+;void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+;                         int src_stride, unsigned char *upred,
+;                         unsigned char *vpred, int pred_stride)
+
 |vp8_subtract_mbuv_neon| PROC
-    ldr             r12, [sp]
+    push            {r4-r7}
+    ldr             r4, [sp, #16]       ; upred
+    ldr             r5, [sp, #20]       ; vpred
+    ldr             r6, [sp, #24]       ; pred_stride
+    add             r0, r0, #512        ; short *udiff = diff + 256;
+    mov             r12, #32            ; "diff" stride x2
+    add             r7, r0, #16         ; second diff pointer
 
 ;u
-    add             r0, r0, #512        ;   short *udiff = diff + 256;
-    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
-
-    vld1.8          {d0}, [r1], r12         ;load src
-    vld1.8          {d1}, [r3]!             ;load pred
-    vld1.8          {d2}, [r1], r12
-    vld1.8          {d3}, [r3]!
-    vld1.8          {d4}, [r1], r12
-    vld1.8          {d5}, [r3]!
-    vld1.8          {d6}, [r1], r12
-    vld1.8          {d7}, [r3]!
-    vld1.8          {d8}, [r1], r12
-    vld1.8          {d9}, [r3]!
-    vld1.8          {d10}, [r1], r12
-    vld1.8          {d11}, [r3]!
-    vld1.8          {d12}, [r1], r12
-    vld1.8          {d13}, [r3]!
-    vld1.8          {d14}, [r1], r12
-    vld1.8          {d15}, [r3]!
+    vld1.8          {d0}, [r1], r3      ;load usrc
+    vld1.8          {d1}, [r4], r6      ;load upred
+    vld1.8          {d2}, [r1], r3
+    vld1.8          {d3}, [r4], r6
+    vld1.8          {d4}, [r1], r3
+    vld1.8          {d5}, [r4], r6
+    vld1.8          {d6}, [r1], r3
+    vld1.8          {d7}, [r4], r6
+    vld1.8          {d8}, [r1], r3
+    vld1.8          {d9}, [r4], r6
+    vld1.8          {d10}, [r1], r3
+    vld1.8          {d11}, [r4], r6
+    vld1.8          {d12}, [r1], r3
+    vld1.8          {d13}, [r4], r6
+    vld1.8          {d14}, [r1], r3
+    vld1.8          {d15}, [r4], r6
 
     vsubl.u8        q8, d0, d1
     vsubl.u8        q9, d2, d3
@@ -134,32 +146,32 @@
     vsubl.u8        q14, d12, d13
     vsubl.u8        q15, d14, d15
 
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
+    vst1.16         {q8}, [r0], r12     ;store diff
+    vst1.16         {q9}, [r7], r12
+    vst1.16         {q10}, [r0], r12
+    vst1.16         {q11}, [r7], r12
+    vst1.16         {q12}, [r0], r12
+    vst1.16         {q13}, [r7], r12
+    vst1.16         {q14}, [r0], r12
+    vst1.16         {q15}, [r7], r12
 
 ;v
-    vld1.8          {d0}, [r2], r12         ;load src
-    vld1.8          {d1}, [r3]!             ;load pred
-    vld1.8          {d2}, [r2], r12
-    vld1.8          {d3}, [r3]!
-    vld1.8          {d4}, [r2], r12
-    vld1.8          {d5}, [r3]!
-    vld1.8          {d6}, [r2], r12
-    vld1.8          {d7}, [r3]!
-    vld1.8          {d8}, [r2], r12
-    vld1.8          {d9}, [r3]!
-    vld1.8          {d10}, [r2], r12
-    vld1.8          {d11}, [r3]!
-    vld1.8          {d12}, [r2], r12
-    vld1.8          {d13}, [r3]!
-    vld1.8          {d14}, [r2], r12
-    vld1.8          {d15}, [r3]!
+    vld1.8          {d0}, [r2], r3      ;load vsrc
+    vld1.8          {d1}, [r5], r6      ;load vpred
+    vld1.8          {d2}, [r2], r3
+    vld1.8          {d3}, [r5], r6
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d5}, [r5], r6
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d7}, [r5], r6
+    vld1.8          {d8}, [r2], r3
+    vld1.8          {d9}, [r5], r6
+    vld1.8          {d10}, [r2], r3
+    vld1.8          {d11}, [r5], r6
+    vld1.8          {d12}, [r2], r3
+    vld1.8          {d13}, [r5], r6
+    vld1.8          {d14}, [r2], r3
+    vld1.8          {d15}, [r5], r6
 
     vsubl.u8        q8, d0, d1
     vsubl.u8        q9, d2, d3
@@ -170,16 +182,18 @@
     vsubl.u8        q14, d12, d13
     vsubl.u8        q15, d14, d15
 
-    vst1.16         {q8}, [r0]!             ;store diff
-    vst1.16         {q9}, [r0]!
-    vst1.16         {q10}, [r0]!
-    vst1.16         {q11}, [r0]!
-    vst1.16         {q12}, [r0]!
-    vst1.16         {q13}, [r0]!
-    vst1.16         {q14}, [r0]!
-    vst1.16         {q15}, [r0]!
+    vst1.16         {q8}, [r0], r12     ;store diff
+    vst1.16         {q9}, [r7], r12
+    vst1.16         {q10}, [r0], r12
+    vst1.16         {q11}, [r7], r12
+    vst1.16         {q12}, [r0], r12
+    vst1.16         {q13}, [r7], r12
+    vst1.16         {q14}, [r0], r12
+    vst1.16         {q15}, [r7], r12
 
+    pop             {r4-r7}
     bx              lr
+
     ENDP
 
     END

diff --git a/vp8/encoder/asm_enc_offsets.c b/vp8/encoder/asm_enc_offsets.c
index d05dab4..2e9ca72 100644
--- a/vp8/encoder/asm_enc_offsets.c
+++ b/vp8/encoder/asm_enc_offsets.c

@@ -50,6 +50,7 @@
 DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
 DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
 DEFINE(vp8_writer_buffer_end,                   offsetof(vp8_writer, buffer_end));
+DEFINE(vp8_writer_error,                        offsetof(vp8_writer, error));
 
 DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
 DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
@@ -69,7 +70,8 @@
 
 DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
 DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
-DEFINE(vp8_comp_bc2,                            offsetof(VP8_COMP, bc2));
+DEFINE(vp8_comp_bc ,                            offsetof(VP8_COMP, bc));
+DEFINE(vp8_writer_sz ,                          sizeof(vp8_writer));
 
 DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
 DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));

diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 748b607..5eea39c 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c

@@ -109,7 +109,7 @@
 {
     VP8_COMMON *const x = & cpi->common;
 
-    vp8_writer *const w = & cpi->bc;
+    vp8_writer *const w = cpi->bc;
 
     {
         vp8_prob Pnew   [VP8_YMODES-1];
@@ -221,6 +221,11 @@
                     w->buffer[x] += 1;
                 }
 
+                validate_buffer(w->buffer + w->pos,
+                                1,
+                                w->buffer_end,
+                                w->error);
+
                 w->buffer[w->pos++] = (lowvalue >> (24 - offset));
                 lowvalue <<= offset;
                 shift = count;
@@ -281,6 +286,11 @@
                             w->buffer[x] += 1;
                         }
 
+                        validate_buffer(w->buffer + w->pos,
+                                        1,
+                                        w->buffer_end,
+                                        w->error);
+
                         w->buffer[w->pos++] = (lowvalue >> (24 - offset));
                         lowvalue <<= offset;
                         shift = count;
@@ -329,6 +339,12 @@
                 if (!++count)
                 {
                     count = -8;
+
+                    validate_buffer(w->buffer + w->pos,
+                                    1,
+                                    w->buffer_end,
+                                    w->error);
+
                     w->buffer[w->pos++] = (lowvalue >> 24);
                     lowvalue &= 0xffffff;
                 }
@@ -358,20 +374,21 @@
 
 }
 
-static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, unsigned char * cx_data_end, int num_part, int *size)
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data,
+                                          unsigned char * cx_data_end,
+                                          int num_part)
 {
 
     int i;
     unsigned char *ptr = cx_data;
     unsigned char *ptr_end = cx_data_end;
     unsigned int shift;
-    vp8_writer *w = &cpi->bc2;
-    *size = 3 * (num_part - 1);
-    cpi->partition_sz[0] += *size;
-    ptr = cx_data + (*size);
+    vp8_writer *w;
+    ptr = cx_data;
 
     for (i = 0; i < num_part; i++)
     {
+        w = cpi->bc + i + 1;
         vp8_start_encode(w, ptr, ptr_end);
         {
             unsigned int split;
@@ -581,17 +598,7 @@
         }
 
         vp8_stop_encode(w);
-        *size +=   w->pos;
-
-        /* The first partition size is set earlier */
-        cpi->partition_sz[i + 1] = w->pos;
-
-        if (i < (num_part - 1))
-        {
-            write_partition_size(cx_data, w->pos);
-            cx_data += 3;
-            ptr += w->pos;
-        }
+        ptr += w->pos;
     }
 }
 
@@ -664,6 +671,11 @@
                         w->buffer[x] += 1;
                     }
 
+                    validate_buffer(w->buffer + w->pos,
+                                    1,
+                                    w->buffer_end,
+                                    w->error);
+
                     w->buffer[w->pos++] = (lowvalue >> (24 - offset));
                     lowvalue <<= offset;
                     shift = count;
@@ -724,6 +736,11 @@
                                 w->buffer[x] += 1;
                             }
 
+                            validate_buffer(w->buffer + w->pos,
+                                            1,
+                                            w->buffer_end,
+                                            w->error);
+
                             w->buffer[w->pos++] = (lowvalue >> (24 - offset));
                             lowvalue <<= offset;
                             shift = count;
@@ -770,6 +787,12 @@
                     if (!++count)
                     {
                         count = -8;
+
+                        validate_buffer(w->buffer + w->pos,
+                                        1,
+                                        w->buffer_end,
+                                        w->error);
+
                         w->buffer[w->pos++] = (lowvalue >> 24);
                         lowvalue &= 0xffffff;
                     }
@@ -860,7 +883,7 @@
 static void pack_inter_mode_mvs(VP8_COMP *const cpi)
 {
     VP8_COMMON *const pc = & cpi->common;
-    vp8_writer *const w = & cpi->bc;
+    vp8_writer *const w = cpi->bc;
     const MV_CONTEXT *mvc = pc->fc.mvc;
 
     const int *const rfct = cpi->count_mb_ref_frame_usage;
@@ -1075,7 +1098,7 @@
 
 static void write_kfmodes(VP8_COMP *cpi)
 {
-    vp8_writer *const bc = & cpi->bc;
+    vp8_writer *const bc = cpi->bc;
     const VP8_COMMON *const c = & cpi->common;
     /* const */
     MODE_INFO *m = c->mi;
@@ -1405,7 +1428,7 @@
 static void update_coef_probs(VP8_COMP *cpi)
 {
     int i = 0;
-    vp8_writer *const w = & cpi->bc;
+    vp8_writer *const w = cpi->bc;
     int savings = 0;
 
     vp8_clear_system_state(); //__asm emms;
@@ -1551,7 +1574,7 @@
     int i, j;
     VP8_HEADER oh;
     VP8_COMMON *const pc = & cpi->common;
-    vp8_writer *const bc = & cpi->bc;
+    vp8_writer *const bc = cpi->bc;
     MACROBLOCKD *const xd = & cpi->mb.e_mbd;
     int extra_bytes_packed = 0;
 
@@ -1566,6 +1589,8 @@
 
     mb_feature_data_bits = vp8_mb_feature_data_bits;
 
+    bc[0].error = &pc->error;
+
     validate_buffer(cx_data, 3, cx_data_end, &cpi->common.error);
     cx_data += 3;
 
@@ -1844,7 +1869,9 @@
 
     vp8_stop_encode(bc);
 
-    oh.first_partition_length_in_bytes = cpi->bc.pos;
+    cx_data += bc->pos;
+
+    oh.first_partition_length_in_bytes = cpi->bc->pos;
 
     /* update frame tag */
     {
@@ -1858,34 +1885,58 @@
         dest[2] = v >> 16;
     }
 
-    *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc.pos;
+    *size = VP8_HEADER_SIZE + extra_bytes_packed + cpi->bc->pos;
+
     cpi->partition_sz[0] = *size;
 
     if (pc->multi_token_partition != ONE_PARTITION)
     {
-        int num_part;
-        int asize;
-        num_part = 1 << pc->multi_token_partition;
+        int num_part = 1 << pc->multi_token_partition;
 
-        pack_tokens_into_partitions(cpi, cx_data + bc->pos, cx_data_end, num_part, &asize);
+        /* partition size table at the end of first partition */
+        cpi->partition_sz[0] += 3 * (num_part - 1);
+        *size += 3 * (num_part - 1);
 
-        *size += asize;
+        validate_buffer(cx_data, 3 * (num_part - 1), cx_data_end,
+                        &pc->error);
+
+        for(i = 1; i < num_part + 1; i++)
+        {
+            cpi->bc[i].error = &pc->error;
+        }
+
+        pack_tokens_into_partitions(cpi, cx_data + 3 * (num_part - 1),
+                                    cx_data_end, num_part);
+
+        for(i = 1; i < num_part; i++)
+        {
+            cpi->partition_sz[i] = cpi->bc[i].pos;
+            write_partition_size(cx_data, cpi->partition_sz[i]);
+            cx_data += 3;
+            *size += cpi->partition_sz[i]; /* add to total */
+        }
+
+        /* add last partition to total size */
+        cpi->partition_sz[i] = cpi->bc[i].pos;
+        *size += cpi->partition_sz[i];
     }
     else
     {
-        vp8_start_encode(&cpi->bc2, cx_data + bc->pos, cx_data_end);
+        bc[1].error = &pc->error;
+
+        vp8_start_encode(&cpi->bc[1], cx_data, cx_data_end);
 
 #if CONFIG_MULTITHREAD
         if (cpi->b_multi_threaded)
-            pack_mb_row_tokens(cpi, &cpi->bc2);
+            pack_mb_row_tokens(cpi, &cpi->bc[1]);
         else
 #endif
-            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
+            pack_tokens(&cpi->bc[1], cpi->tok, cpi->tok_count);
 
-        vp8_stop_encode(&cpi->bc2);
+        vp8_stop_encode(&cpi->bc[1]);
 
-        *size += cpi->bc2.pos;
-        cpi->partition_sz[1] = cpi->bc2.pos;
+        *size += cpi->bc[1].pos;
+        cpi->partition_sz[1] = cpi->bc[1].pos;
     }
 }
 

diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
index 8a875a5..9007ced 100644
--- a/vp8/encoder/bitstream.h
+++ b/vp8/encoder/bitstream.h

@@ -17,23 +17,27 @@
                              vp8_token *,
                              vp8_extra_bit_struct *,
                              const vp8_tree_index *);
-void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *, unsigned char *, int , int *,
-        vp8_token *,
-        vp8_extra_bit_struct *,
-        const vp8_tree_index *);
+void vp8cx_pack_tokens_into_partitions_armv5(VP8_COMP *,
+                                             unsigned char * cx_data,
+                                             const unsigned char *cx_data_end,
+                                             int num_parts,
+                                             vp8_token *,
+                                             vp8_extra_bit_struct *,
+                                             const vp8_tree_index *);
 void vp8cx_pack_mb_row_tokens_armv5(VP8_COMP *cpi, vp8_writer *w,
                                     vp8_token *,
                                     vp8_extra_bit_struct *,
                                     const vp8_tree_index *);
 # define pack_tokens(a,b,c)                  \
     vp8cx_pack_tokens_armv5(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
-# define pack_tokens_into_partitions(a,b,unused,c,d)  \
+# define pack_tokens_into_partitions(a,b,c,d)  \
     vp8cx_pack_tokens_into_partitions_armv5(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
 # define pack_mb_row_tokens(a,b)               \
     vp8cx_pack_mb_row_tokens_armv5(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
 #else
-# define pack_tokens(a,b,c)                  pack_tokens_c(a,b,c)
-# define pack_tokens_into_partitions(a,b,c,d,e)  pack_tokens_into_partitions_c(a,b,c,d,e)
+# define pack_tokens(a,b,c)                    pack_tokens_c(a,b,c)
+# define pack_tokens_into_partitions(a,b,c,d)  pack_tokens_into_partitions_c(a,b,c,d)
 # define pack_mb_row_tokens(a,b)               pack_mb_row_tokens_c(a,b)
 #endif
+
 #endif

diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index d89d74e..a3b800a 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c

@@ -100,7 +100,7 @@
     RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
 
     ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
-        x->e_mbd.predictor, b->src_stride);
+        b->src_stride, x->e_mbd.predictor, 16);
 
     vp8_transform_intra_mby(x);
 
@@ -115,7 +115,9 @@
 {
     RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd);
 
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
+        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
+        &x->e_mbd.predictor[320], 8);
 
     vp8_transform_mbuv(x);
 

diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 80c32df..e9042e1 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c

@@ -48,12 +48,12 @@
     }
 }
 
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                         int src_stride, unsigned char *upred,
+                         unsigned char *vpred, int pred_stride)
 {
     short *udiff = diff + 256;
     short *vdiff = diff + 320;
-    unsigned char *upred = pred + 256;
-    unsigned char *vpred = pred + 320;
 
     int r, c;
 
@@ -65,8 +65,8 @@
         }
 
         udiff += 8;
-        upred += 8;
-        usrc  += stride;
+        upred += pred_stride;
+        usrc  += src_stride;
     }
 
     for (r = 0; r < 8; r++)
@@ -77,12 +77,13 @@
         }
 
         vdiff += 8;
-        vpred += 8;
-        vsrc  += stride;
+        vpred += pred_stride;
+        vsrc  += src_stride;
     }
 }
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
+                        unsigned char *pred, int pred_stride)
 {
     int r, c;
 
@@ -94,8 +95,8 @@
         }
 
         diff += 16;
-        pred += 16;
-        src  += stride;
+        pred += pred_stride;
+        src  += src_stride;
     }
 }
 
@@ -103,8 +104,11 @@
 {
     BLOCK *b = &x->block[0];
 
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
+        b->src_stride, x->e_mbd.predictor, 16);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer,
+        x->src.v_buffer, x->src.uv_stride, &x->e_mbd.predictor[256],
+        &x->e_mbd.predictor[320], 8);
 }
 
 static void build_dcblock(MACROBLOCK *x)
@@ -641,7 +645,8 @@
 
     vp8_build_inter16x16_predictors_mby(&x->e_mbd);
 
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src),
+        b->src_stride, x->e_mbd.predictor, 16);
 
     transform_mby(x);
 

diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 8fa457a..597a57b 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h

@@ -28,11 +28,13 @@
     void (sym)(BLOCK *be,BLOCKD *bd, int pitch)
 
 #define prototype_submby(sym) \
-    void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride)
+    void (sym)(short *diff, unsigned char *src, int src_stride, \
+        unsigned char *pred, int pred_stride)
 
 #define prototype_submbuv(sym) \
     void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\
-               unsigned char *pred, int stride)
+               int src_stride, unsigned char *upred, unsigned char *vpred,\
+               int pred_stride)
 
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/encodemb_x86.h"

diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index a4849c6..c122d03 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c

@@ -395,7 +395,7 @@
 
 void vp8_write_mvprobs(VP8_COMP *cpi)
 {
-    vp8_writer *const w  = & cpi->bc;
+    vp8_writer *const w  = cpi->bc;
     MV_CONTEXT *mvc = cpi->common.fc.mvc;
     int flags[2] = {0, 0};
 #ifdef ENTROPY_STATS

diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 23e3050..8d19d1e 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c

@@ -1276,7 +1276,7 @@
     // pass.
     vp8_new_frame_rate(cpi, 10000000.0 * cpi->twopass.total_stats->count / cpi->twopass.total_stats->duration);
 
-    cpi->output_frame_rate = cpi->oxcf.frame_rate;
+    cpi->output_frame_rate = cpi->frame_rate;
     cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats->duration * cpi->oxcf.target_bandwidth / 10000000.0) ;
     cpi->twopass.bits_left -= (int64_t)(cpi->twopass.total_stats->duration * two_pass_min_rate / 10000000.0);
 
@@ -3061,7 +3061,7 @@
 
         // Calculate Average bits per frame.
         //av_bits_per_frame = cpi->twopass.bits_left/(double)(cpi->twopass.total_stats->count - cpi->common.current_video_frame);
-        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate);
+        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate);
         //if ( av_bits_per_frame < 0.0 )
         //  av_bits_per_frame = 0.0
 
@@ -3123,7 +3123,7 @@
         }
         else
         {
-            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+            int64_t clip_bits = (int64_t)(cpi->twopass.total_stats->count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->frame_rate));
             int64_t over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
 
             if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||                                               // If triggered last time the threshold for triggering again is reduced

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 1d00e67..4df2a78 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -1436,7 +1436,7 @@
         cpi->mt_sync_range = 16;
 #endif
 
-        vpx_free(cpi->tplist);
+    vpx_free(cpi->tplist);
 
     CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
 }
@@ -1470,8 +1470,8 @@
     if(framerate < .1)
         framerate = 30;
 
-    cpi->oxcf.frame_rate        = framerate;
-    cpi->output_frame_rate      = cpi->oxcf.frame_rate;
+    cpi->frame_rate             = framerate;
+    cpi->output_frame_rate      = framerate;
     cpi->per_frame_bandwidth    = (int)(cpi->oxcf.target_bandwidth /
                                   cpi->output_frame_rate);
     cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
@@ -1527,6 +1527,18 @@
     cm->version = oxcf->Version;
     vp8_setup_version(cm);
 
+    /* frame rate is not available on the first frame, as it's derived from
+     * the observed timestamps. The actual value used here doesn't matter
+     * too much, as it will adapt quickly. If the reciprocal of the timebase
+     * seems like a reasonable framerate, then use that as a guess, otherwise
+     * use 30.
+     */
+    cpi->frame_rate = (double)(oxcf->timebase.den) /
+                      (double)(oxcf->timebase.num);
+
+    if (cpi->frame_rate > 180)
+        cpi->frame_rate = 30;
+
     // change includes all joint functionality
     vp8_change_config(ptr, oxcf);
 
@@ -1787,7 +1799,7 @@
                     cpi->oxcf.target_bandwidth, 1000);
 
     // Set up frame rate and related parameters rate control values.
-    vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+    vp8_new_frame_rate(cpi, cpi->frame_rate);
 
     // Set absolute upper and lower quality limits
     cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
@@ -2408,7 +2420,7 @@
         {
             extern int count_mb_seg[4];
             FILE *f = fopen("modes.stt", "a");
-            double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+            double dr = (double)cpi->frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
             fprintf(f, "intra_mode in Intra Frames:\n");
             fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
             fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
@@ -4856,7 +4868,7 @@
     {
         double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth
             *cpi->oxcf.two_pass_vbrmin_section / 100);
-        cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->oxcf.frame_rate);
+        cpi->twopass.bits_left += (int64_t)(two_pass_min_rate / cpi->frame_rate);
     }
 }
 #endif
@@ -4928,7 +4940,7 @@
     int64_t store_reg[8];
 #endif
     VP8_COMP *cpi = (VP8_COMP *) ptr;
-    VP8_COMMON *cm = &cpi->common;
+    VP8_COMMON *cm;
     struct vpx_usec_timer  tsctimer;
     struct vpx_usec_timer  ticktimer;
     struct vpx_usec_timer  cmptimer;
@@ -4937,12 +4949,14 @@
     if (!cpi)
         return -1;
 
-    if (setjmp(cpi->common.error.jmp)){
+    cm = &cpi->common;
+
+    if (setjmp(cpi->common.error.jmp))
+    {
         cpi->common.error.setjmp = 0;
         return VPX_CODEC_CORRUPT_FRAME;
     }
 
-    cpi->bc.error = &cpi->common.error;
     cpi->common.error.setjmp = 1;
 
 #if HAVE_ARMV7
@@ -5092,7 +5106,7 @@
                 if(interval > 10000000.0)
                     interval = 10000000;
 
-                avg_duration = 10000000.0 / cpi->oxcf.frame_rate;
+                avg_duration = 10000000.0 / cpi->frame_rate;
                 avg_duration *= (interval - avg_duration + this_duration);
                 avg_duration /= interval;
 

diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index a0828a4..7382de4 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h

@@ -314,8 +314,7 @@
 
     MACROBLOCK mb;
     VP8_COMMON common;
-    vp8_writer bc, bc2;
-    // bool_writer *bc2;
+    vp8_writer bc[9]; // one boolcoder for each partition
 
     VP8_CONFIG oxcf;
 
@@ -418,6 +417,7 @@
 
     int buffered_mode;
 
+    double frame_rate;
     int64_t buffer_level;
     int bits_off_target;
 

diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index e8abf84..77e4553 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c

@@ -296,7 +296,7 @@
 
 void vp8_auto_select_speed(VP8_COMP *cpi)
 {
-    int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+    int milliseconds_for_compress = (int)(1000000 / cpi->frame_rate);
 
     milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
 
@@ -552,7 +552,7 @@
     int d;
 
     ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src),
-                                   mb->e_mbd.predictor, mb->block[0].src_stride );
+        mb->block[0].src_stride,  mb->e_mbd.predictor, 16);
 
     // Fdct and building the 2nd order block
     for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
@@ -800,7 +800,8 @@
 {
     vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
 
     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -816,7 +817,8 @@
 {
     vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
     ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+        x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+        &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
 
     vp8_transform_mbuv(x);
     vp8_quantize_mbuv(x);
@@ -845,8 +847,8 @@
         RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv)
                      (&x->e_mbd);
         ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
-                      x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor,
-                      x->src.uv_stride);
+                      x->src.u_buffer, x->src.v_buffer, x->src.uv_stride,
+                      &x->e_mbd.predictor[256], &x->e_mbd.predictor[320], 8);
         vp8_transform_mbuv(x);
         vp8_quantize_mbuv(x);
 

diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index 4ce16ce..75e8aa3c 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm

@@ -73,74 +73,71 @@
     pop         rbp
     ret
 
-;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
 global sym(vp8_subtract_mby_mmx)
 sym(vp8_subtract_mby_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
+    SHADOW_ARGS_TO_STACK 5
     push rsi
     push rdi
     ; end prolog
 
+    mov         rdi,        arg(0)          ;diff
+    mov         rsi,        arg(1)          ;src
+    movsxd      rdx,        dword ptr arg(2);src_stride
+    mov         rax,        arg(3)          ;pred
+    push        rbx
+    movsxd      rbx,        dword ptr arg(4);pred_stride
 
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
+    pxor        mm0,        mm0
+    mov         rcx,        16
 
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            16
-            pxor        mm0,            mm0
 
 .submby_loop:
+    movq        mm1,        [rsi]
+    movq        mm3,        [rax]
 
-            movq        mm1,            [rsi]
-            movq        mm3,            [rax]
+    movq        mm2,        mm1
+    movq        mm4,        mm3
 
-            movq        mm2,            mm1
-            movq        mm4,            mm3
+    punpcklbw   mm1,        mm0
+    punpcklbw   mm3,        mm0
 
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
+    punpckhbw   mm2,        mm0
+    punpckhbw   mm4,        mm0
 
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
+    psubw       mm1,        mm3
+    psubw       mm2,        mm4
 
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
+    movq        [rdi],      mm1
+    movq        [rdi+8],    mm2
 
-            movq        [rdi],          mm1
-            movq        [rdi+8],        mm2
+    movq        mm1,        [rsi+8]
+    movq        mm3,        [rax+8]
 
+    movq        mm2,        mm1
+    movq        mm4,        mm3
 
-            movq        mm1,            [rsi+8]
-            movq        mm3,            [rax+8]
+    punpcklbw   mm1,        mm0
+    punpcklbw   mm3,        mm0
 
-            movq        mm2,            mm1
-            movq        mm4,            mm3
+    punpckhbw   mm2,        mm0
+    punpckhbw   mm4,        mm0
 
-            punpcklbw   mm1,            mm0
-            punpcklbw   mm3,            mm0
+    psubw       mm1,        mm3
+    psubw       mm2,        mm4
 
-            punpckhbw   mm2,            mm0
-            punpckhbw   mm4,            mm0
+    movq        [rdi+16],   mm1
+    movq        [rdi+24],   mm2
+    add         rdi,        32
+    lea         rax,        [rax+rbx]
+    lea         rsi,        [rsi+rdx]
+    dec         rcx
+    jnz         .submby_loop
 
-            psubw       mm1,            mm3
-            psubw       mm2,            mm4
-
-            movq        [rdi+16],       mm1
-            movq        [rdi+24],       mm2
-
-
-            add         rdi,            32
-            add         rax,            16
-
-            lea         rsi,            [rsi+rdx]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
+    pop rbx
     pop rdi
     pop rsi
     ; begin epilog
@@ -149,281 +146,75 @@
     ret
 
 
-;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
+;                         int src_stride, unsigned char *upred,
+;                         unsigned char *vpred, int pred_stride)
+
 global sym(vp8_subtract_mbuv_mmx)
 sym(vp8_subtract_mbuv_mmx):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
+    SHADOW_ARGS_TO_STACK 7
     push rsi
     push rdi
     ; end prolog
 
-    ;short *udiff = diff + 256;
-    ;short *vdiff = diff + 320;
-    ;unsigned char *upred = pred + 256;
-    ;unsigned char *vpred = pred + 320;
+    mov         rdi,        arg(0)          ;diff
+    mov         rsi,        arg(1)          ;usrc
+    movsxd      rdx,        dword ptr arg(3);src_stride;
+    mov         rax,        arg(4)          ;upred
+    add         rdi,        256*2           ;diff = diff + 256 (shorts)
+    mov         rcx,        8
+    push        rbx
+    movsxd      rbx,        dword ptr arg(6);pred_stride
 
-        ;unsigned char  *z    = usrc;
-        ;unsigned short *diff = udiff;
-        ;unsigned char  *Predictor= upred;
+    pxor        mm7,        mm7
 
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
+.submbu_loop:
+    movq        mm0,        [rsi]
+    movq        mm1,        [rax]
+    movq        mm3,        mm0
+    movq        mm4,        mm1
+    punpcklbw   mm0,        mm7
+    punpcklbw   mm1,        mm7
+    punpckhbw   mm3,        mm7
+    punpckhbw   mm4,        mm7
+    psubw       mm0,        mm1
+    psubw       mm3,        mm4
+    movq        [rdi],      mm0
+    movq        [rdi+8],    mm3
+    add         rdi, 16
+    add         rsi, rdx
+    add         rax, rbx
 
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
+    dec         rcx
+    jnz         .submbu_loop
 
+    mov         rsi,        arg(2)          ;vsrc
+    mov         rax,        arg(5)          ;vpred
+    mov         rcx,        8
 
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
+.submbv_loop:
+    movq        mm0,        [rsi]
+    movq        mm1,        [rax]
+    movq        mm3,        mm0
+    movq        mm4,        mm1
+    punpcklbw   mm0,        mm7
+    punpcklbw   mm1,        mm7
+    punpckhbw   mm3,        mm7
+    punpckhbw   mm4,        mm7
+    psubw       mm0,        mm1
+    psubw       mm3,        mm4
+    movq        [rdi],      mm0
+    movq        [rdi+8],    mm3
+    add         rdi, 16
+    add         rsi, rdx
+    add         rax, rbx
 
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
+    dec         rcx
+    jnz         .submbv_loop
 
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-        ;unsigned char  *z    = vsrc;
-        ;unsigned short *diff = vdiff;
-        ;unsigned char  *Predictor= vpred;
-
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(2) ;z = usrc
-            add     rdi,        320*2  ;diff = diff + 320 (shorts)
-            add     rax,        320    ;Predictor = pred + 320
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            pxor    mm7,        mm7
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
-
-            add     rdi,        64
-            add     rax,        32
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi]
-            movq    mm1,        [rax]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi],      mm0
-            movq    [rdi+8],    mm3
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+8]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+16],   mm0
-            movq    [rdi+24],   mm3
-
-            movq    mm0,        [rsi+rdx*2]
-            movq    mm1,        [rax+16]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-            movq    [rdi+32],   mm0
-            movq    [rdi+40],   mm3
-            lea     rsi,        [rsi+rdx*2]
-
-
-            movq    mm0,        [rsi+rdx]
-            movq    mm1,        [rax+24]
-            movq    mm3,        mm0
-            movq    mm4,        mm1
-            punpcklbw   mm0,    mm7
-            punpcklbw   mm1,    mm7
-            punpckhbw   mm3,    mm7
-            punpckhbw   mm4,    mm7
-            psubw   mm0,        mm1
-            psubw   mm3,        mm4
-
-            movq    [rdi+48],   mm0
-            movq    [rdi+56],   mm3
-
+    pop         rbx
     ; begin epilog
     pop rdi
     pop rsi

diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 3bd1ff6..008e9c7 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm

@@ -71,277 +71,166 @@
     ret
 
 
-;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
+;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
+;unsigned char *pred, int pred_stride)
 global sym(vp8_subtract_mby_sse2)
 sym(vp8_subtract_mby_sse2):
     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-            mov         rsi,            arg(1) ;src
-            mov         rdi,            arg(0) ;diff
-
-            mov         rax,            arg(2) ;pred
-            movsxd      rdx,            dword ptr arg(3) ;stride
-
-            mov         rcx,            8      ; do two lines at one time
-
-.submby_loop:
-            movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
-            movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
-
-            movdqa      xmm2,           xmm0
-            psubb       xmm0,           xmm1
-
-            pxor        xmm1,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm2,           [GLOBAL(t80)]
-            pcmpgtb     xmm1,           xmm2            ; obtain sign information
-
-            movdqa      xmm2,    xmm0
-            movdqa      xmm3,    xmm1
-            punpcklbw   xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw   xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi],   xmm0
-            movdqa      XMMWORD PTR [rdi +16], xmm2
-
-            movdqa      xmm4,           XMMWORD PTR [rsi + rdx]
-            movdqa      xmm5,           XMMWORD PTR [rax + 16]
-
-            movdqa      xmm6,           xmm4
-            psubb       xmm4,           xmm5
-
-            pxor        xmm5,           [GLOBAL(t80)]   ;convert to signed values
-            pxor        xmm6,           [GLOBAL(t80)]
-            pcmpgtb     xmm5,           xmm6            ; obtain sign information
-
-            movdqa      xmm6,    xmm4
-            movdqa      xmm7,    xmm5
-            punpcklbw   xmm4,    xmm5            ; put sign back to subtraction
-            punpckhbw   xmm6,    xmm7            ; put sign back to subtraction
-
-            movdqa      XMMWORD PTR [rdi +32], xmm4
-            movdqa      XMMWORD PTR [rdi +48], xmm6
-
-            add         rdi,            64
-            add         rax,            32
-            lea         rsi,            [rsi+rdx*2]
-
-            sub         rcx,            1
-            jnz         .submby_loop
-
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp8_subtract_mbuv_sse2)
-sym(vp8_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 5
     GET_GOT     rbx
     push rsi
     push rdi
     ; end prolog
 
-            mov     rdi,        arg(0) ;diff
-            mov     rax,        arg(3) ;pred
-            mov     rsi,        arg(1) ;z = usrc
-            add     rdi,        256*2  ;diff = diff + 256 (shorts)
-            add     rax,        256    ;Predictor = pred + 256
-            movsxd  rdx,        dword ptr arg(4) ;stride;
-            lea     rcx,        [rdx + rdx*2]
+    mov         rdi,        arg(0)          ;diff
+    mov         rsi,        arg(1)          ;src
+    movsxd      rdx,        dword ptr arg(2);src_stride
+    mov         rax,        arg(3)          ;pred
+    movdqa      xmm4,       [GLOBAL(t80)]
+    push        rbx
+    mov         rcx,        8               ; do two lines at one time
+    movsxd      rbx,        dword ptr arg(4);pred_stride
 
-            ;u
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
+.submby_loop:
+    movdqa      xmm0,       [rsi]           ; src
+    movdqa      xmm1,       [rax]           ; pred
 
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
+    movdqa      xmm2,       xmm0
+    psubb       xmm0,       xmm1
 
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+    pxor        xmm1,       xmm4            ;convert to signed values
+    pxor        xmm2,       xmm4
+    pcmpgtb     xmm1,       xmm2            ; obtain sign information
 
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+    movdqa      xmm2,       xmm0
+    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
+    punpckhbw   xmm2,       xmm1            ; put sign back to subtraction
 
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
+    movdqa      xmm3,       [rsi + rdx]
+    movdqa      xmm5,       [rax + rbx]
 
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
+    lea         rsi,        [rsi+rdx*2]
+    lea         rax,        [rax+rbx*2]
 
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
+    movdqa      [rdi],      xmm0
+    movdqa      [rdi +16],  xmm2
 
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+    movdqa      xmm1,       xmm3
+    psubb       xmm3,       xmm5
 
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+    pxor        xmm5,       xmm4            ;convert to signed values
+    pxor        xmm1,       xmm4
+    pcmpgtb     xmm5,       xmm1            ; obtain sign information
 
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
+    movdqa      xmm1,       xmm3
+    punpcklbw   xmm3,       xmm5            ; put sign back to subtraction
+    punpckhbw   xmm1,       xmm5            ; put sign back to subtraction
 
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
+    movdqa      [rdi +32],  xmm3
+    movdqa      [rdi +48],  xmm1
 
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
+    add         rdi,        64
+    dec         rcx
+    jnz         .submby_loop
 
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
+    pop rbx
+    pop rdi
+    pop rsi
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
 
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
+;                         int src_stride, unsigned char *upred,
+;                         unsigned char *vpred, int pred_stride)
+global sym(vp8_subtract_mbuv_sse2)
+sym(vp8_subtract_mbuv_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
 
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+    movdqa      xmm4,       [GLOBAL(t80)]
+    mov         rdi,        arg(0)          ;diff
+    mov         rsi,        arg(1)          ;usrc
+    movsxd      rdx,        dword ptr arg(3);src_stride;
+    mov         rax,        arg(4)          ;upred
+    add         rdi,        256*2           ;diff = diff + 256 (shorts)
+    mov         rcx,        4
+    push        rbx
+    movsxd      rbx,        dword ptr arg(6);pred_stride
 
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
+    ;u
+.submbu_loop:
+    movq        xmm0,       [rsi]           ; src
+    movq        xmm2,       [rsi+rdx]       ; src -- next line
+    movq        xmm1,       [rax]           ; pred
+    movq        xmm3,       [rax+rbx]       ; pred -- next line
+    lea         rsi,        [rsi + rdx*2]
+    lea         rax,        [rax + rbx*2]
 
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
+    punpcklqdq  xmm0,       xmm2
+    punpcklqdq  xmm1,       xmm3
 
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
+    movdqa      xmm2,       xmm0
+    psubb       xmm0,       xmm1            ; subtraction with sign missed
 
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+    pxor        xmm1,       xmm4            ;convert to signed values
+    pxor        xmm2,       xmm4
+    pcmpgtb     xmm1,       xmm2            ; obtain sign information
 
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+    movdqa      xmm2,       xmm0
+    movdqa      xmm3,       xmm1
+    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
+    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
 
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
+    movdqa      [rdi],      xmm0            ; store difference
+    movdqa      [rdi +16],  xmm2            ; store difference
+    add         rdi,        32
+    sub         rcx, 1
+    jnz         .submbu_loop
 
-            ;v
-            mov     rsi,        arg(2) ;z = vsrc
-            add     rdi,        64*2  ;diff = diff + 320 (shorts)
-            add     rax,        64    ;Predictor = pred + 320
+    mov         rsi,        arg(2)          ;vsrc
+    mov         rax,        arg(5)          ;vpred
+    mov         rcx,        4
 
-            ;line 0 1
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax]  ; pred
-            punpcklqdq xmm0,    xmm2
+    ;v
+.submbv_loop:
+    movq        xmm0,       [rsi]           ; src
+    movq        xmm2,       [rsi+rdx]       ; src -- next line
+    movq        xmm1,       [rax]           ; pred
+    movq        xmm3,       [rax+rbx]       ; pred -- next line
+    lea         rsi,        [rsi + rdx*2]
+    lea         rax,        [rax + rbx*2]
 
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
+    punpcklqdq  xmm0,       xmm2
+    punpcklqdq  xmm1,       xmm3
 
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
+    movdqa      xmm2,       xmm0
+    psubb       xmm0,       xmm1            ; subtraction with sign missed
 
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
+    pxor        xmm1,       xmm4            ;convert to signed values
+    pxor        xmm2,       xmm4
+    pcmpgtb     xmm1,       xmm2            ; obtain sign information
 
-            movdqa     XMMWORD PTR [rdi],   xmm0
-            movdqa     XMMWORD PTR [rdi +16],   xmm2
+    movdqa      xmm2,       xmm0
+    movdqa      xmm3,       xmm1
+    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
+    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
 
-            ;line 2 3
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+16]  ; pred
-            punpcklqdq xmm0,    xmm2
+    movdqa      [rdi],      xmm0            ; store difference
+    movdqa      [rdi +16],  xmm2            ; store difference
+    add         rdi,        32
+    sub         rcx, 1
+    jnz         .submbv_loop
 
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 32],   xmm0
-            movdqa     XMMWORD PTR [rdi + 48],   xmm2
-
-            ;line 4 5
-            lea        rsi,     [rsi + rdx*4]
-
-            movq       xmm0,    MMWORD PTR [rsi]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rdx]
-            movdqa     xmm1,    XMMWORD PTR [rax + 32]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 64],   xmm0
-            movdqa     XMMWORD PTR [rdi + 80],   xmm2
-
-            ;line 6 7
-            movq       xmm0,    MMWORD PTR [rsi+rdx*2]  ; src
-            movq       xmm2,    MMWORD PTR [rsi+rcx]
-            movdqa     xmm1,    XMMWORD PTR [rax+ 48]  ; pred
-            punpcklqdq xmm0,    xmm2
-
-            movdqa     xmm2,    xmm0
-            psubb      xmm0,    xmm1            ; subtraction with sign missed
-
-            pxor       xmm1,    [GLOBAL(t80)]   ;convert to signed values
-            pxor       xmm2,    [GLOBAL(t80)]
-            pcmpgtb    xmm1,    xmm2            ; obtain sign information
-
-            movdqa     xmm2,    xmm0
-            movdqa     xmm3,    xmm1
-            punpcklbw  xmm0,    xmm1            ; put sign back to subtraction
-            punpckhbw  xmm2,    xmm3            ; put sign back to subtraction
-
-            movdqa     XMMWORD PTR [rdi + 96],   xmm0
-            movdqa     XMMWORD PTR [rdi + 112],  xmm2
-
+    pop         rbx
     ; begin epilog
     pop rdi
     pop rsi

diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 92b695f..e2524b4 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c

@@ -12,6 +12,7 @@
 #include "vp8/encoder/variance.h"
 #include "vp8/common/pragmas.h"
 #include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
 
 extern void filter_block1d_h6_mmx
 (
@@ -21,7 +22,7 @@
     unsigned int pixel_step,
     unsigned int output_height,
     unsigned int output_width,
-    short *vp7_filter
+    short *filter
 );
 extern void filter_block1d_v6_mmx
 (
@@ -31,7 +32,7 @@
     unsigned int pixel_step,
     unsigned int output_height,
     unsigned int output_width,
-    short *vp7_filter
+    short *filter
 );
 
 extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
@@ -198,24 +199,6 @@
 }
 
 
-
-
-///////////////////////////////////////////////////////////////////////////
-// the mmx function that does the bilinear filtering and var calculation //
-// int one pass                                                          //
-///////////////////////////////////////////////////////////////////////////
-DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
-{
-    { 128, 128, 128, 128,  0,  0,  0,  0 },
-    { 112, 112, 112, 112, 16, 16, 16, 16 },
-    {  96, 96, 96, 96, 32, 32, 32, 32 },
-    {  80, 80, 80, 80, 48, 48, 48, 48 },
-    {  64, 64, 64, 64, 64, 64, 64, 64 },
-    {  48, 48, 48, 48, 80, 80, 80, 80 },
-    {  32, 32, 32, 32, 96, 96, 96, 96 },
-    {  16, 16, 16, 16, 112, 112, 112, 112 }
-};
-
 unsigned int vp8_sub_pixel_variance4x4_mmx
 (
     const unsigned char  *src_ptr,
@@ -232,7 +215,7 @@
     vp8_filter_block2d_bil4x4_var_mmx(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum, &xxsum
     );
     *sse = xxsum;
@@ -257,7 +240,7 @@
     vp8_filter_block2d_bil_var_mmx(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum, &xxsum
     );
     *sse = xxsum;
@@ -283,7 +266,7 @@
     vp8_filter_block2d_bil_var_mmx(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum0, &xxsum0
     );
 
@@ -291,7 +274,7 @@
     vp8_filter_block2d_bil_var_mmx(
         src_ptr + 8, src_pixels_per_line,
         dst_ptr + 8, dst_pixels_per_line, 16,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum1, &xxsum1
     );
 
@@ -336,7 +319,7 @@
     vp8_filter_block2d_bil_var_mmx(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum0, &xxsum0
     );
 
@@ -344,7 +327,7 @@
     vp8_filter_block2d_bil_var_mmx(
         src_ptr + 8, src_pixels_per_line,
         dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum1, &xxsum1
     );
 
@@ -371,7 +354,7 @@
     vp8_filter_block2d_bil_var_mmx(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line, 16,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum, &xxsum
     );
     *sse = xxsum;

diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
index 24062eb..39213b0 100644
--- a/vp8/encoder/x86/variance_sse2.c
+++ b/vp8/encoder/x86/variance_sse2.c

@@ -12,11 +12,12 @@
 #include "vp8/encoder/variance.h"
 #include "vp8/common/pragmas.h"
 #include "vpx_ports/mem.h"
+#include "vp8/common/x86/filter_x86.h"
 
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
+extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
 
 extern void vp8_filter_block2d_bil4x4_var_mmx
 (
@@ -135,8 +136,6 @@
     unsigned int *sumsquared
 );
 
-DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
-
 unsigned int vp8_variance4x4_wmt(
     const unsigned char *src_ptr,
     int  source_stride,
@@ -262,7 +261,7 @@
     vp8_filter_block2d_bil4x4_var_mmx(
         src_ptr, src_pixels_per_line,
         dst_ptr, dst_pixels_per_line,
-        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
         &xsum, &xxsum
     );
     *sse = xxsum;

diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 5c15a3e..683af34 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk

@@ -72,6 +72,8 @@
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/textblit.c
 VP8_COMMON_SRCS-yes += common/treecoder.c
 
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
+VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/idct_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/subpixel_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/recon_x86.h

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 7260e94..4f21e14 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c

@@ -271,14 +271,7 @@
 
     oxcf->Width                 = cfg.g_w;
     oxcf->Height                = cfg.g_h;
-    /* guess a frame rate if out of whack, use 30 */
-    oxcf->frame_rate = (double)(cfg.g_timebase.den) /
-                       (double)(cfg.g_timebase.num);
-
-    if (oxcf->frame_rate > 180)
-    {
-        oxcf->frame_rate = 30;
-    }
+    oxcf->timebase              = cfg.g_timebase;
 
     oxcf->error_resilient_mode = cfg.g_error_resilient;