Initial WebM release
diff --git a/vp8/encoder/arm/armv6/walsh_v6.asm b/vp8/encoder/arm/armv6/walsh_v6.asm
new file mode 100644
index 0000000..608c9ae
--- /dev/null
+++ b/vp8/encoder/arm/armv6/walsh_v6.asm
@@ -0,0 +1,144 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+    EXPORT |vp8_short_walsh4x4_armv6|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;void vp8_short_walsh4x4_armv6(short *input, short *output, int pitch)
+|vp8_short_walsh4x4_armv6| PROC
+
+    stmdb       sp!, {r4 - r11, lr}
+
+    mov         r12, r2              ; stash pitch in r12 so r2-r9 are free for data
+    ldr         r2, [r0]             ; [1  |  0]
+    ldr         r3, [r0, #4]         ; [3  |  2]
+    ldr         r4, [r0, r12]!       ; [5  |  4]
+    ldr         r5, [r0, #4]         ; [7  |  6]
+    ldr         r6, [r0, r12]!       ; [9  |  8]
+    ldr         r7, [r0, #4]         ; [11 | 10]
+    ldr         r8, [r0, r12]!       ; [13 | 12]
+    ldr         r9, [r0, #4]         ; [15 | 14]
+
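+    ; Each register holds a pair of 16-bit samples, so the saturating
+    ; cross-halfword ops below perform two butterflies at once:
+    ;   qsubaddx d, a, b  ->  d = [a.hi - b.lo | a.lo + b.hi]
+    ;   qaddsubx d, a, b  ->  d = [a.hi + b.lo | a.lo - b.hi]
+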
+    qsubaddx    r10, r2, r3          ; [c1|a1] [1-2   |   0+3]
+    qaddsubx    r11, r2, r3          ; [b1|d1] [1+2   |   0-3]
+    qsubaddx    r12, r4, r5          ; [c1|a1] [5-6   |   4+7]
+    qaddsubx    lr, r4, r5           ; [b1|d1] [5+6   |   4-7]
+
+    qaddsubx    r2, r10, r11         ; [1 | 2] [c1+d1 | a1-b1]
+    qaddsubx    r3, r11, r10         ; [0 | 3] [b1+a1 | d1-c1]
+    qaddsubx    r4, r12, lr          ; [5 | 6] [c1+d1 | a1-b1]
+    qaddsubx    r5, lr, r12          ; [4 | 7] [b1+a1 | d1-c1]
+
+    qsubaddx    r10, r6, r7          ; [c1|a1] [9-10  |  8+11]
+    qaddsubx    r11, r6, r7          ; [b1|d1] [9+10  |  8-11]
+    qsubaddx    r12, r8, r9          ; [c1|a1] [13-14 | 12+15]
+    qaddsubx    lr, r8, r9           ; [b1|d1] [13+14 | 12-15]
+
+    qaddsubx    r6, r10, r11         ; [9 |10] [c1+d1 | a1-b1]
+    qaddsubx    r7, r11, r10         ; [8 |11] [b1+a1 | d1-c1]
+    qaddsubx    r8, r12, lr          ; [13|14] [c1+d1 | a1-b1]
+    qaddsubx    r9, lr, r12          ; [12|15] [b1+a1 | d1-c1]
+
+    ; first transform complete
+
+    qadd16      r10, r3, r9          ; a1 [0+12  |  3+15]
+    qadd16      r11, r5, r7          ; b1 [4+8   |  7+11]
+    qsub16      r12, r5, r7          ; c1 [4-8   |  7-11]
+    qsub16      lr, r3, r9           ; d1 [0-12  |  3-15]
+
+    qadd16      r3, r10, r11         ; a2 [a1+b1] [0 | 3]
+    qadd16      r5, r12, lr          ; b2 [c1+d1] [4 | 7]
+    qsub16      r7, r10, r11         ; c2 [a1-b1] [8 |11]
+    qsub16      r9, lr, r12          ; d2 [d1-c1] [12|15]
+
+    qadd16      r10, r2, r8          ; a1 [1+13  |  2+14]
+    qadd16      r11, r4, r6          ; b1 [5+9   |  6+10]
+    qsub16      r12, r4, r6          ; c1 [5-9   |  6-10]
+    qsub16      lr, r2, r8           ; d1 [1-13  |  2-14]
+
+    qadd16      r2, r10, r11         ; a2 [a1+b1] [1 | 2]
+    qadd16      r4, r12, lr          ; b2 [c1+d1] [5 | 6]
+    qsub16      r6, r10, r11         ; c2 [a1-b1] [9 |10]
+    qsub16      r8, lr, r12          ; d2 [d1-c1] [13|14]
+
+    ; [a-d]2 += ([a-d]2 > 0)
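+    ; The asrs/lsls below set the flags from each coefficient; addpl then adds
+    ; 1 only when the value is non-negative, so the final asr #1 divides by 2
+    ; rounding symmetrically about zero rather than flooring. (+1 on an exact
+    ; zero is harmless: (0 + 1) >> 1 == 0.)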
+
+    asrs        r10, r3, #16
+    addpl       r10, r10, #1         ; [~0]
+    asrs        r11, r2, #16
+    addpl       r11, r11, #1         ; [~1]
+    lsl         r11, r11, #15        ; [1  |  x]
+    pkhtb       r10, r11, r10, asr #1 ; [1  |  0]
+    str         r10, [r1], #4
+
+    lsls        r11, r2, #16
+    addpl       r11, r11, #0x10000   ; [~2]
+    lsls        r12, r3, #16
+    addpl       r12, r12, #0x10000   ; [~3]
+    asr         r12, r12, #1         ; [3  |  x]
+    pkhtb       r11, r12, r11, asr #17 ; [3  |  2]
+    str         r11, [r1], #4
+
+    asrs        r2, r5, #16
+    addpl       r2, r2, #1           ; [~4]
+    asrs        r3, r4, #16
+    addpl       r3, r3, #1           ; [~5]
+    lsl         r3, r3, #15          ; [5  |  x]
+    pkhtb       r2, r3, r2, asr #1   ; [5  |  4]
+    str         r2, [r1], #4
+
+    lsls        r2, r4, #16
+    addpl       r2, r2, #0x10000     ; [~6]
+    lsls        r3, r5, #16
+    addpl       r3, r3, #0x10000     ; [~7]
+    asr         r3, r3, #1           ; [7  |  x]
+    pkhtb       r2, r3, r2, asr #17  ; [7  |  6]
+    str         r2, [r1], #4
+
+    asrs        r2, r7, #16
+    addpl       r2, r2, #1           ; [~8]
+    asrs        r3, r6, #16
+    addpl       r3, r3, #1           ; [~9]
+    lsl         r3, r3, #15          ; [9  |  x]
+    pkhtb       r2, r3, r2, asr #1   ; [9  |  8]
+    str         r2, [r1], #4
+
+    lsls        r2, r6, #16
+    addpl       r2, r2, #0x10000     ; [~10]
+    lsls        r3, r7, #16
+    addpl       r3, r3, #0x10000     ; [~11]
+    asr         r3, r3, #1           ; [11 |  x]
+    pkhtb       r2, r3, r2, asr #17  ; [11 | 10]
+    str         r2, [r1], #4
+
+    asrs        r2, r9, #16
+    addpl       r2, r2, #1           ; [~12]
+    asrs        r3, r8, #16
+    addpl       r3, r3, #1           ; [~13]
+    lsl         r3, r3, #15          ; [13 |  x]
+    pkhtb       r2, r3, r2, asr #1   ; [13 | 12]
+    str         r2, [r1], #4
+
+    lsls        r2, r8, #16
+    addpl       r2, r2, #0x10000     ; [~14]
+    lsls        r3, r9, #16
+    addpl       r3, r3, #0x10000     ; [~15]
+    asr         r3, r3, #1           ; [15 |  x]
+    pkhtb       r2, r3, r2, asr #17  ; [15 | 14]
+    str         r2, [r1]
+
+    ldmia       sp!, {r4 - r11, pc}
+    ENDP        ; |vp8_short_walsh4x4_armv6|
+
+    END
diff --git a/vp8/encoder/arm/boolhuff_arm.c b/vp8/encoder/arm/boolhuff_arm.c
new file mode 100644
index 0000000..e70b3ad
--- /dev/null
+++ b/vp8/encoder/arm/boolhuff_arm.c
@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+#include "blockd.h"
+
+const unsigned int vp8_prob_cost[256] =
+{
+    2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+    1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+    767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+    617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+    511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+    428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+    361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+    304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+    255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+    211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+    172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+    137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+    105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+    75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+    48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+    22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,    1
+};
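+
+/* Note: these values closely track -256 * log2(p / 256), i.e. the cost in
+ * 1/256-bit units of coding a symbol whose probability is p/256 (p = 128
+ * gives ~256, p = 1 gives ~2048). Entry 0 duplicates entry 1 so a zero
+ * probability never indexes an undefined cost. */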
+
diff --git a/vp8/encoder/arm/csystemdependent.c b/vp8/encoder/arm/csystemdependent.c
new file mode 100644
index 0000000..0039796
--- /dev/null
+++ b/vp8/encoder/arm/csystemdependent.c
@@ -0,0 +1,159 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
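+/* Note: despite the CONFIG_RUNTIME_CPU_DETECT guard below, the choice among
+ * the NEON (HAVE_ARMV7), ARMv6 and plain C variants is made at compile time;
+ * the guard only controls whether the rtcd function-pointer table is
+ * populated at all. */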
+void vp8_cmachine_specific_config(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    cpi->rtcd.common                         = &cpi->common.rtcd;
+
+#if HAVE_ARMV7
+    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_neon;
+    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_neon;
+    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_neon;
+    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_neon;
+    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_neon;
+
+    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
+    cpi->rtcd.variance.var8x8                = vp8_variance8x8_neon;
+    cpi->rtcd.variance.var8x16               = vp8_variance8x16_neon;
+    cpi->rtcd.variance.var16x8               = vp8_variance16x8_neon;
+    cpi->rtcd.variance.var16x16              = vp8_variance16x16_neon;
+
+    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
+    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_neon;
+    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
+    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_neon;
+
+    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_neon;
+    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
+
+    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_neon;
+    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
+    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;
+    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;
+
+    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
+    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_neon;
+    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_neon;
+    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_neon;
+    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_neon;
+
+    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
+    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_neon;
+    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_neon;
+    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_neon;
+
+    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_neon;
+#elif HAVE_ARMV6
+    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
+    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
+    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
+    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
+    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
+
+    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
+    cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
+    cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
+    cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
+    cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
+
+    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
+    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
+    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
+    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
+
+    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
+    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
+
+    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
+    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
+    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;
+    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
+
+    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
+    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
+    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
+    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;
+    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_armv6;
+
+    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
+    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
+    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
+    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;
+
+    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
+#else
+    // pure C
+    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
+    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
+    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
+    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
+    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
+
+    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
+    cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
+    cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
+    cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
+    cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
+
+    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
+    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
+    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
+    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
+
+    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
+    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
+
+    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
+    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
+    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;
+    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
+
+    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
+    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
+    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
+    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;
+    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
+
+    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
+    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
+    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
+    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;
+
+    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
+#endif
+#endif
+
+#if HAVE_ARMV7
+    vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon;
+#else
+    vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
+#endif
+}
diff --git a/vp8/encoder/arm/dct_arm.h b/vp8/encoder/arm/dct_arm.h
new file mode 100644
index 0000000..a671862
--- /dev/null
+++ b/vp8/encoder/arm/dct_arm.h
@@ -0,0 +1,45 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DCT_ARM_H
+#define DCT_ARM_H
+
+#if HAVE_ARMV6
+extern prototype_fdct(vp8_short_walsh4x4_armv6);
+
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_armv6
+#endif
+
+#if HAVE_ARMV7
+extern prototype_fdct(vp8_short_fdct4x4_neon);
+extern prototype_fdct(vp8_short_fdct8x4_neon);
+extern prototype_fdct(vp8_fast_fdct4x4_neon);
+extern prototype_fdct(vp8_fast_fdct8x4_neon);
+extern prototype_fdct(vp8_short_walsh4x4_neon);
+
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_neon
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_neon
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_neon
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_neon
+
+#undef  vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4 vp8_short_walsh4x4_neon
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/encodemb_arm.c b/vp8/encoder/arm/encodemb_arm.c
new file mode 100644
index 0000000..3f1d053
--- /dev/null
+++ b/vp8/encoder/arm/encodemb_arm.c
@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "encodemb.h"
+#include "reconinter.h"
+#include "quantize.h"
+#include "invtrans.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "dct.h"
+#include "vpx_mem/vpx_mem.h"
+
+extern void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
+
+void vp8_subtract_b_neon(BLOCK *be, BLOCKD *bd, int pitch)
+{
+    unsigned char *src_ptr = (*(be->base_src) + be->src);
+    short *diff_ptr = be->src_diff;
+    unsigned char *pred_ptr = bd->predictor;
+    int src_stride = be->src_stride;
+
+    vp8_subtract_b_neon_func(diff_ptr, src_ptr, pred_ptr, src_stride, pitch);
+}
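+
+/* Note: this is the thin-wrapper pattern used throughout the ARM port: the C
+ * shim unpacks BLOCK/BLOCKD into raw pointers and strides, then hands off to
+ * the assembly routine so the asm needs no knowledge of struct layout. */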
diff --git a/vp8/encoder/arm/encodemb_arm.h b/vp8/encoder/arm/encodemb_arm.h
new file mode 100644
index 0000000..28f9e5c
--- /dev/null
+++ b/vp8/encoder/arm/encodemb_arm.h
@@ -0,0 +1,43 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_ARM_H
+#define ENCODEMB_ARM_H
+
+#if HAVE_ARMV7
+//extern prototype_berr(vp8_block_error_c);
+//extern prototype_mberr(vp8_mbblock_error_c);
+//extern prototype_mbuverr(vp8_mbuverror_c);
+
+extern prototype_subb(vp8_subtract_b_neon);
+extern prototype_submby(vp8_subtract_mby_neon);
+extern prototype_submbuv(vp8_subtract_mbuv_neon);
+
+//#undef  vp8_encodemb_berr
+//#define vp8_encodemb_berr vp8_block_error_c
+
+//#undef  vp8_encodemb_mberr
+//#define vp8_encodemb_mberr vp8_mbblock_error_c
+
+//#undef  vp8_encodemb_mbuverr
+//#define vp8_encodemb_mbuverr vp8_mbuverror_c
+
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_neon
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_neon
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_neon
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/mcomp_arm.c b/vp8/encoder/arm/mcomp_arm.c
new file mode 100644
index 0000000..07f2186
--- /dev/null
+++ b/vp8/encoder/arm/mcomp_arm.c
@@ -0,0 +1,1662 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+static int mv_bits_sadcost[256];
+
+extern unsigned int vp8_sub_pixel_variance16x16s_neon
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+);
+extern unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+);
+extern unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+);
+extern unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+);
+
+void vp8cx_init_mv_bits_sadcost()
+{
+    int i;
+
+    for (i = 0; i < 256; i++)
+    {
+        mv_bits_sadcost[i] = (int)sqrt(i * 16);
+    }
+}
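+
+// Note: sqrt(i * 16) is exactly 4 * sqrt(i), so this builds a cheap,
+// monotonically increasing curve once at startup, presumably for converting
+// an MV bit count into a SAD-scale penalty.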
+
+
+int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
+{
+    // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
+    // overstate the cost of vectors. In addition, coding a new vector can have a knock-on effect on the
+    // cost of subsequent vectors and on the quality of prediction from NEAR and NEAREST for subsequent blocks.
+    // The "Weight" parameter allows, to a limited extent, for these factors to be taken into account.
+    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
+}
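+
+// Since (x * 128) >> 7 == x, a Weight of 128 returns the summed table cost
+// unchanged; smaller weights discount it proportionally (e.g. 64 halves it).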
+
+int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
+{
+    //int i;
+    //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
+    //return ( (vp8_mv_bit_cost(mv,  ref, mvcost, 100) + 128) * error_per_bit) >> 8;
+
+    //i = (vp8_mv_bit_cost(mv,  ref, mvcost, 100) * error_per_bit + 128) >> 8;
+    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8;
+    //return (vp8_mv_bit_cost(mv,  ref, mvcost, 128) * error_per_bit + 128) >> 8;
+}
+
+
+static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
+{
+    // Get the estimated number of bits for a motion vector, to be used for costing in SAD-based
+    // motion estimation.
+    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) + 128) >> 8;
+}
+
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+    int Len;
+    int search_site_count = 0;
+
+
+    // Generate offsets for 4 search sites per step.
+    Len = MAX_FIRST_STEP;
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = 0;
+    search_site_count++;
+
+    while (Len > 0)
+    {
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = -Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = Len;
+        search_site_count++;
+
+        // Contract.
+        Len /= 2;
+    }
+
+    x->ss_count = search_site_count;
+    x->searches_per_step = 4;
+}
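+
+/* A sketch of the resulting pattern (assuming MAX_FIRST_STEP == 16 purely for
+ * illustration): after the null site, the loop emits the '+'-shaped sites
+ * (0,-16) (0,16) (-16,0) (16,0), then the same cross at radius 8, 4, 2 and 1,
+ * so each diamond step checks four points and halves its radius. */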
+
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+    int Len;
+    int search_site_count = 0;
+
+    // Generate offsets for 8 search sites per step.
+    Len = MAX_FIRST_STEP;
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = 0;
+    search_site_count++;
+
+    while (Len > 0)
+    {
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = -Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride - Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride + Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride - Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride + Len;
+        search_site_count++;
+
+
+        // Contract.
+        Len /= 2;
+    }
+
+    x->ss_count = search_site_count;
+    x->searches_per_step = 8;
+}
+
+
+#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to the predictor block for motion vector (r,c)
+#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
+#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns the sub-pixel variance for (r,c)
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;) // checks if (r,c) scores better than the previous best
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
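+// For example, CHECK_BETTER(left, tr, tc - 2) expands (via IFMVCV) to a bounds
+// test on (tr, tc - 2); if in range it evaluates
+//     left = MVC(tr, tc - 2) + DIST(tr, tc - 2)
+// and updates besterr/br/bc on improvement, otherwise it sets left = INT_MAX.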
+
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+    unsigned char *z = (*(b->base_src) + b->src);
+
+    int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1;
+    int br = bestmv->row << 2, bc = bestmv->col << 2;
+    int tr = br, tc = bc;
+    unsigned int besterr = INT_MAX;
+    unsigned int left, right, up, down, diag;
+    unsigned int sse;
+    unsigned int whichdir;
+    unsigned int halfiters = 4;
+    unsigned int quarteriters = 4;
+
+    int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
+    int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
+    int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1));
+    int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1));
+
+    // central mv
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+
+    // calculate central point error
+    besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // TODO: Each subsequent iteration checks at least one point in common with the last iteration; it could be two (if the diagonal was selected).
+    while (--halfiters)
+    {
+        // 1/2 pel
+        CHECK_BETTER(left, tr, tc - 2);
+        CHECK_BETTER(right, tr, tc + 2);
+        CHECK_BETTER(up, tr - 2, tc);
+        CHECK_BETTER(down, tr + 2, tc);
+
+        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+        switch (whichdir)
+        {
+        case 0:
+            CHECK_BETTER(diag, tr - 2, tc - 2);
+            break;
+        case 1:
+            CHECK_BETTER(diag, tr - 2, tc + 2);
+            break;
+        case 2:
+            CHECK_BETTER(diag, tr + 2, tc - 2);
+            break;
+        case 3:
+            CHECK_BETTER(diag, tr + 2, tc + 2);
+            break;
+        }
+
+        // no reason to check the same one again.
+        if (tr == br && tc == bc)
+            break;
+
+        tr = br;
+        tc = bc;
+    }
+
+    // TODO: Each subsequent iteration checks at least one point in common with the last iteration; it could be two (if the diagonal was selected).
+    // 1/4 pel
+    while (--quarteriters)
+    {
+        CHECK_BETTER(left, tr, tc - 1);
+        CHECK_BETTER(right, tr, tc + 1);
+        CHECK_BETTER(up, tr - 1, tc);
+        CHECK_BETTER(down, tr + 1, tc);
+
+        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+        switch (whichdir)
+        {
+        case 0:
+            CHECK_BETTER(diag, tr - 1, tc - 1);
+            break;
+        case 1:
+            CHECK_BETTER(diag, tr - 1, tc + 1);
+            break;
+        case 2:
+            CHECK_BETTER(diag, tr + 1, tc - 1);
+            break;
+        case 3:
+            CHECK_BETTER(diag, tr + 1, tc + 1);
+            break;
+        }
+
+        // no reason to check the same one again.
+        if (tr == br && tc == bc)
+            break;
+
+        tr = br;
+        tc = bc;
+    }
+
+    bestmv->row = br << 1;
+    bestmv->col = bc << 1;
+
+    if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL))
+        return INT_MAX;
+
+    return besterr;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    int bestmse = INT_MAX;
+    MV startmv;
+    MV this_mv;
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+    unsigned char *z = (*(b->base_src) + b->src);
+    int left, right, up, down, diag;
+    unsigned int sse;
+    int whichdir;
+
+
+    // Trap uncodable vectors
+    if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+    {
+        bestmv->row <<= 3;
+        bestmv->col <<= 3;
+        return INT_MAX;
+    }
+
+    // central mv
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+    startmv = *bestmv;
+
+    // calculate central point error
+    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // go left then right and check error
+    this_mv.row = startmv.row;
+    this_mv.col = ((startmv.col - 8) | 4);
+    left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+    }
+
+    this_mv.col += 8;
+    right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
+    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+    }
+
+    // go up then down and check error
+    this_mv.col = startmv.col;
+    this_mv.row = ((startmv.row - 8) | 4);
+    up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+    }
+
+    this_mv.row += 8;
+    down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+    }
+
+
+    // now check 1 more diagonal
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    //for(whichdir =0;whichdir<4;whichdir++)
+    //{
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+        break;
+    case 1:
+        this_mv.col += 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+        break;
+    case 2:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row += 4;
+        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+        break;
+    case 3:
+        this_mv.col += 4;
+        this_mv.row += 4;
+        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+//  }
+
+
+    // time to check quarter pels.
+    if (bestmv->row < startmv.row)
+        y -= d->pre_stride;
+
+    if (bestmv->col < startmv.col)
+        y--;
+
+    startmv = *bestmv;
+
+
+
+    // go left then right and check error
+    this_mv.row = startmv.row;
+
+    if (startmv.col & 7)
+    {
+        this_mv.col = startmv.col - 2;
+        left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    }
+    else
+    {
+        this_mv.col = (startmv.col - 8) | 6;
+        left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+    }
+
+    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+    }
+
+    this_mv.col += 4;
+    right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+    }
+
+    // go up then down and check error
+    this_mv.col = startmv.col;
+
+    if (startmv.row & 7)
+    {
+        this_mv.row = startmv.row - 2;
+        up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    }
+    else
+    {
+        this_mv.row = (startmv.row - 8) | 6;
+        up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+    }
+
+    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+    }
+
+    this_mv.row += 4;
+    down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+    }
+
+
+    // now check 1 more diagonal
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+//  for(whichdir=0;whichdir<4;whichdir++)
+//  {
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0:
+
+        if (startmv.row & 7)
+        {
+            this_mv.row -= 2;
+
+            if (startmv.col & 7)
+            {
+                this_mv.col -= 2;
+                diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            }
+            else
+            {
+                this_mv.col = (startmv.col - 8) | 6;
+                diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+            }
+        }
+        else
+        {
+            this_mv.row = (startmv.row - 8) | 6;
+
+            if (startmv.col & 7)
+            {
+                this_mv.col -= 2;
+                diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+            }
+            else
+            {
+                this_mv.col = (startmv.col - 8) | 6;
+                diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+            }
+        }
+
+        break;
+    case 1:
+        this_mv.col += 2;
+
+        if (startmv.row & 7)
+        {
+            this_mv.row -= 2;
+            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        }
+        else
+        {
+            this_mv.row = (startmv.row - 8) | 6;
+            diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+        }
+
+        break;
+    case 2:
+        this_mv.row += 2;
+
+        if (startmv.col & 7)
+        {
+            this_mv.col -= 2;
+            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        }
+        else
+        {
+            this_mv.col = (startmv.col - 8) | 6;
+            diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+        }
+
+        break;
+    case 3:
+        this_mv.col += 2;
+        this_mv.row += 2;
+        diag = svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+//  }
+
+    return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    int bestmse = INT_MAX;
+    MV startmv;
+    MV this_mv;
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+    unsigned char *z = (*(b->base_src) + b->src);
+    int left, right, up, down, diag;
+    unsigned int sse;
+
+    // Trap uncodable vectors
+    if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+    {
+        bestmv->row <<= 3;
+        bestmv->col <<= 3;
+        return INT_MAX;
+    }
+
+    // central mv
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+    startmv = *bestmv;
+
+    // calculate central point error
+    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // go left then right and check error
+    this_mv.row = startmv.row;
+    this_mv.col = ((startmv.col - 8) | 4);
+    left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+    }
+
+    this_mv.col += 8;
+    right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
+    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+    }
+
+    // go up then down and check error
+    this_mv.col = startmv.col;
+    this_mv.row = ((startmv.row - 8) | 4);
+    up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+    }
+
+    this_mv.row += 8;
+    down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+    }
+
+    // Somewhat strangely, checking only the one selected diagonal for half-pel is slower than checking all four.
+#if 0
+    // now check 1 more diagonal -
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 1:
+        this_mv.col += 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 2:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row += 4;
+        diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 3:
+        this_mv.col += 4;
+        this_mv.row += 4;
+        diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+#else
+    this_mv.col = (this_mv.col - 8) | 4;
+    this_mv.row = (this_mv.row - 8) | 4;
+    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+    this_mv.col += 8;
+    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+    this_mv.col = (this_mv.col - 8) | 4;
+    this_mv.row = startmv.row + 4;
+    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+    this_mv.col += 8;
+    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+#endif
+    return bestmse;
+}
+
+#if 1
+
+#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to the predictor block for motion vector (r,c)
+#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
+const MV next_chkpts[6][3] =
+{
+    {{ -2, 0}, { -1, -2}, {1, -2}},
+    {{ -1, -2}, {1, -2}, {2, 0}},
+    {{1, -2}, {2, 0}, {1, 2}},
+    {{2, 0}, {1, 2}, { -1, 2}},
+    {{1, 2}, { -1, 2}, { -2, 0}},
+    {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
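+/* next_chkpts exploits the hexagon geometry: when the best point moves to
+ * vertex k of the current hexagon, the new hexagon centred there shares three
+ * vertices with the old one, so only the three points listed for k need to be
+ * evaluated on the next iteration. */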
+int vp8_hex_search
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_t vf,
+    vp8_sad_fn_t      sf,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} };
+    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} };
+    int i, j;
+    unsigned char *src = (*(b->base_src) + b->src);
+    int src_stride = b->src_stride;
+    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+    unsigned int besterr, thiserr = 0x7fffffff;
+    int k = -1, tk;
+
+    if (bc < x->mv_col_min) bc = x->mv_col_min;
+
+    if (bc > x->mv_col_max) bc = x->mv_col_max;
+
+    if (br < x->mv_row_min) br = x->mv_row_min;
+
+    if (br > x->mv_row_max) br = x->mv_row_max;
+
+    rr >>= 1;
+    rc >>= 1;
+
+    besterr = ERR(br, bc, thiserr);
+
+    // hex search: the first pass (j == 0) checks all six points and seeds the
+    // direction index k.
+    tr = br;
+    tc = bc;
+
+    for (i = 0; i < 6; i++)
+    {
+        int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+        if (nc < x->mv_col_min) continue;
+
+        if (nc > x->mv_col_max) continue;
+
+        if (nr < x->mv_row_min) continue;
+
+        if (nr > x->mv_row_max) continue;
+
+        //CHECK_BETTER(thiserr,nr,nc);
+        if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+        {
+            besterr = thiserr;
+            br = nr;
+            bc = nc;
+            k = i;
+        }
+    }
+
+    if (tr == br && tc == bc)
+        goto cal_neighbors;
+
+    for (j = 1; j < 127; j++)
+    {
+        tr = br;
+        tc = bc;
+        tk = k;
+
+        for (i = 0; i < 3; i++)
+        {
+            int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col;
+
+            if (nc < x->mv_col_min) continue;
+
+            if (nc > x->mv_col_max) continue;
+
+            if (nr < x->mv_row_min) continue;
+
+            if (nr > x->mv_row_max) continue;
+
+            //CHECK_BETTER(thiserr,nr,nc);
+            if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+            {
+                besterr = thiserr;
+                br = nr;
+                bc = nc;
+                k = tk + 5 + i;      // k = (tk + 5 + i) % 6, without a divide
+
+                if (k >= 12) k -= 12;
+                else if (k >= 6) k -= 6;
+            }
+        }
+
+        if (tr == br && tc == bc)
+            break;
+    }
+
+    // check the 8 neighbors one step away
+cal_neighbors:
+    tr = br;
+    tc = bc;
+
+    for (i = 0; i < 8; i++)
+    {
+        int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+
+        if (nc < x->mv_col_min) continue;
+
+        if (nc > x->mv_col_max) continue;
+
+        if (nr < x->mv_row_min) continue;
+
+        if (nr > x->mv_row_max) continue;
+
+        CHECK_BETTER(thiserr, nr, nc);
+    }
+
+    best_mv->row = br;
+    best_mv->col = bc;
+
+    return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc);
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+
+#else
+
+#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to the predictor block for motion vector (r,c)
+#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
+
+int vp8_hex_search
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_t vf,
+    vp8_sad_fn_t      sf,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} };
+    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} };
+    int i, j;
+    unsigned char *src = (*(b->base_src) + b->src);
+    int src_stride = b->src_stride;
+    //int rr= ref_mv->row,rc= ref_mv->col,br=rr,bc=rc,tr,tc;
+    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
+    unsigned int besterr, thiserr = 0x7fffffff;
+
+    /*
+        if ( rc < x->mv_col_min) bc = x->mv_col_min;
+        if ( rc > x->mv_col_max) bc = x->mv_col_max;
+        if ( rr < x->mv_row_min) br = x->mv_row_min;
+        if ( rr > x->mv_row_max) br = x->mv_row_max;
+        rr>>=1;
+        rc>>=1;
+        br>>=3;
+        bc>>=3;
+    */
+    if (bc < x->mv_col_min) bc = x->mv_col_min;
+
+    if (bc > x->mv_col_max) bc = x->mv_col_max;
+
+    if (br < x->mv_row_min) br = x->mv_row_min;
+
+    if (br > x->mv_row_max) br = x->mv_row_max;
+
+    rr >>= 1;
+    rc >>= 1;
+
+    besterr = ERR(br, bc, thiserr);
+
+    // hex search; jbb changed the loop limit to 127 to avoid the max-256 problem when stepping by 2.
+    for (j = 0; j < 127; j++)
+    {
+        tr = br;
+        tc = bc;
+
+        for (i = 0; i < 6; i++)
+        {
+            int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+            if (nc < x->mv_col_min) continue;
+
+            if (nc > x->mv_col_max) continue;
+
+            if (nr < x->mv_row_min) continue;
+
+            if (nr > x->mv_row_max) continue;
+
+            CHECK_BETTER(thiserr, nr, nc);
+        }
+
+        if (tr == br && tc == bc)
+            break;
+    }
+
+    // check the 8 neighbors one step away
+    tr = br;
+    tc = bc;
+
+    for (i = 0; i < 8; i++)
+    {
+        int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+
+        if (nc < x->mv_col_min) continue;
+
+        if (nc > x->mv_col_max) continue;
+
+        if (nr < x->mv_row_min) continue;
+
+        if (nr > x->mv_row_max) continue;
+
+        CHECK_BETTER(thiserr, nr, nc);
+    }
+
+    best_mv->row = br;
+    best_mv->col = bc;
+
+    return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc);
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+
+#endif
+
+int vp8_diamond_search_sad
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_ptr_t *fn_ptr,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    int i, j, step;
+
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    unsigned char *best_address;
+
+    int tot_steps;
+    MV this_mv;
+
+    int bestsad = INT_MAX;
+    int best_site = 0;
+    int last_site = 0;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+    int this_row_offset;
+    int this_col_offset;
+    search_site *ss;
+
+    unsigned char *check_here;
+    int thissad;
+
+    // Work out the start point for the search
+    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+    best_address = in_what;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+        (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Check the starting position
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // search_param determines the length of the initial step and hence the number of iterations
+    // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
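+    // e.g. (assuming MAX_FIRST_STEP == 16 for illustration) search_param 2
+    // skips the 16- and 8-pel rings and starts the diamond at a 4-pel radius,
+    // trading search range for speed.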
+    ss = &x->ss[search_param * x->searches_per_step];
+    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+    i = 1;
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    *num00 = 0;
+
+    for (step = 0; step < tot_steps ; step++)
+    {
+        for (j = 0 ; j < x->searches_per_step ; j++)
+        {
+            // Trap illegal vectors
+            this_row_offset = best_mv->row + ss[i].mv.row;
+            this_col_offset = best_mv->col + ss[i].mv.col;
+
+            if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+                (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+            {
+                check_here = ss[i].offset + best_address;
+                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+                if (thissad < bestsad)
+                {
+                    this_mv.row = this_row_offset << 3;
+                    this_mv.col = this_col_offset << 3;
+                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_site = i;
+                    }
+                }
+            }
+
+            i++;
+        }
+
+        if (best_site != last_site)
+        {
+            best_mv->row += ss[best_site].mv.row;
+            best_mv->col += ss[best_site].mv.col;
+            best_address += ss[best_site].offset;
+            last_site = best_site;
+        }
+        else if (best_address == in_what)
+            (*num00)++;
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad == INT_MAX)
+        return INT_MAX;
+
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+           + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+int vp8_diamond_search_sadx4
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_ptr_t *fn_ptr,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    int i, j, step;
+
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    unsigned char *best_address;
+
+    int tot_steps;
+    MV this_mv;
+
+    int bestsad = INT_MAX;
+    int best_site = 0;
+    int last_site = 0;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+    int this_row_offset;
+    int this_col_offset;
+    search_site *ss;
+
+    unsigned char *check_here;
+    int thissad;
+
+    // Work out the start point for the search
+    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+    best_address = in_what;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+        (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Check the starting position
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // search_param determines the length of the initial step and hence the number of iterations:
+    // 0 = initial step (MAX_FIRST_STEP) pel; 1 = (MAX_FIRST_STEP/2) pel; 2 = (MAX_FIRST_STEP/4) pel... etc.
+    ss = &x->ss[search_param * x->searches_per_step];
+    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+    i = 1;
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    *num00 = 0;
+
+    for (step = 0; step < tot_steps ; step++)
+    {
+        int check_row_min, check_col_min, check_row_max, check_col_max;
+
+        check_row_min = x->mv_row_min - best_mv->row;
+        check_row_max = x->mv_row_max - best_mv->row;
+        check_col_min = x->mv_col_min - best_mv->col;
+        check_col_max = x->mv_col_max - best_mv->col;
+
+        for (j = 0 ; j < x->searches_per_step ; j += 4)
+        {
+            char *block_offset[4];
+            unsigned int valid_block[4];
+            int all_in = 1, t;
+
+            for (t = 0; t < 4; t++)
+            {
+                valid_block [t]  = (ss[t+i].mv.col > check_col_min);
+                valid_block [t] &= (ss[t+i].mv.col < check_col_max);
+                valid_block [t] &= (ss[t+i].mv.row > check_row_min);
+                valid_block [t] &= (ss[t+i].mv.row < check_row_max);
+
+                all_in &= valid_block[t];
+                block_offset[t] = ss[i+t].offset + best_address;
+            }
+
+            if (all_in)
+            {
+                int sad_array[4];
+
+                fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+                for (t = 0; t < 4; t++, i++)
+                {
+                    thissad = sad_array[t];
+
+                    if (thissad < bestsad)
+                    {
+                        this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
+                        this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
+                        thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                        if (thissad < bestsad)
+                        {
+                            bestsad = thissad;
+                            best_site = i;
+                        }
+                    }
+                }
+            }
+            else
+            {
+                for (t = 0; t < 4; i++, t++)
+                {
+                    // Trap illegal vectors
+                    if (valid_block[t])
+                    {
+                        check_here = block_offset[t];
+                        thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+                        if (thissad < bestsad)
+                        {
+                            this_row_offset = best_mv->row + ss[i].mv.row;
+                            this_col_offset = best_mv->col + ss[i].mv.col;
+
+                            this_mv.row = this_row_offset << 3;
+                            this_mv.col = this_col_offset << 3;
+                            thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                            if (thissad < bestsad)
+                            {
+                                bestsad = thissad;
+                                best_site = i;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        if (best_site != last_site)
+        {
+            best_mv->row += ss[best_site].mv.row;
+            best_mv->col += ss[best_site].mv.col;
+            best_address += ss[best_site].offset;
+            last_site = best_site;
+        }
+        else if (best_address == in_what)
+            (*num00)++;
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad == INT_MAX)
+        return INT_MAX;
+
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+
+#if !(CONFIG_REALTIME_ONLY)
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    int mv_stride = d->pre_stride;
+    unsigned char *bestaddress;
+    MV *best_mv = &d->bmi.mv.as_mv;
+    MV this_mv;
+    int bestsad = INT_MAX;
+    int r, c;
+
+    unsigned char *check_here;
+    int thissad;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+
+    // Work out the mid point for the search
+    in_what = *(d->base_pre) + d->pre;
+    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Baseline value at the centre
+
+        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.row = r << 3;
+        check_here = r * mv_stride + in_what + col_min;
+
+        for (c = col_min; c < col_max; c++)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+            this_mv.col = c << 3;
+            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
+            //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
+            thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+
+            if (thissad < bestsad)
+            {
+                bestsad = thissad;
+                best_mv->row = r;
+                best_mv->col = c;
+                bestaddress = check_here;
+            }
+
+            check_here++;
+        }
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad < INT_MAX)
+        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    else
+        return INT_MAX;
+}
+
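+// vp8_full_search_sadx3 is the batched variant of the exhaustive search
+// above: each row of the search area is scanned three candidates at a time
+// through sdx3f, and the remaining columns are finished with single sdf calls.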
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    int mv_stride = d->pre_stride;
+    unsigned char *bestaddress;
+    MV *best_mv = &d->bmi.mv.as_mv;
+    MV this_mv;
+    int bestsad = INT_MAX;
+    int r, c;
+
+    unsigned char *check_here;
+    int thissad;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+
+    int sad_array[3];
+
+    // Work out the mid point for the search
+    in_what = *(d->base_pre) + d->pre;
+    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Baseline value at the centre
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // Apply further limits to prevent us from using vectors that stretch beyond the UMV border
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.row = r << 3;
+        check_here = r * mv_stride + in_what + col_min;
+        c = col_min;
+
+        while ((c + 3) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
+
+            for (i = 0; i < 3; i++)
+            {
+                thissad = sad_array[i];
+
+                if (thissad < bestsad)
+                {
+                    this_mv.col = c << 3;
+                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->row = r;
+                        best_mv->col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+
+        while (c < col_max)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad);
+
+            if (thissad < bestsad)
+            {
+                this_mv.col = c << 3;
+                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                if (thissad < bestsad)
+                {
+                    bestsad = thissad;
+                    best_mv->row = r;
+                    best_mv->col = c;
+                    bestaddress = check_here;
+                }
+            }
+
+            check_here++;
+            c++;
+        }
+
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad < INT_MAX)
+        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    else
+        return INT_MAX;
+}
+#endif
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void)
+{
+    FILE *f = fopen("modecont.c", "w");
+    int i, j;
+
+    fprintf(f, "#include \"entropy.h\"\n");
+    fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+    fprintf(f, "{\n");
+
+    for (j = 0; j < 6; j++)
+    {
+        fprintf(f, "  { // %d \n", j);
+        fprintf(f, "    ");
+
+        for (i = 0; i < 4; i++)
+        {
+            int overall_prob;
+            int this_prob;
+            int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
+
+            // Overall probs
+            count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
+
+            if (count)
+                overall_prob = 256 * mv_mode_cts[i][0] / count;
+            else
+                overall_prob = 128;
+
+            if (overall_prob == 0)
+                overall_prob = 1;
+
+            // context probs
+            count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+            if (count)
+                this_prob = 256 * mv_ref_ct[j][i][0] / count;
+            else
+                this_prob = 128;
+
+            if (this_prob == 0)
+                this_prob = 1;
+
+            fprintf(f, "%5d, ", this_prob);
+            //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
+            //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
+        }
+
+        fprintf(f, "  },\n");
+    }
+
+    fprintf(f, "};\n");
+    fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+#ifdef ENTROPY_STATS
+void init_mv_ref_counts()
+{
+    vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+    vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+    if (m == ZEROMV)
+    {
+        ++mv_ref_ct [ct[0]] [0] [0];
+        ++mv_mode_cts[0][0];
+    }
+    else
+    {
+        ++mv_ref_ct [ct[0]] [0] [1];
+        ++mv_mode_cts[0][1];
+
+        if (m == NEARESTMV)
+        {
+            ++mv_ref_ct [ct[1]] [1] [0];
+            ++mv_mode_cts[1][0];
+        }
+        else
+        {
+            ++mv_ref_ct [ct[1]] [1] [1];
+            ++mv_mode_cts[1][1];
+
+            if (m == NEARMV)
+            {
+                ++mv_ref_ct [ct[2]] [2] [0];
+                ++mv_mode_cts[2][0];
+            }
+            else
+            {
+                ++mv_ref_ct [ct[2]] [2] [1];
+                ++mv_mode_cts[2][1];
+
+                if (m == NEWMV)
+                {
+                    ++mv_ref_ct [ct[3]] [3] [0];
+                    ++mv_mode_cts[3][0];
+                }
+                else
+                {
+                    ++mv_ref_ct [ct[3]] [3] [1];
+                    ++mv_mode_cts[3][1];
+                }
+            }
+        }
+    }
+}
+
+#endif /* END MV ref count ENTROPY_STATS stats code */
+
+#endif
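A detail worth noting throughout the search routines above is the two-stage
candidate test: the raw SAD is compared against the current best before the
motion-vector rate cost is added, so vp8_mv_err_cost is only paid for
candidates that can still win. A minimal sketch of the pattern (illustrative
only; it borrows the MV type and vp8_mv_err_cost from this file, and
check_candidate is a hypothetical helper, not part of the patch):

    /* Stage 1: cheap raw-SAD compare; stage 2: add the mv rate cost. */
    static int check_candidate(int sad, int best_so_far,
                               MV *cand, MV *ref,
                               int *mvsadcost[2], int error_per_bit)
    {
        if (sad < best_so_far)
        {
            sad += vp8_mv_err_cost(cand, ref, mvsadcost, error_per_bit);

            if (sad < best_so_far)
                best_so_far = sad;
        }

        return best_so_far;
    }

Since vp8_mv_err_cost can only increase a candidate's score, skipping it when
the raw SAD already loses never changes which site wins a step.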
diff --git a/vp8/encoder/arm/neon/boolhuff_armv7.asm b/vp8/encoder/arm/neon/boolhuff_armv7.asm
new file mode 100644
index 0000000..9a5f366
--- /dev/null
+++ b/vp8/encoder/arm/neon/boolhuff_armv7.asm
@@ -0,0 +1,292 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_start_encode|
+    EXPORT |vp8_encode_bool|
+    EXPORT |vp8_stop_encode|
+    EXPORT |vp8_encode_value|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 BOOL_CODER *br
+; r1 unsigned char *source
+
+|vp8_start_encode| PROC
+    mov     r12, #0
+    mov     r3,  #255
+    mvn     r2,  #23
+    str     r12, [r0, #vp8_writer_lowvalue]
+    str     r3,  [r0, #vp8_writer_range]
+    str     r12, [r0, #vp8_writer_value]
+    str     r2,  [r0, #vp8_writer_count]
+    str     r12, [r0, #vp8_writer_pos]
+    str     r1,  [r0, #vp8_writer_buffer]
+    bx      lr
+    ENDP
+
+; r0 BOOL_CODER *br
+; r1 int bit
+; r2 int probability
+|vp8_encode_bool| PROC
+    push    {r4-r9, lr}
+
+    mov     r4, r2
+
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+
+    sub     r7, r5, #1                  ; range-1
+
+    cmp     r1, #0
+    mul     r4, r4, r7                  ; ((range-1) * probability)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * probability) >> 8)
+
+    addne   r2, r2, r4                  ; if  (bit) lowvalue += split
+    subne   r4, r5, r4                  ; if  (bit) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; The sign flag is set by the addition to count.  This flag is used
+    ; later to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r1, [r7, r4]
+    cmpge   r1, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r1, r4, #1                  ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r1, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    pop     {r4-r9, pc}
+    ENDP
+
+; r0 BOOL_CODER *br
+|vp8_stop_encode| PROC
+    push    {r4-r10, lr}
+
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+
+    mov     r10, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; The sign flag is set by the addition to count.  This flag is used
+    ; later to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_se      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r1, [r7, r4]
+    cmpge   r1, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r1, r4, #1                  ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r1, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r10, r10, #1
+    bne     stop_encode_loop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    pop     {r4-r10, pc}
+
+    ENDP
+
+; r0 BOOL_CODER *br
+; r1 int data
+; r2 int bits
+|vp8_encode_value| PROC
+    push    {r4-r11, lr}
+
+    mov     r10, r2
+
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+
+    ; reverse the stream of bits to be packed.  Normally
+    ; the most significant bit is peeled off and compared
+    ; in the form of (v >> --n) & 1.  ARM architecture has
+    ; the ability to set a flag based on the value of the
+    ; bit shifted off the bottom of the register.  To make
+    ; that happen the bitstream is reversed.
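+    ; In C terms the loop below behaves like (illustrative model only):
+    ;   v = bit_reverse32(data) >> (32 - bits);
+    ;   while (bits--) { bit = v & 1; v >>= 1; /* encode bit */ }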
+    rbit    r11, r1
+    rsb     r4, r10, #32                 ; 32-n
+
+    ; v is kept in r1 during the token pack loop
+    lsr     r1, r11, r4                 ; v >>= 32 - n
+
+encode_value_loop
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsrs    r1, r1, #1                  ; bit = v >> n
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bit) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bit) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; The sign flag is set by the addition to count.  This flag is used
+    ; later to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero_ev      ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_ev
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_ev
+token_zero_while_loop_ev
+    mov     r9, #0
+    strb    r9, [r7, r4]                ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_ev
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_ev
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r9, [r7, r4]                ; w->buffer[x]
+    add     r9, r9, #1
+    strb    r9, [r7, r4]                ; w->buffer[x] + 1
+token_high_bit_not_set_ev
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r9, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r9, r4]                ; w->buffer[w->pos++]
+
+token_count_lt_zero_ev
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r10, r10, #1
+    bne     encode_value_loop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    pop     {r4-r11, pc}
+    ENDP
+
+    END
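The three routines above share one renormalisation path: compute the split,
update lowvalue/range, normalise range with a leading-zero count, and emit a
byte with carry propagation whenever eight fresh bits have accumulated. A C
model of a single step, reconstructed from the comments in the assembly (a
sketch only: the struct mirrors the vp8_writer_* offsets used above, and
GCC's __builtin_clz stands in for the clz instruction):

    typedef struct
    {
        unsigned int   lowvalue;
        unsigned int   range;
        int            count;
        unsigned int   pos;
        unsigned char *buffer;
    } writer_model;                     /* mirrors the vp8_writer_* fields */

    static void encode_bool_model(writer_model *w, int bit, int probability)
    {
        unsigned int lowvalue = w->lowvalue;
        unsigned int range    = w->range;
        int          count    = w->count;
        unsigned int split    = 1 + (((range - 1) * probability) >> 8);
        int          shift;

        if (bit)
        {
            lowvalue += split;          /* if (bit) lowvalue += split */
            range    -= split;          /* if (bit) range -= split    */
        }
        else
            range = split;

        shift  = __builtin_clz(range) - 24;     /* normalise range */
        range <<= shift;
        count += shift;

        if (count >= 0)                 /* eight or more bits are ready */
        {
            int offset = shift - count; /* >= 1 whenever this path is taken */

            if ((lowvalue << (offset - 1)) & 0x80000000)
            {
                int x = w->pos - 1;     /* propagate the carry backwards */

                while (x >= 0 && w->buffer[x] == 0xff)
                    w->buffer[x--] = 0;

                w->buffer[x] += 1;
            }

            w->buffer[w->pos++] = (unsigned char)(lowvalue >> (24 - offset));
            lowvalue <<= offset;
            lowvalue  &= 0xffffff;
            shift = count;
            count -= 8;
        }

        lowvalue <<= shift;

        w->lowvalue = lowvalue;
        w->range    = range;
        w->count    = count;
    }

vp8_stop_encode above is this step run 32 times encoding a zero bit at
probability 128, and vp8_encode_value runs it once per bit of the reversed
value, also at probability 128.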
diff --git a/vp8/encoder/arm/neon/fastfdct4x4_neon.asm b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
new file mode 100644
index 0000000..d5dec44
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastfdct4x4_neon.asm
@@ -0,0 +1,126 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_fdct4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+;NOTE:
+;The input is *src_diff, which is calculated as:
+;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
+;in which *src_ptr and *pred_ptr are both unsigned char.
+;Therefore, *src_diff should be in the range [-255, 255].
+;CAUTION:
+;The input values of the 25th block are set in the vp8_build_dcblock function and fall outside [-255, 255].
+;But the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th block, not vp8_fast_fdct4x4_c. That makes
+;it OK to assume *input is in [-255, 255] in vp8_fast_fdct4x4_c, but not in vp8_short_fdct4x4_c.
+
+|vp8_fast_fdct4x4_neon| PROC
+    vld1.16         {d2}, [r0], r2              ;load input
+    ldr             r12, _ffdct_coeff_
+    vld1.16         {d3}, [r0], r2
+    vld1.16         {d4}, [r0], r2
+    vld1.16         {d0}, [r12]
+    vld1.16         {d5}, [r0], r2
+
+    ;First for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;ip[0]+ip[3]
+    vadd.s16        d7, d3, d4              ;ip[1]+ip[2]
+    vsub.s16        d8, d3, d4              ;ip[1]-ip[2]
+    vsub.s16        d9, d2, d5              ;ip[0]-ip[3]
+    vshl.i16        q3, q3, #1              ; a1, b1
+    vshl.i16        q4, q4, #1              ; c1, d1
+
+    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
+    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
+
+    vqdmulh.s16     q6, q5, d0[1]
+    vqdmulh.s16     q8, q4, d0[0]
+    vqdmulh.s16     q7, q4, d0[2]
+
+    vshr.s16        q6, q6, #1
+    vshr.s16        q8, q8, #1
+    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d10, d12            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d4, d11, d13            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d3, d14, d17            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d5, d15, d16            ;op[3] = temp1 - temp2
+
+    ;Second for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
+    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
+    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
+    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
+
+    vadd.s16        d10, d6, d7             ;temp1 = a1 + b1
+    vsub.s16        d11, d6, d7             ;temp2 = a1 - b1
+
+
+    vqdmulh.s16     q6, q5, d0[1]
+    vqdmulh.s16     q8, q4, d0[0]
+    vqdmulh.s16     q7, q4, d0[2]
+
+    vshr.s16        q6, q6, #1
+    vshr.s16        q8, q8, #1
+    vshr.s16        q7, q7, #1              ;d14:temp1 = ( c1 * x_c3)>>16;  d15:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q8, q4, q8              ;d16:temp2 = ((c1 * x_c1)>>16) + c1;  d17:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d10, d12            ;a2 = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d4, d11, d13            ;c2 = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d3, d14, d17            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d5, d15, d16            ;d2 = temp1 - temp2
+
+    vclt.s16        q3, q1, #0
+    vclt.s16        q4, q2, #0
+
+    vsub.s16        q1, q1, q3
+    vsub.s16        q2, q2, q4
+
+    vshr.s16        q1, q1, #1
+    vshr.s16        q2, q2, #1
+
+    vst1.16         {q1, q2}, [r1]
+
+    bx              lr
+
+    ENDP
+
+;-----------------
+    AREA    fastfdct_dat, DATA, READONLY
+;Data section holding the fdct coefficients. DCD reserves one word in
+;memory per entry. The label ffdct_coeff is used to access the data.
+_ffdct_coeff_
+    DCD     ffdct_coeff
+ffdct_coeff
+; 60547 =  0xEC83
+; 46341 =  0xB505
+; 25080 =  0x61F8
+    DCD     0xB505EC83, 0x000061F8
+
+    END
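Two idioms in this kernel deserve a note. The vqdmulh.s16/vshr.s16 #1 pairs
implement the (x * c) >> 16 products quoted in the comments: vqdmulh yields
the doubled high half, (2*a*c) >> 16, so one further right shift recovers
(a*c) >> 16. The closing vclt/vsub/vshr #1 trio is a round-toward-zero
halving. Scalar models of both (illustrative sketches only):

    /* vqdmulh.s16 followed by vshr.s16 #1: the high 16 bits of a * c. */
    static short mulhi16(short a, short c)
    {
        return (short)(((2 * a * c) >> 16) >> 1);   /* == (a*c) >> 16 */
    }

    /* vclt + vsub + vshr #1: halve, rounding negative values toward zero. */
    static short halve_round_to_zero(short x)
    {
        return (short)((x + (x < 0)) >> 1);
    }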
diff --git a/vp8/encoder/arm/neon/fastfdct8x4_neon.asm b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
new file mode 100644
index 0000000..de1c254
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastfdct8x4_neon.asm
@@ -0,0 +1,179 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_fdct8x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+;NOTE:
+;The input is *src_diff, which is calculated as:
+;diff_ptr[c] = src_ptr[c] - pred_ptr[c]; (in the Subtract* functions)
+;in which *src_ptr and *pred_ptr are both unsigned char.
+;Therefore, *src_diff should be in the range [-255, 255].
+;CAUTION:
+;The input values of the 25th block are set in the vp8_build_dcblock function and fall outside [-255, 255].
+;But the VP8 encoder only uses vp8_short_fdct4x4_c for the 25th block, not vp8_fast_fdct4x4_c. That makes
+;it OK to assume *input is in [-255, 255] in vp8_fast_fdct4x4_c, but not in vp8_short_fdct4x4_c.
+
+|vp8_fast_fdct8x4_neon| PROC
+    vld1.16         {q1}, [r0], r2              ;load input
+    ldr             r12, _ffdct8_coeff_
+    vld1.16         {q2}, [r0], r2
+    vld1.16         {q3}, [r0], r2
+    vld1.16         {d0}, [r12]
+    vld1.16         {q4}, [r0], r2
+
+    ;First for-loop
+    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[1], d6=ip[2], d8=ip[3]
+    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[1], d7=ip[2], d9=ip[3]
+    vtrn.32         d2, d6
+    vtrn.32         d3, d7
+    vtrn.32         d4, d8
+    vtrn.32         d5, d9
+    vtrn.16         d2, d4
+    vtrn.16         d3, d5
+    vtrn.16         d6, d8
+    vtrn.16         d7, d9
+
+    vadd.s16        d10, d2, d8             ;ip[0]+ip[3]
+    vadd.s16        d11, d4, d6             ;ip[1]+ip[2]
+    vsub.s16        d12, d4, d6             ;ip[1]-ip[2]
+    vsub.s16        d13, d2, d8             ;ip[0]-ip[3]
+    vadd.s16        d22, d3, d9
+    vadd.s16        d23, d5, d7
+    vsub.s16        d24, d5, d7
+    vsub.s16        d25, d3, d9
+
+    vshl.i16        q5, q5, #1              ; a1, b1
+    vshl.i16        q6, q6, #1              ; c1, d1
+    vshl.i16        q1, q11, #1
+    vshl.i16        q2, q12, #1
+
+    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
+    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
+    vadd.s16        d24, d2, d3
+    vsub.s16        d25, d2, d3
+
+    vqdmulh.s16     q8, q7, d0[1]
+    vqdmulh.s16     q13, q12, d0[1]
+    vqdmulh.s16     q10, q6, d0[0]
+    vqdmulh.s16     q15, q2, d0[0]
+    vqdmulh.s16     q9, q6, d0[2]
+    vqdmulh.s16     q14, q2, d0[2]
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q13, q13, #1
+    vshr.s16        q10, q10, #1
+    vshr.s16        q15, q15, #1
+    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
+    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
+    vadd.s16        q15, q2, q15            ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d14, d16            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d3, d24, d26            ;op[0] = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d6, d15, d17            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d7, d25, d27            ;op[2] = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d4, d18, d21            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
+    vadd.s16        d5, d28, d31            ;op[1] = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d8, d19, d20            ;op[3] = temp1 - temp2
+    vsub.s16        d9, d29, d30            ;op[3] = temp1 - temp2
+
+    ;Second for-loop
+    ;transpose d2, d4, d6, d8. Then, d2=ip[0], d4=ip[4], d6=ip[8], d8=ip[12]
+    ;transpose d3, d5, d7, d9. Then, d3=ip[0], d5=ip[4], d7=ip[8], d9=ip[12]
+    vtrn.32         d2, d6
+    vtrn.32         d3, d7
+    vtrn.32         d4, d8
+    vtrn.32         d5, d9
+    vtrn.16         d2, d4
+    vtrn.16         d3, d5
+    vtrn.16         d6, d8
+    vtrn.16         d7, d9
+
+    vadd.s16        d10, d2, d8             ;a1 = ip[0]+ip[12]
+    vadd.s16        d11, d4, d6             ;b1 = ip[4]+ip[8]
+    vsub.s16        d12, d4, d6             ;c1 = ip[4]-ip[8]
+    vsub.s16        d13, d2, d8             ;d1 = ip[0]-ip[12]
+    vadd.s16        d2, d3, d9
+    vadd.s16        d4, d5, d7
+    vsub.s16        d24, d5, d7
+    vsub.s16        d25, d3, d9
+
+    vadd.s16        d14, d10, d11           ;temp1 = a1 + b1
+    vsub.s16        d15, d10, d11           ;temp2 = a1 - b1
+    vadd.s16        d22, d2, d4
+    vsub.s16        d23, d2, d4
+
+    vqdmulh.s16     q8, q7, d0[1]
+    vqdmulh.s16     q13, q11, d0[1]
+    vqdmulh.s16     q10, q6, d0[0]
+    vqdmulh.s16     q15, q12, d0[0]
+    vqdmulh.s16     q9, q6, d0[2]
+    vqdmulh.s16     q14, q12, d0[2]
+
+    vshr.s16        q8, q8, #1
+    vshr.s16        q13, q13, #1
+    vshr.s16        q10, q10, #1
+    vshr.s16        q15, q15, #1
+    vshr.s16        q9, q9, #1              ;d18:temp1 = ( c1 * x_c3)>>16;  d19:temp1 =  (d1 * x_c3)>>16
+    vshr.s16        q14, q14, #1            ;d28:temp1 = ( c1 * x_c3)>>16;  d29:temp1 =  (d1 * x_c3)>>16
+    vadd.s16        q10, q6, q10            ;d20:temp2 = ((c1 * x_c1)>>16) + c1;  d21:temp2 = ((d1 * x_c1)>>16) + d1
+    vadd.s16        q15, q12, q15           ;d30:temp2 = ((c1 * x_c1)>>16) + c1;  d31:temp2 = ((d1 * x_c1)>>16) + d1
+
+    vadd.s16        d2, d14, d16            ;a2 = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d6, d22, d26            ;a2 = ((temp1 * x_c2 )>>16) + temp1
+    vadd.s16        d4, d15, d17            ;c2 = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d8, d23, d27            ;c2 = ((temp2 * x_c2 )>>16) + temp2
+    vadd.s16        d3, d18, d21            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
+    vadd.s16        d7, d28, d31            ;b2 = temp1 + temp2  -- q is not necessary, just for protection
+    vsub.s16        d5, d19, d20            ;d2 = temp1 - temp2
+    vsub.s16        d9, d29, d30            ;d2 = temp1 - temp2
+
+    vclt.s16        q5, q1, #0
+    vclt.s16        q6, q2, #0
+    vclt.s16        q7, q3, #0
+    vclt.s16        q8, q4, #0
+
+    vsub.s16        q1, q1, q5
+    vsub.s16        q2, q2, q6
+    vsub.s16        q3, q3, q7
+    vsub.s16        q4, q4, q8
+
+    vshr.s16        q1, q1, #1
+    vshr.s16        q2, q2, #1
+    vshr.s16        q3, q3, #1
+    vshr.s16        q4, q4, #1
+
+    vst1.16         {q1, q2}, [r1]!
+    vst1.16         {q3, q4}, [r1]
+
+    bx              lr
+
+    ENDP
+
+;-----------------
+    AREA    fastfdct8x4_dat, DATA, READONLY
+;Data section holding the fdct coefficients. DCD reserves one word in
+;memory per entry. The label ffdct8_coeff is used to access the data.
+_ffdct8_coeff_
+    DCD     ffdct8_coeff
+ffdct8_coeff
+; 60547 =  0xEC83
+; 46341 =  0xB505
+; 25080 =  0x61F8
+    DCD     0xB505EC83, 0x000061F8
+
+    END
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.asm b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
new file mode 100644
index 0000000..1107037
--- /dev/null
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.asm
@@ -0,0 +1,117 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_fast_quantize_b_neon_func|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0        short *coeff_ptr
+; r1        short *zbin_ptr
+; r2        short *qcoeff_ptr
+; r3        short *dqcoeff_ptr
+; stack     short *dequant_ptr
+; stack     short *scan_mask
+; stack     short *round_ptr
+; stack     short *quant_ptr
+
+; return    int eob
+|vp8_fast_quantize_b_neon_func| PROC
+    vld1.16         {q0, q1}, [r0]              ;load z
+    vld1.16         {q10, q11}, [r1]            ;load zbin
+
+    vabs.s16        q4, q0                      ;calculate x = abs(z)
+    vabs.s16        q5, q1
+
+    vcge.s16        q10, q4, q10                ;x>=zbin
+    vcge.s16        q11, q5, q11
+
+    ;if x<zbin (q10 & q11 are all 0), go to zero_output
+    vorr.s16        q6, q10, q11
+    vorr.s16        d12, d12, d13
+    vmov            r0, r1, d12
+    orr             r0, r0, r1
+    cmp             r0, #0
+    beq             zero_output
+
+    ldr             r0, [sp, #8]                ;load round_ptr
+    ldr             r12, [sp, #12]              ;load quant_ptr
+
+    ;right shift 15 to get sign, all 0 if it is positive, all 1 if it is negative
+    vshr.s16        q2, q0, #15                 ; sz
+    vshr.s16        q3, q1, #15
+
+    vld1.s16        {q6, q7}, [r0]              ;load round_ptr [0-15]
+    vld1.s16        {q8, q9}, [r12]             ;load quant_ptr [0-15]
+
+    vadd.s16        q4, q6                      ;x + Round
+    vadd.s16        q5, q7
+
+    ldr             r0, [sp, #4]                ;load rvsplus1_scan_order ptr
+
+    vqdmulh.s16     q4, q8                      ;y = ((Round + abs(z)) * Quant) >> 16
+    vqdmulh.s16     q5, q9
+
+    vld1.16         {q0, q1}, [r0]              ;load rvsplus1_scan_order
+    vceq.s16        q8, q8                      ;set q8 to all 1
+
+    vshr.s16        q4, #1                      ;right shift 1 after vqdmulh
+    vshr.s16        q5, #1
+
+    ;modify data to have its original sign
+    veor.s16        q4, q2                      ; y^sz
+    veor.s16        q5, q3
+
+    ldr             r12, [sp]                   ;load dequant_ptr
+
+    vsub.s16        q4, q2                      ; x1 = (y^sz) - sz = (y^sz) - (-1) (two's complement)
+    vsub.s16        q5, q3
+
+    vand.s16        q4, q10                     ;mask off x1 elements
+    vand.s16        q5, q11
+
+    vld1.s16        {q6, q7}, [r12]             ;load dequant_ptr[i]
+
+    vtst.16         q14, q4, q8                 ;now find eob
+    vtst.16         q15, q5, q8                 ;non-zero elements of q4, q5 become all 1s in q14, q15
+
+    vst1.s16        {q4, q5}, [r2]              ;store: qcoeff = x1
+
+    vand            q0, q0, q14                 ;get all valid number from rvsplus1_scan_order array
+    vand            q1, q1, q15
+
+    vmax.u16        q0, q0, q1                  ;find maximum value in q0, q1
+    vmax.u16        d0, d0, d1
+    vmovl.u16       q0, d0
+
+    vmul.s16        q6, q4                      ;x * Dequant
+    vmul.s16        q7, q5
+
+    vmax.u32        d0, d0, d1
+    vpmax.u32       d0, d0, d0
+
+    vst1.s16        {q6, q7}, [r3]              ;store dqcoeff = x * Dequant
+
+    vmov.32         r0, d0[0]
+    bx              lr
+
+zero_output
+    vst1.s16        {q10, q11}, [r2]        ; qcoeff = 0
+    vst1.s16        {q10, q11}, [r3]        ; dqcoeff = 0
+    mov             r0, #0
+
+    bx              lr
+
+    ENDP
+
+    END
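A scalar model of the quantizer above, reconstructed from the comments (a
sketch only; `scan` plays the role of the rvsplus1_scan_order table, whose
entries are the scan position plus one so that a plain maximum yields the
end-of-block count):

    static int fast_quantize_model(const short *coeff, const short *zbin,
                                   short *qcoeff, short *dqcoeff,
                                   const short *dequant, const short *scan,
                                   const short *round, const short *quant)
    {
        int i, eob = 0;

        for (i = 0; i < 16; i++)
        {
            int z  = coeff[i];
            int sz = z >> 31;           /* 0 if positive, -1 if negative */
            int x  = (z ^ sz) - sz;     /* abs(z), as vabs computes it   */

            qcoeff[i] = dqcoeff[i] = 0;

            if (x >= zbin[i])           /* the vcge.s16 mask             */
            {
                int y  = ((x + round[i]) * quant[i]) >> 16;
                int x1 = (y ^ sz) - sz; /* restore the sign of z         */

                qcoeff[i]  = (short)x1;
                dqcoeff[i] = (short)(x1 * dequant[i]);

                if (x1 && scan[i] > eob)
                    eob = scan[i];
            }
        }

        return eob;
    }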
diff --git a/vp8/encoder/arm/neon/sad16_neon.asm b/vp8/encoder/arm/neon/sad16_neon.asm
new file mode 100644
index 0000000..6169f10
--- /dev/null
+++ b/vp8/encoder/arm/neon/sad16_neon.asm
@@ -0,0 +1,206 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad16x16_neon|
+    EXPORT  |vp8_sad16x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int  src_stride
+; r2    unsigned char *ref_ptr
+; r3    int  ref_stride
+|vp8_sad16x16_neon| PROC
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+;;
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0]
+    vld1.8          {q7}, [r2]
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;==============================
+;unsigned int vp8_sad16x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+|vp8_sad16x8_neon| PROC
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+    vabdl.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vld1.8          {q0}, [r0], r1
+    vld1.8          {q4}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+    vabal.u8        q13, d1, d9
+
+    vld1.8          {q2}, [r0], r1
+    vld1.8          {q6}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+    vabal.u8        q13, d3, d11
+
+    vld1.8          {q3}, [r0], r1
+    vld1.8          {q7}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q13, d5, d13
+
+    vabal.u8        q12, d6, d14
+    vabal.u8        q13, d7, d15
+
+    vadd.u16        q0, q12, q13
+
+    vpaddl.u16      q1, q0
+    vpaddl.u32      q0, q1
+
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
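Each kernel above is the vabdl/vabal accumulation for one block shape; in
scalar terms they all compute the same quantity (a reference sketch, with
the block size passed explicitly):

    /* Scalar reference for the SAD kernels: sum of absolute differences. */
    static unsigned int sad_wxh(const unsigned char *src_ptr, int src_stride,
                                const unsigned char *ref_ptr, int ref_stride,
                                int width, int height)
    {
        unsigned int sad = 0;
        int r, c;

        for (r = 0; r < height; r++)
        {
            for (c = 0; c < width; c++)
            {
                int d = src_ptr[c] - ref_ptr[c];
                sad += (d < 0) ? -d : d;
            }

            src_ptr += src_stride;
            ref_ptr += ref_stride;
        }

        return sad;
    }

vp8_sad16x16_neon corresponds to sad_wxh(src, stride, ref, stride, 16, 16),
and vp8_sad16x8_neon to the same call with height 8.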
diff --git a/vp8/encoder/arm/neon/sad8_neon.asm b/vp8/encoder/arm/neon/sad8_neon.asm
new file mode 100644
index 0000000..28604dd
--- /dev/null
+++ b/vp8/encoder/arm/neon/sad8_neon.asm
@@ -0,0 +1,208 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sad8x8_neon|
+    EXPORT  |vp8_sad8x16_neon|
+    EXPORT  |vp8_sad4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; unsigned int vp8_sad8x8_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x8_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;============================
+;unsigned int vp8_sad8x16_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad8x16_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vabal.u8        q12, d6, d14
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabal.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      q1, q12
+    vpaddl.u32      q0, q1
+    vadd.u32        d0, d0, d1
+
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;===========================
+;unsigned int vp8_sad4x4_c(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+
+|vp8_sad4x4_neon| PROC
+    vld1.8          {d0}, [r0], r1
+    vld1.8          {d8}, [r2], r3
+
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d10}, [r2], r3
+
+    vabdl.u8        q12, d0, d8
+
+    vld1.8          {d4}, [r0], r1
+    vld1.8          {d12}, [r2], r3
+
+    vabal.u8        q12, d2, d10
+
+    vld1.8          {d6}, [r0], r1
+    vld1.8          {d14}, [r2], r3
+
+    vabal.u8        q12, d4, d12
+    vabal.u8        q12, d6, d14
+
+    vpaddl.u16      d1, d24
+    vpaddl.u32      d0, d1
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/shortfdct_neon.asm b/vp8/encoder/arm/neon/shortfdct_neon.asm
new file mode 100644
index 0000000..26bc0d0
--- /dev/null
+++ b/vp8/encoder/arm/neon/shortfdct_neon.asm
@@ -0,0 +1,146 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_fdct4x4_neon|
+    EXPORT  |vp8_short_fdct8x4_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    short *input
+; r1    short *output
+; r2    int pitch
+; Input has a pitch, output is contiguous
+|vp8_short_fdct4x4_neon| PROC
+    ldr             r12, _dct_matrix_
+    vld1.16         d0, [r0], r2
+    vld1.16         d1, [r0], r2
+    vld1.16         d2, [r0], r2
+    vld1.16         d3, [r0]
+    vld1.16         {q2, q3}, [r12]
+
+;first stage
+    vmull.s16       q11, d4, d0[0]              ;i=0
+    vmull.s16       q12, d4, d1[0]              ;i=1
+    vmull.s16       q13, d4, d2[0]              ;i=2
+    vmull.s16       q14, d4, d3[0]              ;i=3
+
+    vmlal.s16       q11, d5, d0[1]
+    vmlal.s16       q12, d5, d1[1]
+    vmlal.s16       q13, d5, d2[1]
+    vmlal.s16       q14, d5, d3[1]
+
+    vmlal.s16       q11, d6, d0[2]
+    vmlal.s16       q12, d6, d1[2]
+    vmlal.s16       q13, d6, d2[2]
+    vmlal.s16       q14, d6, d3[2]
+
+    vmlal.s16       q11, d7, d0[3]              ;sumtemp for i=0
+    vmlal.s16       q12, d7, d1[3]              ;sumtemp for i=1
+    vmlal.s16       q13, d7, d2[3]              ;sumtemp for i=2
+    vmlal.s16       q14, d7, d3[3]              ;sumtemp for i=3
+
+    ; rounding
+    vrshrn.i32      d22, q11, #14
+    vrshrn.i32      d24, q12, #14
+    vrshrn.i32      d26, q13, #14
+    vrshrn.i32      d28, q14, #14
+
+;second stage
+    vmull.s16       q4, d22, d4[0]              ;i=0
+    vmull.s16       q5, d22, d4[1]              ;i=1
+    vmull.s16       q6, d22, d4[2]              ;i=2
+    vmull.s16       q7, d22, d4[3]              ;i=3
+
+    vmlal.s16       q4, d24, d5[0]
+    vmlal.s16       q5, d24, d5[1]
+    vmlal.s16       q6, d24, d5[2]
+    vmlal.s16       q7, d24, d5[3]
+
+    vmlal.s16       q4, d26, d6[0]
+    vmlal.s16       q5, d26, d6[1]
+    vmlal.s16       q6, d26, d6[2]
+    vmlal.s16       q7, d26, d6[3]
+
+    vmlal.s16       q4, d28, d7[0]              ;sumtemp for i=0
+    vmlal.s16       q5, d28, d7[1]              ;sumtemp for i=1
+    vmlal.s16       q6, d28, d7[2]              ;sumtemp for i=2
+    vmlal.s16       q7, d28, d7[3]              ;sumtemp for i=3
+
+    vrshr.s32       q0, q4, #16
+    vrshr.s32       q1, q5, #16
+    vrshr.s32       q2, q6, #16
+    vrshr.s32       q3, q7, #16
+
+    vmovn.i32       d0, q0
+    vmovn.i32       d1, q1
+    vmovn.i32       d2, q2
+    vmovn.i32       d3, q3
+
+    vst1.16         {q0, q1}, [r1]
+
+    bx              lr
+
+    ENDP
+
+; r0    short *input
+; r1    short *output
+; r2    int pitch
+|vp8_short_fdct8x4_neon| PROC
+    ; Store link register and input before calling
+    ;  first 4x4 fdct.  Do not need to worry about
+    ;  output or pitch because those pointers are not
+    ;  touched in the 4x4 fdct function
+    stmdb           sp!, {r0, lr}
+
+    bl              vp8_short_fdct4x4_neon
+
+    ldmia           sp!, {r0, lr}
+
+    ; Move to the next block of data.
+    add             r0, r0, #8
+    add             r1, r1, #32
+
+    ; Second time through do not store off the
+    ;  link register, just return from the 4x4 fdct
+    b               vp8_short_fdct4x4_neon
+
+    ; Should never get to this.
+    bx              lr
+
+    ENDP
+
+;-----------------
+    AREA    dct4x4_dat, DATA, READONLY
+;Data section holding the dct coefficient matrix. DCD reserves one word in
+;memory per entry. The label dct_matrix is used to access the data.
+_dct_matrix_
+    DCD     dct_matrix
+dct_matrix
+;   DCW     23170,  30274,  23170, 12540
+;   DCW     23170,  12540, -23170,-30274
+;   DCW     23170, -12540, -23170, 30274
+;   DCW     23170, -30274,  23170,-12540
+; 23170 =  0x5a82
+; -23170 =  0xa57e
+; 30274 =  0x7642
+; -30274 =  0x89be
+; 12540 =  0x30fc
+; -12540 = 0xcf04
+    DCD     0x76425a82, 0x30fc5a82
+    DCD     0x30fc5a82, 0x89bea57e
+    DCD     0xcf045a82, 0x7642a57e
+    DCD     0x89be5a82, 0xcf045a82
+
+    END
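The kernel evaluates the 4x4 forward DCT as two passes of 4-tap dot products
against dct_matrix, rounding the first pass to 14 bits (vrshrn #14) and the
second to 16 (vrshr #16). A scalar model under that reading (a sketch only;
pitch is in bytes, as in the loads above):

    static const short dct_matrix_model[4][4] =
    {
        { 23170,  30274,  23170,  12540 },
        { 23170,  12540, -23170, -30274 },
        { 23170, -12540, -23170,  30274 },
        { 23170, -30274,  23170, -12540 },
    };

    static void short_fdct4x4_model(const short *input, short *output,
                                    int pitch)
    {
        int stride = pitch / 2;        /* pitch is in bytes, input is short */
        short tmp[4][4];
        int i, j, k;

        /* first stage: rows of the input against the matrix, rounded >> 14 */
        for (i = 0; i < 4; i++)
            for (k = 0; k < 4; k++)
            {
                int sum = 0;

                for (j = 0; j < 4; j++)
                    sum += dct_matrix_model[j][k] * input[i * stride + j];

                tmp[i][k] = (short)((sum + (1 << 13)) >> 14);
            }

        /* second stage: the matrix against the intermediate, rounded >> 16 */
        for (i = 0; i < 4; i++)
            for (k = 0; k < 4; k++)
            {
                int sum = 0;

                for (j = 0; j < 4; j++)
                    sum += dct_matrix_model[j][i] * tmp[j][k];

                output[i * 4 + k] = (short)((sum + (1 << 15)) >> 16);
            }
    }

vp8_short_fdct8x4_neon above simply runs this transform twice, on the left
and right 4x4 halves of an 8-wide block, with the second block's 16 output
coefficients following the first.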
diff --git a/vp8/encoder/arm/neon/subtract_neon.asm b/vp8/encoder/arm/neon/subtract_neon.asm
new file mode 100644
index 0000000..8781ca0
--- /dev/null
+++ b/vp8/encoder/arm/neon/subtract_neon.asm
@@ -0,0 +1,171 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_subtract_b_neon_func|
+    EXPORT |vp8_subtract_mby_neon|
+    EXPORT |vp8_subtract_mbuv_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_subtract_b_neon_func(short *diff, unsigned char *src, unsigned char *pred, int stride, int pitch);
+|vp8_subtract_b_neon_func| PROC
+    ldr             r12, [sp]               ;load pitch
+
+    vld1.8          {d0}, [r1], r3          ;load src
+    vld1.8          {d1}, [r2], r12         ;load pred
+    vld1.8          {d2}, [r1], r3
+    vld1.8          {d3}, [r2], r12
+    vld1.8          {d4}, [r1], r3
+    vld1.8          {d5}, [r2], r12
+    vld1.8          {d6}, [r1], r3
+    vld1.8          {d7}, [r2], r12
+
+    vsubl.u8        q10, d0, d1
+    vsubl.u8        q11, d2, d3
+    vsubl.u8        q12, d4, d5
+    vsubl.u8        q13, d6, d7
+
+    mov             r12, r12, lsl #1
+
+    vst1.16         {d20}, [r0], r12        ;store diff
+    vst1.16         {d22}, [r0], r12
+    vst1.16         {d24}, [r0], r12
+    vst1.16         {d26}, [r0], r12
+
+    bx              lr
+    ENDP
+
+;==========================================
+;void vp8_subtract_mby_neon(short *diff, unsigned char *src, unsigned char *pred, int stride)
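+; A rough C sketch of the 16x16 luma subtract below (hedged; pred and diff
+; are contiguous 16-wide buffers, src walks the frame by stride):
+;
+;   int r, c;
+;   for (r = 0; r < 16; r++)
+;   {
+;       for (c = 0; c < 16; c++)
+;           diff[c] = src[c] - pred[c];
+;       diff += 16;
+;       pred += 16;
+;       src  += stride;
+;   }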
+|vp8_subtract_mby_neon| PROC
+    mov             r12, #4
+
+subtract_mby_loop
+    vld1.8          {q0}, [r1], r3          ;load src
+    vld1.8          {q1}, [r2]!             ;load pred
+    vld1.8          {q2}, [r1], r3
+    vld1.8          {q3}, [r2]!
+    vld1.8          {q4}, [r1], r3
+    vld1.8          {q5}, [r2]!
+    vld1.8          {q6}, [r1], r3
+    vld1.8          {q7}, [r2]!
+
+    vsubl.u8        q8, d0, d2
+    vsubl.u8        q9, d1, d3
+    vsubl.u8        q10, d4, d6
+    vsubl.u8        q11, d5, d7
+    vsubl.u8        q12, d8, d10
+    vsubl.u8        q13, d9, d11
+    vsubl.u8        q14, d12, d14
+    vsubl.u8        q15, d13, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    subs            r12, r12, #1
+    bne             subtract_mby_loop
+
+    bx              lr
+    ENDP
+
+;=================================
+;void vp8_subtract_mbuv_neon(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
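+; A rough C sketch (hedged): U and V are 8x8, and their diff/pred blocks sit
+; at fixed offsets after the 16x16 luma block (diff + 256 shorts, which is
+; 512 bytes, and pred + 256 bytes):
+;
+;   short *udiff = diff + 256, *vdiff = diff + 320;
+;   unsigned char *upred = pred + 256, *vpred = pred + 320;
+;   int r, c;
+;   for (r = 0; r < 8; r++)
+;   {
+;       for (c = 0; c < 8; c++)
+;       {
+;           udiff[c] = usrc[c] - upred[c];
+;           vdiff[c] = vsrc[c] - vpred[c];
+;       }
+;       udiff += 8; upred += 8; usrc += stride;
+;       vdiff += 8; vpred += 8; vsrc += stride;
+;   }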
+|vp8_subtract_mbuv_neon| PROC
+    ldr             r12, [sp]
+
+;u
+    add             r0, r0, #512        ;   short *udiff = diff + 256;
+    add             r3, r3, #256        ;   unsigned char *upred = pred + 256;
+
+    vld1.8          {d0}, [r1], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r1], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r1], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r1], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r1], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r1], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r1], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r1], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+;v
+    vld1.8          {d0}, [r2], r12         ;load src
+    vld1.8          {d1}, [r3]!             ;load pred
+    vld1.8          {d2}, [r2], r12
+    vld1.8          {d3}, [r3]!
+    vld1.8          {d4}, [r2], r12
+    vld1.8          {d5}, [r3]!
+    vld1.8          {d6}, [r2], r12
+    vld1.8          {d7}, [r3]!
+    vld1.8          {d8}, [r2], r12
+    vld1.8          {d9}, [r3]!
+    vld1.8          {d10}, [r2], r12
+    vld1.8          {d11}, [r3]!
+    vld1.8          {d12}, [r2], r12
+    vld1.8          {d13}, [r3]!
+    vld1.8          {d14}, [r2], r12
+    vld1.8          {d15}, [r3]!
+
+    vsubl.u8        q8, d0, d1
+    vsubl.u8        q9, d2, d3
+    vsubl.u8        q10, d4, d5
+    vsubl.u8        q11, d6, d7
+    vsubl.u8        q12, d8, d9
+    vsubl.u8        q13, d10, d11
+    vsubl.u8        q14, d12, d13
+    vsubl.u8        q15, d14, d15
+
+    vst1.16         {q8}, [r0]!             ;store diff
+    vst1.16         {q9}, [r0]!
+    vst1.16         {q10}, [r0]!
+    vst1.16         {q11}, [r0]!
+    vst1.16         {q12}, [r0]!
+    vst1.16         {q13}, [r0]!
+    vst1.16         {q14}, [r0]!
+    vst1.16         {q15}, [r0]!
+
+    bx              lr
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/variance_neon.asm b/vp8/encoder/arm/neon/variance_neon.asm
new file mode 100644
index 0000000..64b83ca
--- /dev/null
+++ b/vp8/encoder/arm/neon/variance_neon.asm
@@ -0,0 +1,275 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_neon|
+    EXPORT  |vp8_variance16x8_neon|
+    EXPORT  |vp8_variance8x16_neon|
+    EXPORT  |vp8_variance8x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
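+; A rough C sketch of the computation below (hedged); the NEON epilogue
+; evaluates the same sse - sum*sum/256 using 64-bit intermediates:
+;
+;   int r, c, sum = 0;
+;   unsigned int sse2 = 0;
+;   for (r = 0; r < 16; r++)
+;   {
+;       for (c = 0; c < 16; c++)
+;       {
+;           int d = src_ptr[c] - ref_ptr[c];
+;           sum  += d;
+;           sse2 += d * d;
+;       }
+;       src_ptr += source_stride;
+;       ref_ptr += recon_stride;
+;   }
+;   *sse = sse2;
+;   return sse2 - ((sum * sum) >> 8);   /* >> 8 == / 256 pixels; the 16x8,
+;                                          8x16 and 8x8 variants below use
+;                                          >> 7 and >> 6 instead */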
+|vp8_variance16x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
+    ;the results into the elements of the destination vector. The explanation
+    ;in the ARM guide is wrong.
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
+    ;vmov.32        r1, d1[0]
+    ;mul            r0, r0, r0
+    ;str            r1, [r12]
+    ;sub            r0, r1, r0, asr #8
+
+    ;sum is in [-255*256, 255*256], so sum*sum fits in 32 bits. The right
+    ;shift must have sign-bit extension, which is vshr.s. Have to use s32 to
+    ;make it right.
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #4
+
+variance16x8_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance8x16_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d2, d6
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+
+    bne             variance8x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #7
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;==================================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+variance8x8_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
new file mode 100644
index 0000000..f26b4d7
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm
@@ -0,0 +1,67 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8_memcpy_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;=========================================
+;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
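+; A rough C sketch of the copy strategy below (hedged): bulk 256-byte chunks,
+; then a 16-byte tail loop, so sz is assumed to be a multiple of 16:
+;
+;   int n;
+;   for (n = sz >> 8; n > 0; n--)           /* 256 bytes per iteration */
+;   {
+;       memcpy(dst_ptr, src_ptr, 256);
+;       dst_ptr += 256; src_ptr += 256;
+;   }
+;   for (n = sz & 0xff; n > 0; n -= 16)     /* 16-byte remainder chunks */
+;   {
+;       memcpy(dst_ptr, src_ptr, 16);
+;       dst_ptr += 16; src_ptr += 16;
+;   }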
+|vp8_memcpy_neon| PROC
+    ;pld                [r1]                        ;preload pred data
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    mov             r12, r2, lsr #8                 ;copy 256 bytes of data at a time
+
+memcpy_neon_loop
+    vld1.8          {q0, q1}, [r1]!                 ;load src data
+    subs            r12, r12, #1
+    vld1.8          {q2, q3}, [r1]!
+    vst1.8          {q0, q1}, [r0]!                 ;copy to dst_ptr
+    vld1.8          {q4, q5}, [r1]!
+    vst1.8          {q2, q3}, [r0]!
+    vld1.8          {q6, q7}, [r1]!
+    vst1.8          {q4, q5}, [r0]!
+    vld1.8          {q8, q9}, [r1]!
+    vst1.8          {q6, q7}, [r0]!
+    vld1.8          {q10, q11}, [r1]!
+    vst1.8          {q8, q9}, [r0]!
+    vld1.8          {q12, q13}, [r1]!
+    vst1.8          {q10, q11}, [r0]!
+    vld1.8          {q14, q15}, [r1]!
+    vst1.8          {q12, q13}, [r0]!
+    vst1.8          {q14, q15}, [r0]!
+
+    ;pld                [r1]                        ;preload pred data -- need to adjust for real device
+    ;pld                [r1, #128]
+    ;pld                [r1, #256]
+    ;pld                [r1, #384]
+
+    bne             memcpy_neon_loop
+
+    ands            r3, r2, #0xff                   ;extra copy
+    beq             done_copy_neon_loop
+
+extra_copy_neon_loop
+    vld1.8          {q0}, [r1]!                 ;load src data
+    subs            r3, r3, #16
+    vst1.8          {q0}, [r0]!
+    bne             extra_copy_neon_loop
+
+done_copy_neon_loop
+    bx              lr
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
new file mode 100644
index 0000000..f535967
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
@@ -0,0 +1,172 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_mse16x16_neon|
+    EXPORT  |vp8_get16x16pred_error_neon|
+    EXPORT  |vp8_get4x4sse_cs_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;============================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;note: in this function, sum is never used, so the sum calculation carried
+;over from vp8_variance() can be removed.
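+;A rough C sketch of what remains (hedged):
+;
+;   int r, c;
+;   unsigned int sse2 = 0;
+;   for (r = 0; r < 16; r++, src_ptr += source_stride, ref_ptr += recon_stride)
+;       for (c = 0; c < 16; c++)
+;       {
+;           int d = src_ptr[c] - ref_ptr[c];
+;           sse2 += d * d;
+;       }
+;   *sse = sse2;
+;   return sse2;                    /* MSE returns the sse value directly */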
+
+|vp8_mse16x16_neon| PROC
+    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
+    vmov.i8         q8, #0
+    vmov.i8         q9, #0
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+mse16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmlal.s16       q7, d22, d22
+    vmlal.s16       q8, d23, d23
+
+    subs            r12, r12, #1
+
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vmlal.s16       q7, d26, d26
+    vmlal.s16       q8, d27, d27
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             mse16x16_neon_loop
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+
+    ldr             r12, [sp]               ;load *sse from stack
+
+    vadd.u32        q10, q7, q9
+    vpaddl.u32      q1, q10
+    vadd.u64        d0, d2, d3
+
+    vst1.32         {d0[0]}, [r12]
+    vmov.32         r0, d0[0]
+
+    bx              lr
+
+    ENDP
+
+;============================
+; r0    unsigned char *src_ptr
+; r1    int src_stride
+; r2    unsigned char *ref_ptr
+; r3    int ref_stride
+|vp8_get16x16pred_error_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - pred_error
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+get16x16pred_error_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11
+    vmlal.s16       q9, d22, d22
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             get16x16pred_error_neon_loop
+
+    vadd.u32        q10, q9, q10
+    vpaddl.s32      q0, q8
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]
+    bx              lr
+
+    ENDP
+
+;=============================
+; r0    unsigned char *src_ptr,
+; r1    int  source_stride,
+; r2    unsigned char *ref_ptr,
+; r3    int  recon_stride
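+; A rough C sketch (hedged): a 4x4 sum of squared differences, returned
+; directly; note there is no *sse out-parameter here:
+;
+;   int r, c;
+;   unsigned int sse2 = 0;
+;   for (r = 0; r < 4; r++, src_ptr += source_stride, ref_ptr += recon_stride)
+;       for (c = 0; c < 4; c++)
+;       {
+;           int d = src_ptr[c] - ref_ptr[c];
+;           sse2 += d * d;
+;       }
+;   return sse2;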
+|vp8_get4x4sse_cs_neon| PROC
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vmull.s16       q7, d22, d22
+    vmull.s16       q8, d24, d24
+    vmull.s16       q9, d26, d26
+    vmull.s16       q10, d28, d28
+
+    vadd.u32        q7, q7, q8
+    vadd.u32        q9, q9, q10
+    vadd.u32        q9, q7, q9
+
+    vpaddl.u32      q1, q9
+    vadd.u64        d0, d2, d3
+
+    vmov.32         r0, d0[0]
+    bx              lr
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm
new file mode 100644
index 0000000..9c52c52
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_packtokens_armv7.asm
@@ -0,0 +1,300 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_armv7|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 vp8_writer *w
+; r1 const TOKENEXTRA *p
+; r2 int xcount
+; r3 vp8_coef_encodings
+; s0 vp8_extra_bits
+; s1 vp8_coef_tree
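+; A rough C sketch of the packing loop below (hedged: the writer state and
+; the byte-emission/carry details live in the asm; field names follow the
+; vp8_writer/TOKENEXTRA offsets included above):
+;
+;   while (p < stop)
+;   {
+;       const vp8_token *a = vp8_coef_encodings + p->Token;
+;       const vp8_prob *pp = p->context_tree;
+;       int v = a->value, n = a->Len, i = 0;
+;       if (p->skip_eob_node) { i = 2; --n; }
+;       do
+;       {
+;           int bb = (v >> --n) & 1;
+;           int split = 1 + (((range - 1) * pp[i >> 1]) >> 8);
+;           if (bb) { lowvalue += split; range -= split; }
+;           else    range = split;
+;           i = vp8_coef_tree[i + bb];
+;           /* renormalize: range <<= shift, spill top bytes of lowvalue,
+;              propagating carry back through any 0xff bytes written */
+;       } while (n);
+;       /* extra bits (vp8_extra_bits[p->Token]) and the sign bit of
+;          p->Extra are coded with the same bool-encoder steps */
+;       ++p;
+;   }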
+|vp8cx_pack_tokens_armv7| PROC
+    push    {r4-r11, lr}
+
+    ; Add xcount * sizeof (TOKENEXTRA) to p to get stop.
+    ;  sizeof (TOKENEXTRA) is 20
+    add     r2, r2, r2, lsl #2          ; xcount
+    sub     sp, sp, #12
+    add     r2, r1, r2, lsl #2          ; stop = p + xcount
+    str     r2, [sp, #0]
+    str     r3, [sp, #8]                ; save vp8_coef_encodings
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+    b       check_p_lt_stop
+
+while_p_lt_stop
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #8]                ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldr     r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    ; reverse the stream of bits to be packed.  Normally
+    ; the most significant bit is peeled off and compared
+    ; in the form of (v >> --n) & 1.  ARM architecture has
+    ; the ability to set a flag based on the value of the
+    ; bit shifted off the bottom of the register.  To make
+    ; that happen the bitstream is reversed.
+    rbit    r12, r6
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsr     r12, r12, r4                ; v >>= 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsrs    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #52]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #48]               ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp8_extra_bit_struct) is 20
+    add     r6, r6, r6, lsl #2          ; t * 5
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rbit    r12, r7                     ; reverse v
+    rsb     r4, r8, #32
+    lsr     r12, r12, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsrs    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    add     sp, sp, #12
+    pop     {r4-r11, pc}
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm
new file mode 100644
index 0000000..92b0989
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_packtokens_mbrow_armv7.asm
@@ -0,0 +1,335 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_mb_row_tokens_armv7|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 vp8_writer *w
+; r2 vp8_coef_encodings
+; r3 vp8_extra_bits
+; s0 vp8_coef_tree
+
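+; A rough C sketch of the wrapper below (hedged; pack_tokens stands in for
+; the token loop inlined underneath, and the list fields follow the
+; tokenlist_start/tokenlist_stop offsets included above):
+;
+;   int mb_row;
+;   for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+;       pack_tokens(w, cpi->tp_list[mb_row].start,
+;                      cpi->tp_list[mb_row].stop);
+;
+; The writer state (lowvalue, range, count) is carried across rows, so the
+; rows concatenate into a single bitstream partition.
+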
+|vp8cx_pack_mb_row_tokens_armv7| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #24
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r2, [sp, #20]               ; save vp8_coef_encodings
+    str     r5, [sp, #12]               ; save mb_rows
+    str     r3, [sp, #8]                ; save vp8_extra_bits
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+
+    mov     r0, r1                      ; keep same as other loops
+
+    ldr     r2, [r0, #vp8_writer_lowvalue]
+    ldr     r5, [r0, #vp8_writer_range]
+    ldr     r3, [r0, #vp8_writer_count]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #20]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldr     r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    ; reverse the stream of bits to be packed.  Normally
+    ; the most significant bit is peeled off and compared
+    ; in the form of (v >> --n) & 1.  ARM architecture has
+    ; the ability to set a flag based on the value of the
+    ; bit shifted off the bottom of the register.  To make
+    ; that happen the bitstream is reversed.
+    rbit    r12, r6
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsr     r12, r12, r4                ; v >>= 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsrs    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #60]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #8]                ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp8_extra_bit_struct) is 20
+    add     r6, r6, r6, lsl #2          ; t * 5
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rbit    r12, r7                     ; reverse v
+    rsb     r4, r8, #32
+    lsr     r12, r12, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]            ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsrs    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, #1
+    add     r7, r7, #TOKENLIST_SZ       ; next element in the array
+    str     r6, [sp, #12]
+    bne     mb_row_loop
+
+    str     r2, [r0, #vp8_writer_lowvalue]
+    str     r5, [r0, #vp8_writer_range]
+    str     r3, [r0, #vp8_writer_count]
+    add     sp, sp, #24
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm
new file mode 100644
index 0000000..6d5f882
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_packtokens_partitions_armv7.asm
@@ -0,0 +1,471 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT |vp8cx_pack_tokens_into_partitions_armv7|
+
+    INCLUDE vpx_vp8_enc_asm_offsets.asm
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA    |.text|, CODE, READONLY
+
+; r0 VP8_COMP *cpi
+; r1 unsigned char *cx_data
+; r2 int num_part
+; r3 *size
+; s0 vp8_coef_encodings
+; s1 vp8_extra_bits,
+; s2 const vp8_tree_index *,
+
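+; A rough C sketch of the partitioning handled below (hedged; start_encode,
+; stop_encode and pack_tokens are illustrative helper names). Macroblock rows
+; are dealt round-robin to num_part partitions, each partition is flushed
+; with 32 one-half-probability bits, and all but the last partition get a
+; 3-byte little-endian size prefix at the front of cx_data:
+;
+;   *size = 3 * (num_part - 1);               /* room for size prefixes */
+;   ptr = cx_data + *size;
+;   for (i = 0; i < num_part; i++)
+;   {
+;       start_encode(w, ptr);                 /* lowvalue=0, range=255,
+;                                                count=-24, pos=0 */
+;       for (mb_row = i; mb_row < mb_rows; mb_row += num_part)
+;           pack_tokens(w, cpi->tp_list[mb_row].start,
+;                          cpi->tp_list[mb_row].stop);
+;       stop_encode(w);                       /* flush 32 bits at prob 128 */
+;       *size += w->pos;
+;       if (i < num_part - 1)                 /* 3-byte little-endian size */
+;       {
+;           cx_data[0] =  w->pos        & 0xff;
+;           cx_data[1] = (w->pos >> 8)  & 0xff;
+;           cx_data[2] = (w->pos >> 16) & 0xff;
+;           cx_data += 3;
+;       }
+;       ptr += w->pos;
+;   }
+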
+|vp8cx_pack_tokens_into_partitions_armv7| PROC
+    push    {r4-r11, lr}
+    sub     sp, sp, #44
+
+    ; Compute address of cpi->common.mb_rows
+    ldr     r4, _VP8_COMP_common_
+    ldr     r6, _VP8_COMMON_MBrows_
+    add     r4, r0, r4
+
+    ldr     r5, [r4, r6]                ; load up mb_rows
+
+    str     r5, [sp, #36]               ; save mb_rows
+    str     r1, [sp, #24]               ; save cx_data
+    str     r2, [sp, #20]               ; save num_part
+    str     r3, [sp, #8]                ; save *size
+
+    ; *size = 3 * (num_part - 1);
+    sub     r2, r2, #1                  ; num_part - 1
+    add     r2, r2, r2, lsl #1          ; 3*(num_part - 1)
+    str     r2, [r3]
+
+    add     r2, r2, r1                  ; cx_data + *size
+    str     r2, [sp, #40]               ; ptr
+
+    ldr     r4, _VP8_COMP_tplist_
+    add     r4, r0, r4
+    ldr     r7, [r4, #0]                ; dereference cpi->tp_list
+    str     r7, [sp, #32]               ; store start of cpi->tp_list
+
+    ldr     r11, _VP8_COMP_bc2_         ; load up vp8_writer out of cpi
+    add     r0, r0, r11
+
+    mov     r11, #0
+    str     r11, [sp, #28]              ; i
+
+numparts_loop
+    ldr     r10, [sp, #40]              ; ptr
+    ldr     r5,  [sp, #36]              ; move mb_rows to the counting section
+    str     r5,  [sp, #12]
+
+    ; Reset all of the VP8 Writer data for each partition that
+    ; is processed.
+    ; start_encode
+    mov     r2, #0                      ; vp8_writer_lowvalue
+    mov     r5, #255                    ; vp8_writer_range
+    mvn     r3, #23                     ; vp8_writer_count
+
+    str     r2,  [r0, #vp8_writer_value]
+    str     r2,  [r0, #vp8_writer_pos]
+    str     r10, [r0, #vp8_writer_buffer]
+
+mb_row_loop
+
+    ldr     r1, [r7, #tokenlist_start]
+    ldr     r9, [r7, #tokenlist_stop]
+    str     r9, [sp, #0]                ; save stop for later comparison
+    str     r7, [sp, #16]               ; tokenlist address for next time
+
+    b       check_p_lt_stop
+
+    ; actual work gets done here!
+
+while_p_lt_stop
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r4, [sp, #80]               ; vp8_coef_encodings
+    mov     lr, #0
+    add     r4, r4, r6, lsl #3          ; a = vp8_coef_encodings + t
+    ldr     r9, [r1, #tokenextra_context_tree]   ; pp
+
+    ldr     r7, [r1, #tokenextra_skip_eob_node]
+
+    ldr     r6, [r4, #vp8_token_value]  ; v
+    ldr     r8, [r4, #vp8_token_len]    ; n
+
+    ; vp8 specific skip_eob_node
+    cmp     r7, #0
+    movne   lr, #2                      ; i = 2
+    subne   r8, r8, #1                  ; --n
+
+    ; reverse the stream of bits to be packed.  Normally
+    ; the most significant bit is peeled off and compared
+    ; in the form of (v >> --n) & 1.  ARM architecture has
+    ; the ability to set a flag based on the value of the
+    ; bit shifted off the bottom of the register.  To make
+    ; that happen the bitstream is reversed.
+    rbit    r12, r6
+    rsb     r4, r8, #32                 ; 32-n
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+    ; v is kept in r12 during the token pack loop
+    lsr     r12, r12, r4                ; v >>= 32 - n
+
+; loop start
+token_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp [i>>1]
+    sub     r7, r5, #1                  ; range-1
+
+    ; Decisions are made based on the bit value shifted
+    ; off of v, so set a flag here based on this.
+    ; This value is referred to as "bb"
+    lsrs    r12, r12, #1                ; bb = v >> n
+    mul     r4, r4, r7                  ; ((range-1) * pp[i>>1]))
+
+    ; bb can only be 0 or 1.  So only execute this statement
+    ; if bb == 1, otherwise it will act like i + 0
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = vp8_coef_tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
+
+    ; Flag is set on the sum of count.  This flag is used later
+    ; to determine if count >= 0
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     token_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start
+token_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+    ; r10 holds vp8_coef_tree earlier in the loop, but is used as a
+    ; temp variable here.  So after r10 is used, reload
+    ; vp8_coef_tree into r10
+    ldr     r10, [sp, #88]              ; vp8_coef_tree
+
+token_count_lt_zero
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r8, r8, #1                  ; --n
+    bne     token_loop
+
+    ldr     r6, [r1, #tokenextra_token] ; t
+    ldr     r7, [sp, #84]                ; vp8_extra_bits
+    ; Add t * sizeof (vp8_extra_bit_struct) to get the desired
+    ;  element.  sizeof (vp8_extra_bit_struct) is 20
+    add     r6, r6, r6, lsl #2          ; t * 5
+    add     r12, r7, r6, lsl #2         ; b = vp8_extra_bits + t
+
+    ldr     r4, [r12, #vp8_extra_bit_struct_base_val]
+    cmp     r4, #0
+    beq     skip_extra_bits
+
+;   if( b->base_val)
+    ldr     r8, [r12, #vp8_extra_bit_struct_len] ; L
+    ldr     lr, [r1, #tokenextra_extra] ; e = p->Extra
+    cmp     r8, #0                      ; if( L)
+    beq     no_extra_bits
+
+    ldr     r9, [r12, #vp8_extra_bit_struct_prob]
+    asr     r7, lr, #1                  ; v=e>>1
+
+    ldr     r10, [r12, #vp8_extra_bit_struct_tree]
+    str     r10, [sp, #4]               ; b->tree
+
+    rbit    r12, r7                     ; reverse v
+    rsb     r4, r8, #32
+    lsr     r12, r12, r4
+
+    mov     lr, #0                      ; i = 0
+
+extra_bits_loop
+    ldrb    r4, [r9, lr, asr #1]        ; pp[i>>1]
+    sub     r7, r5, #1                  ; range-1
+    lsrs    r12, r12, #1                ; v >> n
+    mul     r4, r4, r7                  ; (range-1) * pp[i>>1]
+    addcs   lr, lr, #1                  ; i + bb
+
+    mov     r7, #1
+    ldrsb   lr, [r10, lr]               ; i = b->tree[i+bb]
+    add     r4, r7, r4, lsr #8          ; split = 1 +  (((range-1) * pp[i>>1]) >> 8)
+
+    addcs   r2, r2, r4                  ; if  (bb) lowvalue += split
+    subcs   r4, r5, r4                  ; if  (bb) range = range-split
+
+    clz     r6, r4
+    sub     r6, r6, #24
+
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi     extra_count_lt_zero         ; if(count >= 0)
+
+    sub     r6, r6, r3                  ; offset= shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     extra_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos - 1
+    b       extra_zero_while_start
+extra_zero_while_loop
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] =(unsigned char)0
+    sub     r4, r4, #1                  ; x--
+extra_zero_while_start
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     extra_zero_while_loop
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]
+extra_high_bit_not_set
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]=(lowvalue >> (24-offset))
+    ldr     r10, [sp, #4]               ; b->tree
+extra_count_lt_zero
+    lsl     r2, r2, r6
+
+    subs    r8, r8, #1                  ; --n
+    bne     extra_bits_loop             ; while (n)
+
+no_extra_bits
+    ldr     lr, [r1, #4]                ; e = p->Extra
+    add     r4, r5, #1                  ; range + 1
+    tst     lr, #1
+    lsr     r4, r4, #1                  ; split = (range + 1) >> 1
+    addne   r2, r2, r4                  ; lowvalue += split
+    subne   r4, r5, r4                  ; range = range-split
+    tst     r2, #0x80000000             ; lowvalue & 0x80000000
+    lsl     r5, r4, #1                  ; range <<= 1
+    beq     end_high_bit_not_set
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mov     r7, #0
+    sub     r4, r4, #1
+    b       end_zero_while_start
+end_zero_while_loop
+    strb    r7, [r6, r4]
+    sub     r4, r4, #1                  ; x--
+end_zero_while_start
+    cmp     r4, #0
+    ldrge   r6, [r0, #vp8_writer_buffer]
+    ldrb    r12, [r6, r4]
+    cmpge   r12, #0xff
+    beq     end_zero_while_loop
+
+    ldr     r6, [r0, #vp8_writer_buffer]
+    ldrb    r7, [r6, r4]
+    add     r7, r7, #1
+    strb    r7, [r6, r4]
+end_high_bit_not_set
+    adds    r3, r3, #1                  ; ++count
+    lsl     r2, r2, #1                  ; lowvalue  <<= 1
+    bne     end_count_zero
+
+    ldr     r4, [r0, #vp8_writer_pos]
+    mvn     r3, #7
+    ldr     r7, [r0, #vp8_writer_buffer]
+    lsr     r6, r2, #24                 ; lowvalue >> 24
+    add     r12, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r12, [r0, #0x10]
+    strb    r6, [r7, r4]
+end_count_zero
+skip_extra_bits
+    add     r1, r1, #TOKENEXTRA_SZ      ; ++p
+check_p_lt_stop
+    ldr     r4, [sp, #0]                ; stop
+    cmp     r1, r4                      ; while( p < stop)
+    bcc     while_p_lt_stop
+
+    ldr     r10, [sp, #20]              ; num_parts
+    mov     r1, #TOKENLIST_SZ
+    mul     r1, r10, r1
+
+    ldr     r6, [sp, #12]               ; mb_rows
+    ldr     r7, [sp, #16]               ; tokenlist address
+    subs    r6, r6, r10
+    add     r7, r7, r1                  ; next element in the array
+    str     r6, [sp, #12]
+    bgt     mb_row_loop
+
+    mov     r12, #32
+
+stop_encode_loop
+    sub     r7, r5, #1                  ; range-1
+
+    mov     r4, r7, lsl #7              ; ((range-1) * 128)
+
+    mov     r7, #1
+    add     r4, r7, r4, lsr #8          ; 1 + (((range-1) * 128) >> 8)
+
+    ; Counting the leading zeros is used to normalize range.
+    clz     r6, r4
+    sub     r6, r6, #24                 ; shift
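+
+    ; the updated range (the split) may need up to 7 doublings to get back
+    ; into [128, 255]; clz of the 32-bit value minus 24 gives that shift
+    ; directly (the C encoder reads it from the vp8_norm[] table instead)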
+
+    ; The adds updates the flags from the new count; the N flag is tested
+    ; below to decide whether count is still negative
+    adds    r3, r3, r6                  ; count += shift
+    lsl     r5, r4, r6                  ; range <<= shift
+    bmi         token_count_lt_zero_se      ; branch if count < 0 (bytes are emitted only when count >= 0)
+
+    sub     r6, r6, r3                  ; offset = shift - count
+    sub     r4, r6, #1                  ; offset-1
+    lsls    r4, r2, r4                  ; if((lowvalue<<(offset-1)) & 0x80000000 )
+    bpl     token_high_bit_not_set_se
+
+    ldr     r4, [r0, #vp8_writer_pos]   ; x
+    sub     r4, r4, #1                  ; x = w->pos-1
+    b       token_zero_while_start_se
+token_zero_while_loop_se
+    mov     r10, #0
+    strb    r10, [r7, r4]               ; w->buffer[x] = (unsigned char)0
+    sub     r4, r4, #1                  ; x--
+token_zero_while_start_se
+    cmp     r4, #0
+    ldrge   r7, [r0, #vp8_writer_buffer]
+    ldrb    r11, [r7, r4]
+    cmpge   r11, #0xff
+    beq     token_zero_while_loop_se
+
+    ldr     r7, [r0, #vp8_writer_buffer]
+    ldrb    r10, [r7, r4]               ; w->buffer[x]
+    add     r10, r10, #1
+    strb    r10, [r7, r4]               ; w->buffer[x] + 1
+token_high_bit_not_set_se
+    rsb     r4, r6, #24                 ; 24-offset
+    ldr     r10, [r0, #vp8_writer_buffer]
+    lsr     r7, r2, r4                  ; lowvalue >> (24-offset)
+    ldr     r4, [r0, #vp8_writer_pos]   ; w->pos
+    lsl     r2, r2, r6                  ; lowvalue <<= offset
+    mov     r6, r3                      ; shift = count
+    add     r11, r4, #1                 ; w->pos++
+    bic     r2, r2, #0xff000000         ; lowvalue &= 0xffffff
+    str     r11, [r0, #vp8_writer_pos]
+    sub     r3, r3, #8                  ; count -= 8
+    strb    r7, [r10, r4]               ; w->buffer[w->pos++]
+
+token_count_lt_zero_se
+    lsl     r2, r2, r6                  ; lowvalue <<= shift
+
+    subs    r12, r12, #1
+    bne     stop_encode_loop
+
+    ldr     r10, [sp, #8]               ; *size
+    ldr     r11, [r10]
+    ldr     r4,  [r0, #vp8_writer_pos]  ; w->pos
+    add     r11, r11, r4                ; *size += w->pos
+    str     r11, [r10]
+
+    ldr     r9, [sp, #20]               ; num_parts
+    sub     r9, r9, #1
+    ldr     r10, [sp, #28]              ; i
+    cmp     r10, r9                     ; if (i < (num_parts - 1))
+    bge     skip_write_partition
+
+    ldr     r12, [sp, #40]              ; ptr
+    add     r12, r12, r4                ; ptr += w->pos
+    str     r12, [sp, #40]
+
+    ldr     r9, [sp, #24]               ; cx_data
+    mov     r8, r4, asr #8
+    strb    r4, [r9, #0]
+    strb    r8, [r9, #1]
+    mov     r4, r4, asr #16
+    strb    r4, [r9, #2]
+
+    add     r9, r9, #3                  ; cx_data += 3
+    str     r9, [sp, #24]
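+
+    ; the sizes of all partitions but the last are emitted as 3-byte
+    ; little-endian values; the byte stores above are, in C:
+    ;     cx_data[0] = w->pos & 0xff;
+    ;     cx_data[1] = (w->pos >> 8) & 0xff;
+    ;     cx_data[2] = (w->pos >> 16) & 0xff;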
+
+skip_write_partition
+
+    ldr     r11, [sp, #28]              ; i
+    ldr     r10, [sp, #20]              ; num_parts
+
+    add     r11, r11, #1                ; i++
+    str     r11, [sp, #28]
+
+    ldr     r7, [sp, #32]               ; cpi->tp_list[i]
+    mov     r1, #TOKENLIST_SZ
+    add     r7, r7, r1                  ; next element in cpi->tp_list
+    str     r7, [sp, #32]               ; cpi->tp_list[i+1]
+
+    cmp     r10, r11
+    bgt     numparts_loop
+
+
+    add     sp, sp, #44
+    pop     {r4-r11, pc}
+    ENDP
+
+_VP8_COMP_common_
+    DCD     vp8_comp_common
+_VP8_COMMON_MBrows_
+    DCD     vp8_common_mb_rows
+_VP8_COMP_tplist_
+    DCD     vp8_comp_tplist
+_VP8_COMP_bc2_
+    DCD     vp8_comp_bc2
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
new file mode 100644
index 0000000..5269c0a
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_shortwalsh4x4_neon.asm
@@ -0,0 +1,75 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_short_walsh4x4_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+;void vp8_short_walsh4x4_neon(short *input, short *output, int pitch)
+
+|vp8_short_walsh4x4_neon| PROC
+    vld1.16         {d2}, [r0], r2              ;load input
+    vld1.16         {d3}, [r0], r2
+    vld1.16         {d4}, [r0], r2
+    vld1.16         {d5}, [r0], r2
+
+    ;First for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[1], d4=ip[2], d5=ip[3]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
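+
+    ;together the two vtrn.32 and two vtrn.16 steps transpose the 4x4 block
+    ;of 16-bit coefficients: in C, out[c][r] = in[r][c] for all r, c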
+
+    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[3]
+    vadd.s16        d7, d3, d4              ;b1 = ip[1]+ip[2]
+    vsub.s16        d8, d3, d4              ;c1 = ip[1]-ip[2]
+    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[3]
+
+    vadd.s16        d2, d6, d7             ;op[0] = a1 + b1
+    vsub.s16        d4, d6, d7             ;op[2] = a1 - b1
+    vadd.s16        d3, d8, d9             ;op[1] = c1 + d1
+    vsub.s16        d5, d9, d8             ;op[3] = d1 - c1
+
+    ;Second for-loop
+    ;transpose d2, d3, d4, d5. Then, d2=ip[0], d3=ip[4], d4=ip[8], d5=ip[12]
+    vtrn.32         d2, d4
+    vtrn.32         d3, d5
+    vtrn.16         d2, d3
+    vtrn.16         d4, d5
+
+    vadd.s16        d6, d2, d5              ;a1 = ip[0]+ip[12]
+    vadd.s16        d7, d3, d4              ;b1 = ip[4]+ip[8]
+    vsub.s16        d8, d3, d4              ;c1 = ip[4]-ip[8]
+    vsub.s16        d9, d2, d5              ;d1 = ip[0]-ip[12]
+
+    vadd.s16        d2, d6, d7              ;a2 = a1 + b1;
+    vsub.s16        d4, d6, d7              ;c2 = a1 - b1;
+    vadd.s16        d3, d8, d9              ;b2 = c1 + d1;
+    vsub.s16        d5, d9, d8              ;d2 = d1 - c1;
+
+    vcgt.s16        q3, q1, #0
+    vcgt.s16        q4, q2, #0
+
+    vsub.s16        q1, q1, q3
+    vsub.s16        q2, q2, q4
+
+    vshr.s16        q1, q1, #1
+    vshr.s16        q2, q2, #1
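+
+    ;vcgt writes -1 where a value is positive, so the vsub adds 1 there;
+    ;per coefficient this computes: op[i] = (op[i] + (op[i] > 0)) >> 1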
+
+    vst1.16         {q1, q2}, [r1]
+
+    bx              lr
+
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
new file mode 100644
index 0000000..aec716e
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16_neon.asm
@@ -0,0 +1,427 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sub_pixel_variance16x16_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict16x16_neon and vp8_variance16x16_neon.
+
+|vp8_sub_pixel_variance16x16_neon| PROC
+    push            {r4-r6, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #16]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #20]           ;load dst_pixels_per_line from stack
+    ldr             r6, [sp, #24]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16_only
+
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+
+    vld1.s32        {d31}, [r2]             ;load first_pass filter
+
+    beq             firstpass_bfilter16x16_only
+
+    sub             sp, sp, #272            ;reserve space on stack for temporary storage
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    mov             lr, sp
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    mov             r2, #3                  ;loop counter
+    vld1.u8         {d8, d9, d10}, [r0], r1
+
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    vdup.8          d1, d31[4]
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16_loop_neon
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vqrshrn.u16    d21, q14, #7
+    vld1.u8         {d5, d6, d7}, [r0], r1
+
+    vst1.u8         {d14, d15, d16, d17}, [lr]!     ;store result
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vst1.u8         {d18, d19, d20, d21}, [lr]!
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    bne             vp8e_filt_blk2d_fp16x16_loop_neon
+
+;First-pass filtering for the remaining 5 lines
+    vld1.u8         {d14, d15, d16}, [r0], r1
+
+    vmull.u8        q9, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q10, d3, d0
+    vmull.u8        q11, d5, d0
+    vmull.u8        q12, d6, d0
+    vmull.u8        q13, d8, d0
+    vmull.u8        q14, d9, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+
+    vmlal.u8        q9, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q11, d5, d1
+    vmlal.u8        q13, d8, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+
+    vmlal.u8        q10, d3, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q12, d6, d1
+    vmlal.u8        q14, d9, d1
+
+    vmull.u8        q1, d11, d0
+    vmull.u8        q2, d12, d0
+    vmull.u8        q3, d14, d0
+    vmull.u8        q4, d15, d0
+
+    vext.8          d11, d11, d12, #1       ;construct src_ptr[1]
+    vext.8          d14, d14, d15, #1
+
+    vmlal.u8        q1, d11, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q3, d14, d1
+
+    vext.8          d12, d12, d13, #1
+    vext.8          d15, d15, d16, #1
+
+    vmlal.u8        q2, d12, d1             ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q4, d15, d1
+
+    vqrshrn.u16    d10, q9, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d11, q10, #7
+    vqrshrn.u16    d12, q11, #7
+    vqrshrn.u16    d13, q12, #7
+    vqrshrn.u16    d14, q13, #7
+    vqrshrn.u16    d15, q14, #7
+    vqrshrn.u16    d16, q1, #7
+    vqrshrn.u16    d17, q2, #7
+    vqrshrn.u16    d18, q3, #7
+    vqrshrn.u16    d19, q4, #7
+
+    vst1.u8         {d10, d11, d12, d13}, [lr]!         ;store result
+    vst1.u8         {d14, d15, d16, d17}, [lr]!
+    vst1.u8         {d18, d19}, [lr]!
+
+;Second pass: 16x16
+;secondpass_filter
+    add             r3, r12, r3, lsl #3
+    sub             lr, lr, #272
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    sub             sp, sp, #256
+    mov             r3, sp
+
+    vld1.u8         {d22, d23}, [lr]!       ;load src data
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r12, #4                 ;loop counter
+
+vp8e_filt_blk2d_sp16x16_loop_neon
+    vld1.u8         {d24, d25}, [lr]!
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [lr]!
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [lr]!
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [lr]!
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    subs            r12, r12, #1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5}, [r3]!
+    vst1.u8         {d6, d7}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_sp16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;--------------------
+firstpass_bfilter16x16_only
+    mov             r2, #4                      ;loop counter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    vdup.8          d0, d31[0]                  ;first_pass filter (d0 d1)
+    vdup.8          d1, d31[4]
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16_loop_neon
+    vld1.u8         {d2, d3, d4}, [r0], r1      ;load src data
+    vld1.u8         {d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10}, [r0], r1
+    vld1.u8         {d11, d12, d13}, [r0], r1
+
+    pld             [r0]
+    pld             [r0, r1]
+    pld             [r0, r1, lsl #1]
+
+    vmull.u8        q7, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q8, d3, d0
+    vmull.u8        q9, d5, d0
+    vmull.u8        q10, d6, d0
+    vmull.u8        q11, d8, d0
+    vmull.u8        q12, d9, d0
+    vmull.u8        q13, d11, d0
+    vmull.u8        q14, d12, d0
+
+    vext.8          d2, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d5, d6, #1
+    vext.8          d8, d8, d9, #1
+    vext.8          d11, d11, d12, #1
+
+    vmlal.u8        q7, d2, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q9, d5, d1
+    vmlal.u8        q11, d8, d1
+    vmlal.u8        q13, d11, d1
+
+    vext.8          d3, d3, d4, #1
+    vext.8          d6, d6, d7, #1
+    vext.8          d9, d9, d10, #1
+    vext.8          d12, d12, d13, #1
+
+    vmlal.u8        q8, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q10, d6, d1
+    vmlal.u8        q12, d9, d1
+    vmlal.u8        q14, d12, d1
+
+    subs            r2, r2, #1
+
+    vqrshrn.u16    d14, q7, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d15, q8, #7
+    vqrshrn.u16    d16, q9, #7
+    vqrshrn.u16    d17, q10, #7
+    vqrshrn.u16    d18, q11, #7
+    vqrshrn.u16    d19, q12, #7
+    vqrshrn.u16    d20, q13, #7
+    vst1.u8         {d14, d15}, [r3]!       ;store result
+    vqrshrn.u16    d21, q14, #7
+
+    vst1.u8         {d16, d17}, [r3]!
+    vst1.u8         {d18, d19}, [r3]!
+    vst1.u8         {d20, d21}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;---------------------
+secondpass_bfilter16x16_only
+;Second pass: 16x16
+;secondpass_filter
+    sub             sp, sp, #528            ;reserve space on stack for temporary storage
+    add             r3, r12, r3, lsl #3
+    mov             r12, #4                     ;loop counter
+    vld1.u32        {d31}, [r3]                 ;load second_pass filter
+    vld1.u8         {d22, d23}, [r0], r1        ;load src data
+    mov             r3, sp
+
+    vdup.8          d0, d31[0]                  ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+vp8e_filt_blk2d_spo16x16_loop_neon
+    vld1.u8         {d24, d25}, [r0], r1
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vld1.u8         {d26, d27}, [r0], r1
+    vmull.u8        q2, d23, d0
+    vld1.u8         {d28, d29}, [r0], r1
+    vmull.u8        q3, d24, d0
+    vld1.u8         {d30, d31}, [r0], r1
+
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d24, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d25, d1
+    vmlal.u8        q3, d26, d1
+    vmlal.u8        q4, d27, d1
+    vmlal.u8        q5, d28, d1
+    vmlal.u8        q6, d29, d1
+    vmlal.u8        q7, d30, d1
+    vmlal.u8        q8, d31, d1
+
+    vqrshrn.u16    d2, q1, #7               ;shift/round/saturate to u8
+    vqrshrn.u16    d3, q2, #7
+    vqrshrn.u16    d4, q3, #7
+    vqrshrn.u16    d5, q4, #7
+    vqrshrn.u16    d6, q5, #7
+    vqrshrn.u16    d7, q6, #7
+    vqrshrn.u16    d8, q7, #7
+    vqrshrn.u16    d9, q8, #7
+
+    vst1.u8         {d2, d3}, [r3]!         ;store result
+    subs            r12, r12, #1
+    vst1.u8         {d4, d5}, [r3]!
+    vmov            q11, q15
+    vst1.u8         {d6, d7}, [r3]!
+    vst1.u8         {d8, d9}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16_loop_neon
+
+    b               sub_pixel_variance16x16_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r12, #8
+
+sub_pixel_variance16x16_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q2}, [r4], r5
+    vld1.8          {q1}, [r3]!
+    vld1.8          {q3}, [r4], r5
+
+    vsubl.u8        q11, d0, d4                 ;diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             sub_pixel_variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r6]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
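+
+    ;variance = sse - sum*sum/256: the #8 shift divides the squared sum by
+    ;the 256 pixels of a 16x16 block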
+
+    add             sp, sp, #528
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4-r6,pc}
+
+    ENDP
+
+;-----------------
+    AREA    vp8e_bilinear_taps_dat, DATA, READWRITE          ;read/write by default
+;Sixteen words of bilinear filter taps, two per x/y offset; the code loads
+;the table address PC-relative through _BilinearTaps_coeff_ and indexes it
+;at bilinear_taps_coeff + (offset << 3).
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
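+
+;eight {Filter[0], Filter[1]} pairs, one per eighth-pel offset 0..7; each
+;pair sums to 128, so the #7 shifts in the filter loops renormalize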
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
new file mode 100644
index 0000000..3d02d7c
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -0,0 +1,571 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sub_pixel_variance16x16s_4_0_neon|
+    EXPORT  |vp8_sub_pixel_variance16x16s_0_4_neon|
+    EXPORT  |vp8_sub_pixel_variance16x16s_4_4_neon|
+    EXPORT  |vp8_sub_pixel_variance16x16s_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+;================================================
+;unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp8_sub_pixel_variance16x16s_4_0_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                  ;loop counter
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8_filt_fpo16x16s_4_0_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.8          {q11}, [r2], r3
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.8          {q12}, [r2], r3
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.8          {q13}, [r2], r3
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vext.8          q3, q2, q3, #1
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vld1.8          {q14}, [r2], r3
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+
+    vsubl.u8        q4, d0, d22                 ;diff
+    vsubl.u8        q5, d1, d23
+    vsubl.u8        q6, d2, d24
+    vsubl.u8        q7, d3, d25
+    vsubl.u8        q0, d4, d26
+    vsubl.u8        q1, d5, d27
+    vsubl.u8        q2, d6, d28
+    vsubl.u8        q3, d7, d29
+
+    vpadal.s16      q8, q4                     ;sum
+    vmlal.s16       q9, d8, d8                ;sse
+    vmlal.s16       q10, d9, d9
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_fpo16x16s_4_0_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp8_sub_pixel_variance16x16s_0_4_neon| PROC
+    push            {lr}
+
+    mov             r12, #4                     ;loop counter
+
+    vld1.u8         {q0}, [r0], r1              ;load src data
+    ldr             lr, [sp, #4]                ;load *sse from stack
+
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+vp8_filt_spo16x16s_0_4_loop_neon
+    vld1.u8         {q2}, [r0], r1
+    vld1.8          {q1}, [r2], r3
+    vld1.u8         {q4}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+    vld1.u8         {q6}, [r0], r1
+    vld1.8          {q5}, [r2], r3
+    vld1.u8         {q15}, [r0], r1
+
+    vrhadd.u8       q0, q0, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q4
+    vrhadd.u8       q4, q4, q6
+    vrhadd.u8       q6, q6, q15
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                 ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+
+    vmov            q0, q15
+
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             vp8_filt_spo16x16s_0_4_loop_neon
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;================================================
+;unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
+;(
+;    unsigned char  *src_ptr, r0
+;    int  src_pixels_per_line,  r1
+;    unsigned char *dst_ptr,  r2
+;    int dst_pixels_per_line,   r3
+;    unsigned int *sse
+;);
+;================================================
+|vp8_sub_pixel_variance16x16s_4_4_neon| PROC
+    push            {lr}
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+
+    ldr             lr, [sp, #4]           ;load *sse from stack
+    vmov.i8         q13, #0                      ;q13 - sum
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+
+    vmov.i8         q14, #0                      ;q14, q15 - sse
+    vmov.i8         q15, #0
+
+    mov             r12, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8_filt16x16s_4_4_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vld1.8          {q5}, [r2], r3
+    vrhadd.u8       q0, q0, q1
+    vld1.8          {q6}, [r2], r3
+    vrhadd.u8       q1, q1, q2
+    vld1.8          {q7}, [r2], r3
+    vrhadd.u8       q2, q2, q3
+    vld1.8          {q8}, [r2], r3
+    vrhadd.u8       q3, q3, q4
+
+    vsubl.u8        q9, d0, d10                 ;diff
+    vsubl.u8        q10, d1, d11
+    vsubl.u8        q11, d2, d12
+    vsubl.u8        q12, d3, d13
+
+    vsubl.u8        q0, d4, d14                 ;diff
+    vsubl.u8        q1, d5, d15
+    vsubl.u8        q5, d6, d16
+    vsubl.u8        q6, d7, d17
+
+    vpadal.s16      q13, q9                     ;sum
+    vmlal.s16       q14, d18, d18                ;sse
+    vmlal.s16       q15, d19, d19
+
+    vpadal.s16      q13, q10                     ;sum
+    vmlal.s16       q14, d20, d20                ;sse
+    vmlal.s16       q15, d21, d21
+
+    vpadal.s16      q13, q11                     ;sum
+    vmlal.s16       q14, d22, d22                ;sse
+    vmlal.s16       q15, d23, d23
+
+    vpadal.s16      q13, q12                     ;sum
+    vmlal.s16       q14, d24, d24                ;sse
+    vmlal.s16       q15, d25, d25
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q13, q0                     ;sum
+    vmlal.s16       q14, d0, d0                ;sse
+    vmlal.s16       q15, d1, d1
+
+    vpadal.s16      q13, q1                     ;sum
+    vmlal.s16       q14, d2, d2                ;sse
+    vmlal.s16       q15, d3, d3
+
+    vpadal.s16      q13, q5                     ;sum
+    vmlal.s16       q14, d10, d10                ;sse
+    vmlal.s16       q15, d11, d11
+
+    vmov            q0, q4
+
+    vpadal.s16      q13, q6                     ;sum
+    vmlal.s16       q14, d12, d12                ;sse
+    vmlal.s16       q15, d13, d13
+
+    bne             vp8_filt16x16s_4_4_loop_neon
+
+    vadd.u32        q15, q14, q15                ;accumulate sse
+    vpaddl.s32      q0, q13                      ;accumulate sum
+
+    vpaddl.u32      q1, q15
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {pc}
+    ENDP
+
+;==============================
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack unsigned char *dst_ptr,
+; stack int dst_pixels_per_line,
+; stack unsigned int *sse
+;note: this routine serves vp8_find_best_half_pixel_step() (called when 8 < Speed < 15)
+;and the first call of vp8_find_best_sub_pixel_step() (called when Speed <= 8). In both
+;cases xoffset/yoffset can only be 4 or 0, i.e. the filter is either bypassed or its
+;coefficients are {64, 64}; this simplified routine only handles that situation.
+;note: xoffset and yoffset can both be zero; that case is handled in C code instead.
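+;with taps {64, 64} the bilinear filter collapses to a rounded average,
+;((a*64 + b*64 + 64) >> 7) == ((a + b + 1) >> 1), which is exactly what
+;the vrhadd.u8 instructions below compute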
+
+|vp8_sub_pixel_variance16x16s_neon| PROC
+    push            {r4, lr}
+
+    ldr             r4, [sp, #8]            ;load *dst_ptr from stack
+    ldr             r12, [sp, #12]          ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #16]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             secondpass_bfilter16x16s_only
+
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    beq             firstpass_bfilter16x16s_only
+
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    mov             r3, sp
+    mov             r2, #4                  ;loop counter
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+
+;First Pass: output_height lines x output_width columns (17x16)
+vp8e_filt_blk2d_fp16x16s_loop_neon
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q3, q2, q3, #1          ;construct src_ptr[1]
+    vext.8          q5, q4, q5, #1
+    vext.8          q7, q6, q7, #1
+    vext.8          q9, q8, q9, #1
+
+    vrhadd.u8       q1, q2, q3              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+
+    vrhadd.u8       q0, q0, q1
+    vrhadd.u8       q1, q1, q2
+    vrhadd.u8       q2, q2, q3
+    vrhadd.u8       q3, q3, q4
+
+    subs            r2, r2, #1
+    vst1.u8         {d0, d1 ,d2, d3}, [r3]!         ;store result
+    vmov            q0, q4
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+
+    bne             vp8e_filt_blk2d_fp16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;--------------------
+firstpass_bfilter16x16s_only
+    mov             r2, #2                  ;loop counter
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+    mov             r3, sp
+
+;First Pass: output_height lines x output_width columns (16x16)
+vp8e_filt_blk2d_fpo16x16s_loop_neon
+    vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
+    vld1.u8         {d4, d5, d6, d7}, [r0], r1
+    vld1.u8         {d8, d9, d10, d11}, [r0], r1
+    vld1.u8         {d12, d13, d14, d15}, [r0], r1
+
+    ;pld                [r0]
+    ;pld                [r0, r1]
+    ;pld                [r0, r1, lsl #1]
+
+    vext.8          q1, q0, q1, #1          ;construct src_ptr[1]
+    vld1.u8         {d16, d17, d18, d19}, [r0], r1
+    vext.8          q3, q2, q3, #1
+    vld1.u8         {d20, d21, d22, d23}, [r0], r1
+    vext.8          q5, q4, q5, #1
+    vld1.u8         {d24, d25, d26, d27}, [r0], r1
+    vext.8          q7, q6, q7, #1
+    vld1.u8         {d28, d29, d30, d31}, [r0], r1
+    vext.8          q9, q8, q9, #1
+    vext.8          q11, q10, q11, #1
+    vext.8          q13, q12, q13, #1
+    vext.8          q15, q14, q15, #1
+
+    vrhadd.u8       q0, q0, q1              ;(src_ptr[0]+src_ptr[1])/round/shift right 1
+    vrhadd.u8       q1, q2, q3
+    vrhadd.u8       q2, q4, q5
+    vrhadd.u8       q3, q6, q7
+    vrhadd.u8       q4, q8, q9
+    vrhadd.u8       q5, q10, q11
+    vrhadd.u8       q6, q12, q13
+    vrhadd.u8       q7, q14, q15
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_fpo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;---------------------
+secondpass_bfilter16x16s_only
+    sub             sp, sp, #256            ;reserve space on stack for temporary storage
+
+    mov             r2, #2                  ;loop counter
+    vld1.u8         {d0, d1}, [r0], r1      ;load src data
+    mov             r3, sp
+
+vp8e_filt_blk2d_spo16x16s_loop_neon
+    vld1.u8         {d2, d3}, [r0], r1
+    vld1.u8         {d4, d5}, [r0], r1
+    vld1.u8         {d6, d7}, [r0], r1
+    vld1.u8         {d8, d9}, [r0], r1
+
+    vrhadd.u8       q0, q0, q1
+    vld1.u8         {d10, d11}, [r0], r1
+    vrhadd.u8       q1, q1, q2
+    vld1.u8         {d12, d13}, [r0], r1
+    vrhadd.u8       q2, q2, q3
+    vld1.u8         {d14, d15}, [r0], r1
+    vrhadd.u8       q3, q3, q4
+    vld1.u8         {d16, d17}, [r0], r1
+    vrhadd.u8       q4, q4, q5
+    vrhadd.u8       q5, q5, q6
+    vrhadd.u8       q6, q6, q7
+    vrhadd.u8       q7, q7, q8
+
+    subs            r2, r2, #1
+
+    vst1.u8         {d0, d1, d2, d3}, [r3]!         ;store result
+    vmov            q0, q8
+    vst1.u8         {d4, d5, d6, d7}, [r3]!
+    vst1.u8         {d8, d9, d10, d11}, [r3]!           ;store result
+    vst1.u8         {d12, d13, d14, d15}, [r3]!
+
+    bne             vp8e_filt_blk2d_spo16x16s_loop_neon
+
+    b               sub_pixel_variance16x16s_neon
+
+;----------------------------
+;variance16x16
+sub_pixel_variance16x16s_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    sub             r3, r3, #256
+    mov             r2, #4
+
+sub_pixel_variance16x16s_neon_loop
+    vld1.8          {q0}, [r3]!                 ;Load up source and reference
+    vld1.8          {q1}, [r4], r12
+    vld1.8          {q2}, [r3]!
+    vld1.8          {q3}, [r4], r12
+    vld1.8          {q4}, [r3]!
+    vld1.8          {q5}, [r4], r12
+    vld1.8          {q6}, [r3]!
+    vld1.8          {q7}, [r4], r12
+
+    vsubl.u8        q11, d0, d2                 ;diff
+    vsubl.u8        q12, d1, d3
+    vsubl.u8        q13, d4, d6
+    vsubl.u8        q14, d5, d7
+    vsubl.u8        q0, d8, d10
+    vsubl.u8        q1, d9, d11
+    vsubl.u8        q2, d12, d14
+    vsubl.u8        q3, d13, d15
+
+    vpadal.s16      q8, q11                     ;sum
+    vmlal.s16       q9, d22, d22                ;sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r2, r2, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    vpadal.s16      q8, q0                     ;sum
+    vmlal.s16       q9, d0, d0                ;sse
+    vmlal.s16       q10, d1, d1
+    vpadal.s16      q8, q1
+    vmlal.s16       q9, d2, d2
+    vmlal.s16       q10, d3, d3
+    vpadal.s16      q8, q2
+    vmlal.s16       q9, d4, d4
+    vmlal.s16       q10, d5, d5
+    vpadal.s16      q8, q3
+    vmlal.s16       q9, d6, d6
+    vmlal.s16       q10, d7, d7
+
+    bne             sub_pixel_variance16x16s_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #8
+    vsub.s32        d0, d1, d10
+
+    add             sp, sp, #256
+    vmov.32         r0, d0[0]                   ;return
+
+    pop             {r4, pc}
+    ENDP
+
+    END
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
new file mode 100644
index 0000000..bd56761
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance8x8_neon.asm
@@ -0,0 +1,226 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_sub_pixel_variance8x8_neon|
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+; r0    unsigned char  *src_ptr,
+; r1    int  src_pixels_per_line,
+; r2    int  xoffset,
+; r3    int  yoffset,
+; stack(r4) unsigned char *dst_ptr,
+; stack(r5) int dst_pixels_per_line,
+; stack(r6) unsigned int *sse
+;note: most of the code is copied from bilinear_predict8x8_neon and vp8_variance8x8_neon.
+
+|vp8_sub_pixel_variance8x8_neon| PROC
+    push            {r4-r5, lr}
+
+    ldr             r12, _BilinearTaps_coeff_
+    ldr             r4, [sp, #12]           ;load *dst_ptr from stack
+    ldr             r5, [sp, #16]           ;load dst_pixels_per_line from stack
+    ldr             lr, [sp, #20]           ;load *sse from stack
+
+    cmp             r2, #0                  ;skip first_pass filter if xoffset=0
+    beq             skip_firstpass_filter
+
+;First pass: output_height lines x output_width columns (9x8)
+    add             r2, r12, r2, lsl #3     ;calculate filter location
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vld1.u32        {d31}, [r2]             ;load first_pass filter
+    vld1.u8         {q2}, [r0], r1
+    vdup.8          d0, d31[0]              ;first_pass filter (d0 d1)
+    vld1.u8         {q3}, [r0], r1
+    vdup.8          d1, d31[4]
+    vld1.u8         {q4}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+
+    vld1.u8         {q1}, [r0], r1          ;load src data
+    vqrshrn.u16    d22, q6, #7              ;shift/round/saturate to u8
+    vld1.u8         {q2}, [r0], r1
+    vqrshrn.u16    d23, q7, #7
+    vld1.u8         {q3}, [r0], r1
+    vqrshrn.u16    d24, q8, #7
+    vld1.u8         {q4}, [r0], r1
+    vqrshrn.u16    d25, q9, #7
+
+    ;first-pass filtering on the remaining 5 lines of data
+    vld1.u8         {q5}, [r0], r1
+
+    vmull.u8        q6, d2, d0              ;(src_ptr[0] * Filter[0])
+    vmull.u8        q7, d4, d0
+    vmull.u8        q8, d6, d0
+    vmull.u8        q9, d8, d0
+    vmull.u8        q10, d10, d0
+
+    vext.8          d3, d2, d3, #1          ;construct src_ptr[1]
+    vext.8          d5, d4, d5, #1
+    vext.8          d7, d6, d7, #1
+    vext.8          d9, d8, d9, #1
+    vext.8          d11, d10, d11, #1
+
+    vmlal.u8        q6, d3, d1              ;(src_ptr[1] * Filter[1])
+    vmlal.u8        q7, d5, d1
+    vmlal.u8        q8, d7, d1
+    vmlal.u8        q9, d9, d1
+    vmlal.u8        q10, d11, d1
+
+    vqrshrn.u16    d26, q6, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d27, q7, #7
+    vqrshrn.u16    d28, q8, #7
+    vqrshrn.u16    d29, q9, #7
+    vqrshrn.u16    d30, q10, #7
+
+;Second pass: 8x8
+secondpass_filter
+    cmp             r3, #0                  ;skip second_pass filter if yoffset=0
+    ;skip_secondpass_filter
+    beq             sub_pixel_variance8x8_neon
+
+    add             r3, r12, r3, lsl #3
+
+    vld1.u32        {d31}, [r3]             ;load second_pass filter
+
+    vdup.8          d0, d31[0]              ;second_pass filter parameters (d0 d1)
+    vdup.8          d1, d31[4]
+
+    vmull.u8        q1, d22, d0             ;(src_ptr[0] * Filter[0])
+    vmull.u8        q2, d23, d0
+    vmull.u8        q3, d24, d0
+    vmull.u8        q4, d25, d0
+    vmull.u8        q5, d26, d0
+    vmull.u8        q6, d27, d0
+    vmull.u8        q7, d28, d0
+    vmull.u8        q8, d29, d0
+
+    vmlal.u8        q1, d23, d1             ;(src_ptr[pixel_step] * Filter[1])
+    vmlal.u8        q2, d24, d1
+    vmlal.u8        q3, d25, d1
+    vmlal.u8        q4, d26, d1
+    vmlal.u8        q5, d27, d1
+    vmlal.u8        q6, d28, d1
+    vmlal.u8        q7, d29, d1
+    vmlal.u8        q8, d30, d1
+
+    vqrshrn.u16    d22, q1, #7              ;shift/round/saturate to u8
+    vqrshrn.u16    d23, q2, #7
+    vqrshrn.u16    d24, q3, #7
+    vqrshrn.u16    d25, q4, #7
+    vqrshrn.u16    d26, q5, #7
+    vqrshrn.u16    d27, q6, #7
+    vqrshrn.u16    d28, q7, #7
+    vqrshrn.u16    d29, q8, #7
+
+    b               sub_pixel_variance8x8_neon
+
+;--------------------
+skip_firstpass_filter
+    vld1.u8         {d22}, [r0], r1         ;load src data
+    vld1.u8         {d23}, [r0], r1
+    vld1.u8         {d24}, [r0], r1
+    vld1.u8         {d25}, [r0], r1
+    vld1.u8         {d26}, [r0], r1
+    vld1.u8         {d27}, [r0], r1
+    vld1.u8         {d28}, [r0], r1
+    vld1.u8         {d29}, [r0], r1
+    vld1.u8         {d30}, [r0], r1
+
+    b               secondpass_filter
+
+;----------------------
+;vp8_variance8x8_neon
+sub_pixel_variance8x8_neon
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+sub_pixel_variance8x8_neon_loop
+    vld1.8          {d0}, [r4], r5              ;load dst data
+    subs            r12, r12, #1
+    vld1.8          {d1}, [r4], r5
+    vld1.8          {d2}, [r4], r5
+    vsubl.u8        q4, d22, d0                 ;calculate diff
+    vld1.8          {d3}, [r4], r5
+
+    vsubl.u8        q5, d23, d1
+    vsubl.u8        q6, d24, d2
+
+    vpadal.s16      q8, q4                      ;sum
+    vmlal.s16       q9, d8, d8                  ;sse
+    vmlal.s16       q10, d9, d9
+
+    vsubl.u8        q7, d25, d3
+
+    vpadal.s16      q8, q5
+    vmlal.s16       q9, d10, d10
+    vmlal.s16       q10, d11, d11
+
+    vmov            q11, q13
+
+    vpadal.s16      q8, q6
+    vmlal.s16       q9, d12, d12
+    vmlal.s16       q10, d13, d13
+
+    vmov            q12, q14
+
+    vpadal.s16      q8, q7
+    vmlal.s16       q9, d14, d14
+    vmlal.s16       q10, d15, d15
+
+    bne             sub_pixel_variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [lr]               ;store sse
+    vshr.s32        d10, d10, #6
+    vsub.s32        d0, d1, d10
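+
+    ;variance = sse - sum*sum/64: the #6 shift matches the 64 pixels of an
+    ;8x8 block (the 16x16 routines shift by #8 for 256 pixels)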
+
+    vmov.32         r0, d0[0]                   ;return
+    pop             {r4-r5, pc}
+
+    ENDP
+
+;-----------------
+    AREA    bilinear_taps_dat, DATA, READWRITE           ;read/write by default
+;Sixteen words of bilinear filter taps, two per x/y offset; the code loads
+;the table address PC-relative through _BilinearTaps_coeff_ and indexes it
+;at bilinear_taps_coeff + (offset << 3).
+_BilinearTaps_coeff_
+    DCD     bilinear_taps_coeff
+bilinear_taps_coeff
+    DCD     128, 0, 112, 16, 96, 32, 80, 48, 64, 64, 48, 80, 32, 96, 16, 112
+
+    END
diff --git a/vp8/encoder/arm/picklpf_arm.c b/vp8/encoder/arm/picklpf_arm.c
new file mode 100644
index 0000000..0586e55
--- /dev/null
+++ b/vp8/encoder/arm/picklpf_arm.c
@@ -0,0 +1,49 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "alloccommon.h"
+
+extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz);
+
+
+void
+vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
+{
+    unsigned char *src_y, *dst_y;
+    int yheight;
+    int ystride;
+    int border;
+    int yoffset;
+    int linestocopy;
+
+    border   = src_ybc->border;
+    yheight  = src_ybc->y_height;
+    ystride  = src_ybc->y_stride;
+
+    linestocopy = (yheight >> (Fraction + 4));
+
+    if (linestocopy < 1)
+        linestocopy = 1;
+
+    linestocopy <<= 4;
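+
+    /* Fraction selects how much of the frame to copy: yheight >> (Fraction + 4)
+     * blocks of 16 rows; e.g. yheight = 480, Fraction = 3 copies
+     * (480 >> 7) << 4 = 48 rows, taken from around the middle of the frame. */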
+
+    yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+    src_y = src_ybc->y_buffer + yoffset;
+    dst_y = dst_ybc->y_buffer + yoffset;
+
+    //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16));
+    vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16)));
+}
diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
new file mode 100644
index 0000000..46906d3
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.c
@@ -0,0 +1,79 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "quantize.h"
+#include "entropy.h"
+#include "predictdc.h"
+
+DECLARE_ALIGNED(16, const short, vp8_rvsplus1_default_zig_zag1d[16]) =
+{
+    1,  2,  6,  7,
+    3,  5,  8,  13,
+    4,  9,  12, 14,
+    10, 11, 15, 16,
+};
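+
+/* Inverse of the default zig-zag scan with every entry incremented by one:
+ * entry i gives the scan rank (+1) of raster position i, so the encoder's
+ * eob is simply a running maximum of these values over the nonzero
+ * coefficients, as the reference C code below shows. */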
+
+
+extern int vp8_fast_quantize_b_neon_func(short *coeff_ptr, short *zbin_ptr, short *qcoeff_ptr, short *dqcoeff_ptr, short *dequant_ptr, const short *scan_mask, short *round_ptr, short *quant_ptr);
+
+void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d)
+{
+    d->eob = vp8_fast_quantize_b_neon_func(b->coeff, &b->zbin[0][0], d->qcoeff, d->dqcoeff, d->dequant[0], vp8_rvsplus1_default_zig_zag1d, &b->round[0][0], &b->quant[0][0]);
+}
+
+/*
+//neon code is written according to the following rewritten c code
+void vp8_fast_quantize_b_neon(BLOCK *b,BLOCKD *d)
+{
+    int i, rc, eob;
+    int zbin;
+    int x, x1, y, z, sz;
+    short *coeff_ptr   = &b->coeff[0];
+    short *zbin_ptr    = &b->zbin[0][0];
+    short *round_ptr   = &b->round[0][0];
+    short *quant_ptr   = &b->quant[0][0];
+    short *qcoeff_ptr  = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = &d->dequant[0][0];
+
+    eob = 0;
+
+    for(i=0;i<16;i++)
+    {
+        z    = coeff_ptr[i];
+        zbin = zbin_ptr[i] ;
+        x  = abs(z);                                    // x = abs(z)
+
+        if(x>=zbin)
+        {
+            sz = (z>>31);                               // sign of z
+            y  = ((x+round_ptr[i])*quant_ptr[i])>>16;     // quantize (x)
+            x1  = (y^sz) - sz;                          // get the sign back
+
+            qcoeff_ptr[i] = x1;                          // write to destination
+            dqcoeff_ptr[i] = x1 * dequant_ptr[i];         // dequantized value
+
+            if(y)
+            {
+                if(eob<vp8_rvsplus1_default_zig_zag1d[i])
+                    eob=(int)vp8_rvsplus1_default_zig_zag1d[i];         // last nonzero coeffs
+            }
+        }
+        else
+        {
+            qcoeff_ptr[i] = 0;                          // write to destination
+            dqcoeff_ptr[i] = 0;                         // dequantized value
+        }
+    }
+
+    d->eob = eob;
+}
+*/
diff --git a/vp8/encoder/arm/quantize_arm.h b/vp8/encoder/arm/quantize_arm.h
new file mode 100644
index 0000000..e93f0fe
--- /dev/null
+++ b/vp8/encoder/arm/quantize_arm.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef QUANTIZE_ARM_H
+#define QUANTIZE_ARM_H
+
+#if HAVE_ARMV7
+extern prototype_quantize_block(vp8_fast_quantize_b_neon);
+
+#undef  vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_neon
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
new file mode 100644
index 0000000..d9fc9b3
--- /dev/null
+++ b/vp8/encoder/arm/variance_arm.h
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_ARM_H
+#define VARIANCE_ARM_H
+
+#if HAVE_ARMV7
+extern prototype_sad(vp8_sad4x4_neon);
+extern prototype_sad(vp8_sad8x8_neon);
+extern prototype_sad(vp8_sad8x16_neon);
+extern prototype_sad(vp8_sad16x8_neon);
+extern prototype_sad(vp8_sad16x16_neon);
+
+//extern prototype_variance(vp8_variance4x4_c);
+extern prototype_variance(vp8_variance8x8_neon);
+extern prototype_variance(vp8_variance8x16_neon);
+extern prototype_variance(vp8_variance16x8_neon);
+extern prototype_variance(vp8_variance16x16_neon);
+
+//extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_c);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_neon);
+//extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
+//extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+
+//extern prototype_getmbss(vp8_get_mb_ss_c);
+extern prototype_variance(vp8_mse16x16_neon);
+extern prototype_sad(vp8_get16x16pred_error_neon);
+//extern prototype_variance2(vp8_get8x8var_c);
+//extern prototype_variance2(vp8_get16x16var_c);
+extern prototype_sad(vp8_get4x4sse_cs_neon);
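+
+/* Each #undef/#define pair below rebinds one slot of the generic variance
+ * RTCD table from its C default to the NEON implementation declared above,
+ * so HAVE_ARMV7 builds reach these routines through the usual
+ * vp8_variance_* macro indirection. */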
+
+#undef  vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_neon
+
+#undef  vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_neon
+
+#undef  vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_neon
+
+#undef  vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_neon
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_neon
+
+//#undef  vp8_variance_var4x4
+//#define vp8_variance_var4x4 vp8_variance4x4_c
+
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_neon
+
+#undef  vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_neon
+
+#undef  vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_neon
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_neon
+
+//#undef  vp8_variance_subpixvar4x4
+//#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c
+
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_neon
+
+//#undef  vp8_variance_subpixvar8x16
+//#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c
+
+//#undef  vp8_variance_subpixvar16x8
+//#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon
+
+//#undef  vp8_variance_getmbss
+//#define vp8_variance_getmbss vp8_get_mb_ss_c
+
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_neon
+
+#undef  vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon
+
+//#undef  vp8_variance_get8x8var
+//#define vp8_variance_get8x8var vp8_get8x8var_c
+
+//#undef  vp8_variance_get16x16var
+//#define vp8_variance_get16x16var vp8_get16x16var_c
+
+#undef  vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
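+
+/* Each #undef/#define pair above re-points one of the generic variance
+ * RTCD hooks (vp8_variance_sad4x4 and friends) at its NEON implementation,
+ * so common code that calls through the variance function table picks up
+ * the ARM version at build time.  A sketch of the effect, assuming
+ * prototype_sad() declares a signature along these lines (the exact
+ * parameter list is owned by variance.h, not this header):
+ *
+ *     // extern prototype_sad(vp8_sad16x16_neon); roughly declares:
+ *     //   unsigned int vp8_sad16x16_neon(unsigned char *src_ptr,
+ *     //                                  int src_stride,
+ *     //                                  unsigned char *ref_ptr,
+ *     //                                  int ref_stride);
+ *
+ * and any subsequent use of vp8_variance_sad16x16 now names that function.
+ */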
+
+#endif
+
+#endif
diff --git a/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
new file mode 100644
index 0000000..8cdf079
--- /dev/null
+++ b/vp8/encoder/arm/vpx_vp8_enc_asm_offsets.c
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include <stddef.h>
+
+#include "../treewriter.h"
+#include "../tokenize.h"
+#include "../onyx_int.h"
+
+#define ct_assert(name,cond) \
+    static void assert_##name(void) UNUSED;\
+    static void assert_##name(void) {switch(0){case 0:case !!(cond):;}}
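+
+/* ct_assert() is a compile-time assertion: when `cond` is false the switch
+ * ends up with two identical `case 0:` labels and the translation unit
+ * fails to compile; when it is true, the function is simply never called.
+ * A hypothetical illustration:
+ *
+ *     ct_assert(short_is_two_bytes, sizeof(short) == 2)  // compiles
+ *     ct_assert(short_is_one_byte,  sizeof(short) == 1)  // duplicate-case
+ *                                                        // compile error
+ */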
+
+#define DEFINE(sym, val) int sym = val;
+
+/*
+#define BLANK() asm volatile("\n->" : : )
+*/
+
+/*
+ * int main(void)
+ * {
+ */
+
+DEFINE(vp8_writer_lowvalue,                     offsetof(vp8_writer, lowvalue));
+DEFINE(vp8_writer_range,                        offsetof(vp8_writer, range));
+DEFINE(vp8_writer_value,                        offsetof(vp8_writer, value));
+DEFINE(vp8_writer_count,                        offsetof(vp8_writer, count));
+DEFINE(vp8_writer_pos,                          offsetof(vp8_writer, pos));
+DEFINE(vp8_writer_buffer,                       offsetof(vp8_writer, buffer));
+
+DEFINE(tokenextra_token,                        offsetof(TOKENEXTRA, Token));
+DEFINE(tokenextra_extra,                        offsetof(TOKENEXTRA, Extra));
+DEFINE(tokenextra_context_tree,                 offsetof(TOKENEXTRA, context_tree));
+DEFINE(tokenextra_skip_eob_node,                offsetof(TOKENEXTRA, skip_eob_node));
+DEFINE(TOKENEXTRA_SZ,                           sizeof(TOKENEXTRA));
+
+DEFINE(vp8_extra_bit_struct_sz,                 sizeof(vp8_extra_bit_struct));
+
+DEFINE(vp8_token_value,                         offsetof(vp8_token, value));
+DEFINE(vp8_token_len,                           offsetof(vp8_token, Len));
+
+DEFINE(vp8_extra_bit_struct_tree,               offsetof(vp8_extra_bit_struct, tree));
+DEFINE(vp8_extra_bit_struct_prob,               offsetof(vp8_extra_bit_struct, prob));
+DEFINE(vp8_extra_bit_struct_prob_bc,            offsetof(vp8_extra_bit_struct, prob_bc));
+DEFINE(vp8_extra_bit_struct_len,                offsetof(vp8_extra_bit_struct, Len));
+DEFINE(vp8_extra_bit_struct_base_val,           offsetof(vp8_extra_bit_struct, base_val));
+
+DEFINE(vp8_comp_tplist,                         offsetof(VP8_COMP, tplist));
+DEFINE(vp8_comp_common,                         offsetof(VP8_COMP, common));
+DEFINE(vp8_comp_bc2,                            offsetof(VP8_COMP, bc2));
+
+DEFINE(tokenlist_start,                         offsetof(TOKENLIST, start));
+DEFINE(tokenlist_stop,                          offsetof(TOKENLIST, stop));
+DEFINE(TOKENLIST_SZ,                            sizeof(TOKENLIST));
+
+DEFINE(vp8_common_mb_rows,                      offsetof(VP8_COMMON, mb_rows));
+
+// These two sizes are used in the ARM token-packing assembly
+// (vp8cx_pack_tokens_armv7 and friends).  They are hard coded, so if
+// either structure changes size the assembly will have to be adjusted.
+ct_assert(TOKENEXTRA_SZ, sizeof(TOKENEXTRA) == 20)
+ct_assert(vp8_extra_bit_struct_sz, sizeof(vp8_extra_bit_struct) == 20)
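+
+/* The DEFINE()d globals above exist only so the build can harvest numeric
+ * struct offsets for the ARM assembly, which cannot evaluate offsetof()
+ * itself.  On the consumer side the assembly addresses fields through
+ * these named constants, e.g. (sketch only; the constant's value is
+ * whatever the compiler reports for this build):
+ *
+ *     ; r0 = vp8_writer *
+ *     ldr     r1, [r0, #vp8_writer_range]    ; load w->range
+ *     str     r1, [r0, #vp8_writer_range]    ; store it back
+ */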
+
+//add asserts for any offset that is not supported by assembly code
+//add asserts for any size that is not supported by assembly code
+/*
+ * return 0;
+ * }
+ */
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
new file mode 100644
index 0000000..31ad56a
--- /dev/null
+++ b/vp8/encoder/bitstream.c
@@ -0,0 +1,1719 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "header.h"
+#include "encodemv.h"
+#include "entropymode.h"
+#include "findnearmv.h"
+#include "mcomp.h"
+#include "systemdependent.h"
+#include <assert.h>
+#include <stdio.h>
+#include "pragmas.h"
+#include "vpx_mem/vpx_mem.h"
+#include "bitstream.h"
+
+const int vp8cx_base_skip_false_prob[128] =
+{
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    255, 255, 255, 255, 255, 255, 255, 255,
+    251, 248, 244, 240, 236, 232, 229, 225,
+    221, 217, 213, 208, 204, 199, 194, 190,
+    187, 183, 179, 175, 172, 168, 164, 160,
+    157, 153, 149, 145, 142, 138, 134, 130,
+    127, 124, 120, 117, 114, 110, 107, 104,
+    101,  98,  95,  92,  89,  86,  83,  80,
+     77,  74,  71,  68,  65,  62,  59,  56,
+     53,  50,  47,  44,  41,  38,  35,  32,
+     30,  28,  26,  24,  22,  20,  18,  16,
+};
+#ifdef VP8REF
+#define __int64 long long
+#endif
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+#endif
+
+#ifdef ENTROPY_STATS
+int intra_mode_stats[10][10][10];
+static unsigned int tree_update_hist [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] [2];
+extern unsigned int active_section;
+#endif
+
+#ifdef MODE_STATS
+int count_mb_seg[4] = { 0, 0, 0, 0 };
+#endif
+
+#if CONFIG_BIG_ENDIAN
+# define make_endian_16(a)  \
+    (((unsigned int)(a & 0xff)) << 8) | (((unsigned int)(a & 0xff00)) >> 8)
+# define make_endian_32(a)                              \
+    (((unsigned int)(a & 0xff)) << 24)    | (((unsigned int)(a & 0xff00)) << 8) |   \
+    (((unsigned int)(a & 0xff0000)) >> 8) | (((unsigned int)(a & 0xff000000)) >> 24)
+#else
+# define make_endian_16(a)  a
+# define make_endian_32(a)  a
+#endif
+
+static void update_mode(
+    vp8_writer *const w,
+    int n,
+    vp8_token tok               [/* n */],
+    vp8_tree tree,
+    vp8_prob Pnew               [/* n-1 */],
+    vp8_prob Pcur               [/* n-1 */],
+    unsigned int bct            [/* n-1 */] [2],
+    const unsigned int num_events[/* n */]
+)
+{
+    unsigned int new_b = 0, old_b = 0;
+    int i = 0;
+
+    vp8_tree_probs_from_distribution(
+        n--, tok, tree,
+        Pnew, bct, num_events,
+        256, 1
+    );
+
+    do
+    {
+        new_b += vp8_cost_branch(bct[i], Pnew[i]);
+        old_b += vp8_cost_branch(bct[i], Pcur[i]);
+    }
+    while (++i < n);
+
+    if (new_b + (n << 8) < old_b)
+    {
+        int i = 0;
+
+        vp8_write_bit(w, 1);
+
+        do
+        {
+            const vp8_prob p = Pnew[i];
+
+            vp8_write_literal(w, Pcur[i] = p ? p : 1, 8);
+        }
+        while (++i < n);
+    }
+    else
+        vp8_write_bit(w, 0);
+}
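+
+/* update_mode() is a send-or-keep decision: the new probabilities are
+ * transmitted only when their accumulated branch cost, plus the fixed
+ * (n << 8) overhead term, undercuts the cost of the currently stored
+ * probabilities; a single flag bit records which way the decision went.
+ * Note the `p ? p : 1` clamp -- zero is not a legal coding probability,
+ * so a computed probability of 0 is sent as 1. */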
+
+static void update_mbintra_mode_probs(VP8_COMP *cpi)
+{
+    VP8_COMMON *const x = & cpi->common;
+
+    vp8_writer *const w = & cpi->bc;
+
+    {
+        vp8_prob Pnew   [VP8_YMODES-1];
+        unsigned int bct [VP8_YMODES-1] [2];
+
+        update_mode(
+            w, VP8_YMODES, vp8_ymode_encodings, vp8_ymode_tree,
+            Pnew, x->fc.ymode_prob, bct, (unsigned int *)cpi->ymode_count
+        );
+    }
+    {
+        vp8_prob Pnew   [VP8_UV_MODES-1];
+        unsigned int bct [VP8_UV_MODES-1] [2];
+
+        update_mode(
+            w, VP8_UV_MODES, vp8_uv_mode_encodings, vp8_uv_mode_tree,
+            Pnew, x->fc.uv_mode_prob, bct, (unsigned int *)cpi->uv_mode_count
+        );
+    }
+}
+
+static void write_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_ymode_tree, p, vp8_ymode_encodings + m);
+}
+
+static void kfwrite_ymode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_kf_ymode_tree, p, vp8_kf_ymode_encodings + m);
+}
+
+static void write_uv_mode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_uv_mode_tree, p, vp8_uv_mode_encodings + m);
+}
+
+
+static void write_bmode(vp8_writer *bc, int m, const vp8_prob *p)
+{
+    vp8_write_token(bc, vp8_bmode_tree, p, vp8_bmode_encodings + m);
+}
+
+static void write_split(vp8_writer *bc, int x)
+{
+    vp8_write_token(
+        bc, vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + x
+    );
+}
+
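+/* norm[x] gives the number of left-shifts needed to renormalize an
+ * arithmetic-coder range x (1..255) so that its top bit is set, i.e.
+ * 7 - floor(log2(x)): norm[1] == 7, norm[2..3] == 6, ..., norm[128..255] == 0. */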
+static const unsigned int norm[256] =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+static void pack_tokens_c(vp8_writer *w, const TOKENEXTRA *p, int xcount)
+{
+    const TOKENEXTRA *const stop = p + xcount;
+    unsigned int split;
+    unsigned int shift;
+    int count = w->count;
+    unsigned int range = w->range;
+    unsigned int lowvalue = w->lowvalue;
+
+    while (p < stop)
+    {
+        const int t = p->Token;
+        vp8_token *const a = vp8_coef_encodings + t;
+        const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+        int i = 0;
+        const unsigned char *pp = p->context_tree;
+        int v = a->value;
+        int n = a->Len;
+
+        if (p->skip_eob_node)
+        {
+            n--;
+            i = 2;
+        }
+
+        do
+        {
+            const int bb = (v >> --n) & 1;
+            split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+            i = vp8_coef_tree[i+bb];
+
+            if (bb)
+            {
+                lowvalue += split;
+                range = range - split;
+            }
+            else
+            {
+                range = split;
+            }
+
+            shift = norm[range];
+            range <<= shift;
+            count += shift;
+
+            if (count >= 0)
+            {
+                int offset = shift - count;
+
+                if ((lowvalue << (offset - 1)) & 0x80000000)
+                {
+                    int x = w->pos - 1;
+
+                    while (x >= 0 && w->buffer[x] == 0xff)
+                    {
+                        w->buffer[x] = (unsigned char)0;
+                        x--;
+                    }
+
+                    w->buffer[x] += 1;
+                }
+
+                w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                lowvalue <<= offset;
+                shift = count;
+                lowvalue &= 0xffffff;
+                count -= 8;
+            }
+
+            lowvalue <<= shift;
+        }
+        while (n);
+
+
+        if (b->base_val)
+        {
+            const int e = p->Extra, L = b->Len;
+
+            if (L)
+            {
+                const unsigned char *pp = b->prob;
+                int v = e >> 1;
+                int n = L;              /* number of bits in v, assumed nonzero */
+                int i = 0;
+
+                do
+                {
+                    const int bb = (v >> --n) & 1;
+                    split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                    i = b->tree[i+bb];
+
+                    if (bb)
+                    {
+                        lowvalue += split;
+                        range = range - split;
+                    }
+                    else
+                    {
+                        range = split;
+                    }
+
+                    shift = norm[range];
+                    range <<= shift;
+                    count += shift;
+
+                    if (count >= 0)
+                    {
+                        int offset = shift - count;
+
+                        if ((lowvalue << (offset - 1)) & 0x80000000)
+                        {
+                            int x = w->pos - 1;
+
+                            while (x >= 0 && w->buffer[x] == 0xff)
+                            {
+                                w->buffer[x] = (unsigned char)0;
+                                x--;
+                            }
+
+                            w->buffer[x] += 1;
+                        }
+
+                        w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                        lowvalue <<= offset;
+                        shift = count;
+                        lowvalue &= 0xffffff;
+                        count -= 8;
+                    }
+
+                    lowvalue <<= shift;
+                }
+                while (n);
+            }
+
+
+            {
+
+                split = (range + 1) >> 1;
+
+                if (e & 1)
+                {
+                    lowvalue += split;
+                    range = range - split;
+                }
+                else
+                {
+                    range = split;
+                }
+
+                range <<= 1;
+
+                if ((lowvalue & 0x80000000))
+                {
+                    int x = w->pos - 1;
+
+                    while (x >= 0 && w->buffer[x] == 0xff)
+                    {
+                        w->buffer[x] = (unsigned char)0;
+                        x--;
+                    }
+
+                    w->buffer[x] += 1;
+
+                }
+
+                lowvalue  <<= 1;
+
+                if (!++count)
+                {
+                    count = -8;
+                    w->buffer[w->pos++] = (lowvalue >> 24);
+                    lowvalue &= 0xffffff;
+                }
+            }
+
+        }
+
+        ++p;
+    }
+
+    w->count = count;
+    w->lowvalue = lowvalue;
+    w->range = range;
+
+}
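+
+/* pack_tokens_c() above is a hand-inlined boolean arithmetic coder: each
+ * tree branch splits the current [lowvalue, lowvalue + range) interval at
+ * a point proportional to the branch probability, renormalizes with the
+ * norm[] table, and propagates carries backwards through any 0xff bytes
+ * already in the buffer.  The same inner loop is repeated verbatim in the
+ * partition and mb-row packers below, apparently to keep the hot path
+ * inlined.  A compact sketch of one bool-encode step (illustration only;
+ * the canonical writer is vp8_encode_bool()):
+ *
+ *     split = 1 + (((range - 1) * prob) >> 8);
+ *     if (bit)
+ *     {
+ *         lowvalue += split;
+ *         range -= split;
+ *     }
+ *     else
+ *         range = split;
+ *
+ *     shift = norm[range];    // bring range back to >= 128
+ *     range <<= shift;
+ *     count += shift;         // once 8 bits accumulate, a byte is flushed
+ *     lowvalue <<= shift;
+ */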
+
+static void write_partition_size(unsigned char *cx_data, int size)
+{
+    signed char csize;
+
+    csize = size & 0xff;
+    *cx_data = csize;
+    csize = (size >> 8) & 0xff;
+    *(cx_data + 1) = csize;
+    csize = (size >> 16) & 0xff;
+    *(cx_data + 2) = csize;
+
+}
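+
+/* Partition sizes are emitted as 3 little-endian bytes, so e.g. a size of
+ * 0x012345 bytes is written as the byte sequence 0x45, 0x23, 0x01. */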
+
+static void pack_tokens_into_partitions_c(VP8_COMP *cpi, unsigned char *cx_data, int num_part, int *size)
+{
+
+    int i;
+    unsigned char *ptr = cx_data;
+    unsigned int shift;
+    vp8_writer *w = &cpi->bc2;
+    *size = 3 * (num_part - 1);
+    ptr = cx_data + (*size);
+
+    for (i = 0; i < num_part; i++)
+    {
+        vp8_start_encode(w, ptr);
+        {
+            unsigned int split;
+            int count = w->count;
+            unsigned int range = w->range;
+            unsigned int lowvalue = w->lowvalue;
+            int mb_row;
+
+            for (mb_row = i; mb_row < cpi->common.mb_rows; mb_row += num_part)
+            {
+                TOKENEXTRA *p    = cpi->tplist[mb_row].start;
+                TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+
+                while (p < stop)
+                {
+                    const int t = p->Token;
+                    vp8_token *const a = vp8_coef_encodings + t;
+                    const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+                    int i = 0;
+                    const unsigned char *pp = p->context_tree;
+                    int v = a->value;
+                    int n = a->Len;
+
+                    if (p->skip_eob_node)
+                    {
+                        n--;
+                        i = 2;
+                    }
+
+                    do
+                    {
+                        const int bb = (v >> --n) & 1;
+                        split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                        i = vp8_coef_tree[i+bb];
+
+                        if (bb)
+                        {
+                            lowvalue += split;
+                            range = range - split;
+                        }
+                        else
+                        {
+                            range = split;
+                        }
+
+                        shift = norm[range];
+                        range <<= shift;
+                        count += shift;
+
+                        if (count >= 0)
+                        {
+                            int offset = shift - count;
+
+                            if ((lowvalue << (offset - 1)) & 0x80000000)
+                            {
+                                int x = w->pos - 1;
+
+                                while (x >= 0 && w->buffer[x] == 0xff)
+                                {
+                                    w->buffer[x] = (unsigned char)0;
+                                    x--;
+                                }
+
+                                w->buffer[x] += 1;
+                            }
+
+                            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                            lowvalue <<= offset;
+                            shift = count;
+                            lowvalue &= 0xffffff;
+                            count -= 8;
+                        }
+
+                        lowvalue <<= shift;
+                    }
+                    while (n);
+
+
+                    if (b->base_val)
+                    {
+                        const int e = p->Extra, L = b->Len;
+
+                        if (L)
+                        {
+                            const unsigned char *pp = b->prob;
+                            int v = e >> 1;
+                            int n = L;              /* number of bits in v, assumed nonzero */
+                            int i = 0;
+
+                            do
+                            {
+                                const int bb = (v >> --n) & 1;
+                                split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                                i = b->tree[i+bb];
+
+                                if (bb)
+                                {
+                                    lowvalue += split;
+                                    range = range - split;
+                                }
+                                else
+                                {
+                                    range = split;
+                                }
+
+                                shift = norm[range];
+                                range <<= shift;
+                                count += shift;
+
+                                if (count >= 0)
+                                {
+                                    int offset = shift - count;
+
+                                    if ((lowvalue << (offset - 1)) & 0x80000000)
+                                    {
+                                        int x = w->pos - 1;
+
+                                        while (x >= 0 && w->buffer[x] == 0xff)
+                                        {
+                                            w->buffer[x] = (unsigned char)0;
+                                            x--;
+                                        }
+
+                                        w->buffer[x] += 1;
+                                    }
+
+                                    w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                                    lowvalue <<= offset;
+                                    shift = count;
+                                    lowvalue &= 0xffffff;
+                                    count -= 8;
+                                }
+
+                                lowvalue <<= shift;
+                            }
+                            while (n);
+                        }
+
+                        {
+                            split = (range + 1) >> 1;
+
+                            if (e & 1)
+                            {
+                                lowvalue += split;
+                                range = range - split;
+                            }
+                            else
+                            {
+                                range = split;
+                            }
+
+                            range <<= 1;
+
+                            if ((lowvalue & 0x80000000))
+                            {
+                                int x = w->pos - 1;
+
+                                while (x >= 0 && w->buffer[x] == 0xff)
+                                {
+                                    w->buffer[x] = (unsigned char)0;
+                                    x--;
+                                }
+
+                                w->buffer[x] += 1;
+
+                            }
+
+                            lowvalue  <<= 1;
+
+                            if (!++count)
+                            {
+                                count = -8;
+                                w->buffer[w->pos++] = (lowvalue >> 24);
+                                lowvalue &= 0xffffff;
+                            }
+                        }
+
+                    }
+
+                    ++p;
+                }
+            }
+
+            w->count    = count;
+            w->lowvalue = lowvalue;
+            w->range    = range;
+
+        }
+
+        vp8_stop_encode(w);
+        *size +=   w->pos;
+
+        if (i < (num_part - 1))
+        {
+            write_partition_size(cx_data, w->pos);
+            cx_data += 3;
+            ptr += w->pos;
+        }
+    }
+}
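+
+/* Macroblock rows are interleaved across partitions: partition i carries
+ * rows i, i + num_part, i + 2 * num_part, ...  With num_part == 4 and
+ * 9 mb rows, partition 0 gets rows 0, 4 and 8, while partition 3 gets
+ * rows 3 and 7.  The sizes of all but the final partition are written
+ * into the 3 * (num_part - 1) byte table reserved at the front of
+ * cx_data. */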
+
+
+static void pack_mb_row_tokens_c(VP8_COMP *cpi, vp8_writer *w)
+{
+
+    unsigned int split;
+    int count = w->count;
+    unsigned int range = w->range;
+    unsigned int lowvalue = w->lowvalue;
+    unsigned int shift;
+    int mb_row;
+
+    for (mb_row = 0; mb_row < cpi->common.mb_rows; mb_row++)
+    {
+        TOKENEXTRA *p    = cpi->tplist[mb_row].start;
+        TOKENEXTRA *stop = cpi->tplist[mb_row].stop;
+
+        while (p < stop)
+        {
+            const int t = p->Token;
+            vp8_token *const a = vp8_coef_encodings + t;
+            const vp8_extra_bit_struct *const b = vp8_extra_bits + t;
+            int i = 0;
+            const unsigned char *pp = p->context_tree;
+            int v = a->value;
+            int n = a->Len;
+
+            if (p->skip_eob_node)
+            {
+                n--;
+                i = 2;
+            }
+
+            do
+            {
+                const int bb = (v >> --n) & 1;
+                split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                i = vp8_coef_tree[i+bb];
+
+                if (bb)
+                {
+                    lowvalue += split;
+                    range = range - split;
+                }
+                else
+                {
+                    range = split;
+                }
+
+                shift = norm[range];
+                range <<= shift;
+                count += shift;
+
+                if (count >= 0)
+                {
+                    int offset = shift - count;
+
+                    if ((lowvalue << (offset - 1)) & 0x80000000)
+                    {
+                        int x = w->pos - 1;
+
+                        while (x >= 0 && w->buffer[x] == 0xff)
+                        {
+                            w->buffer[x] = (unsigned char)0;
+                            x--;
+                        }
+
+                        w->buffer[x] += 1;
+                    }
+
+                    w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                    lowvalue <<= offset;
+                    shift = count;
+                    lowvalue &= 0xffffff;
+                    count -= 8;
+                }
+
+                lowvalue <<= shift;
+            }
+            while (n);
+
+
+            if (b->base_val)
+            {
+                const int e = p->Extra, L = b->Len;
+
+                if (L)
+                {
+                    const unsigned char *pp = b->prob;
+                    int v = e >> 1;
+                    int n = L;              /* number of bits in v, assumed nonzero */
+                    int i = 0;
+
+                    do
+                    {
+                        const int bb = (v >> --n) & 1;
+                        split = 1 + (((range - 1) * pp[i>>1]) >> 8);
+                        i = b->tree[i+bb];
+
+                        if (bb)
+                        {
+                            lowvalue += split;
+                            range = range - split;
+                        }
+                        else
+                        {
+                            range = split;
+                        }
+
+                        shift = norm[range];
+                        range <<= shift;
+                        count += shift;
+
+                        if (count >= 0)
+                        {
+                            int offset = shift - count;
+
+                            if ((lowvalue << (offset - 1)) & 0x80000000)
+                            {
+                                int x = w->pos - 1;
+
+                                while (x >= 0 && w->buffer[x] == 0xff)
+                                {
+                                    w->buffer[x] = (unsigned char)0;
+                                    x--;
+                                }
+
+                                w->buffer[x] += 1;
+                            }
+
+                            w->buffer[w->pos++] = (lowvalue >> (24 - offset));
+                            lowvalue <<= offset;
+                            shift = count;
+                            lowvalue &= 0xffffff;
+                            count -= 8;
+                        }
+
+                        lowvalue <<= shift;
+                    }
+                    while (n);
+                }
+
+                {
+                    split = (range + 1) >> 1;
+
+                    if (e & 1)
+                    {
+                        lowvalue += split;
+                        range = range - split;
+                    }
+                    else
+                    {
+                        range = split;
+                    }
+
+                    range <<= 1;
+
+                    if ((lowvalue & 0x80000000))
+                    {
+                        int x = w->pos - 1;
+
+                        while (x >= 0 && w->buffer[x] == 0xff)
+                        {
+                            w->buffer[x] = (unsigned char)0;
+                            x--;
+                        }
+
+                        w->buffer[x] += 1;
+
+                    }
+
+                    lowvalue  <<= 1;
+
+                    if (!++count)
+                    {
+                        count = -8;
+                        w->buffer[w->pos++] = (lowvalue >> 24);
+                        lowvalue &= 0xffffff;
+                    }
+                }
+
+            }
+
+            ++p;
+        }
+    }
+
+    w->count = count;
+    w->lowvalue = lowvalue;
+    w->range = range;
+
+}
+
+static void write_mv_ref
+(
+    vp8_writer *w, MB_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+
+    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+
+    vp8_write_token(w, vp8_mv_ref_tree, p, VP8_MVREFENCODINGS + m);
+}
+
+static void write_sub_mv_ref
+(
+    vp8_writer *w, B_PREDICTION_MODE m, const vp8_prob *p
+)
+{
+    assert(LEFT4X4 <= m  &&  m <= NEW4X4);
+
+    vp8_write_token(w, vp8_sub_mv_ref_tree, p, VP8_SUBMVREFENCODINGS + m);
+}
+
+static void write_mv
+(
+    vp8_writer *w, const MV *mv, const MV *ref, const MV_CONTEXT *mvc
+)
+{
+    MV e;
+    e.row = mv->row - ref->row;
+    e.col = mv->col - ref->col;
+
+    vp8_encode_motion_vector(w, &e, mvc);
+}
+
+static void write_mb_features(vp8_writer *w, const MB_MODE_INFO *mi, const MACROBLOCKD *x)
+{
+    // Encode the MB segment id.
+    if (x->segmentation_enabled && x->update_mb_segmentation_map)
+    {
+        switch (mi->segment_id)
+        {
+        case 0:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+            break;
+        case 1:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 1, x->mb_segment_tree_probs[1]);
+            break;
+        case 2:
+            vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[2]);
+            break;
+        case 3:
+            vp8_write(w, 1, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 1, x->mb_segment_tree_probs[2]);
+            break;
+
+            // TRAP: this should not happen; code segment 0 as a safe fallback
+        default:
+            vp8_write(w, 0, x->mb_segment_tree_probs[0]);
+            vp8_write(w, 0, x->mb_segment_tree_probs[1]);
+            break;
+        }
+    }
+}
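+
+/* The segment id is coded as a 2-bit value with a small tree, one
+ * probability per internal node:
+ *
+ *                 probs[0]
+ *                0/      \1
+ *         probs[1]        probs[2]
+ *         0/    \1        0/    \1
+ *       id 0   id 1     id 2   id 3
+ */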
+
+
+static void pack_inter_mode_mvs(VP8_COMP *const cpi)
+{
+    VP8_COMMON *const pc = & cpi->common;
+    vp8_writer *const w = & cpi->bc;
+    const MV_CONTEXT *mvc = pc->fc.mvc;
+
+    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+    MODE_INFO *m = pc->mi, *ms;
+    const int mis = pc->mode_info_stride;
+    int mb_row = -1;
+
+    int prob_last_coded;
+    int prob_gf_coded;
+    int prob_skip_false = 0;
+    ms = pc->mi - 1;
+
+    // Calculate the probabilities to be used to code the reference frame, based on actual usage in this frame
+    if (!(cpi->prob_intra_coded = rf_intra * 255 / (rf_intra + rf_inter)))
+        cpi->prob_intra_coded = 1;
+
+    prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+    if (!prob_last_coded)
+        prob_last_coded = 1;
+
+    prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                    ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+    if (!prob_gf_coded)
+        prob_gf_coded = 1;
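+
+    // Worked example of the clamping above: with rf_intra == 50 and
+    // rf_inter == 150, prob_intra_coded = 50 * 255 / 200 = 63.  A computed
+    // value of 0 is clamped to 1, since 0 is not a legal coding probability.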
+
+
+#ifdef ENTROPY_STATS
+    active_section = 1;
+#endif
+
+    if (pc->mb_no_coeff_skip)
+    {
+        prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+
+        if (prob_skip_false <= 1)
+            prob_skip_false = 1;
+
+        if (prob_skip_false > 255)
+            prob_skip_false = 255;
+
+        cpi->prob_skip_false = prob_skip_false;
+        vp8_write_literal(w, prob_skip_false, 8);
+    }
+
+    vp8_write_literal(w, cpi->prob_intra_coded, 8);
+    vp8_write_literal(w, prob_last_coded, 8);
+    vp8_write_literal(w, prob_gf_coded, 8);
+
+    update_mbintra_mode_probs(cpi);
+
+    vp8_write_mvprobs(cpi);
+
+    while (++mb_row < pc->mb_rows)
+    {
+        int mb_col = -1;
+
+        while (++mb_col < pc->mb_cols)
+        {
+            const MB_MODE_INFO *const mi = & m->mbmi;
+            const MV_REFERENCE_FRAME rf = mi->ref_frame;
+            const MB_PREDICTION_MODE mode = mi->mode;
+
+            MACROBLOCKD *xd = &cpi->mb.e_mbd;
+
+            // Distance of MB to the various image edges.
+            // These are specified in 1/8th-pel units, as they are always compared to MV values in 1/8th-pel units
+            xd->mb_to_left_edge = -((mb_col * 16) << 3);
+            xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
+            xd->mb_to_top_edge = -((mb_row * 16)) << 3;
+            xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
+
+#ifdef ENTROPY_STATS
+            active_section = 9;
+#endif
+
+            if (cpi->mb.e_mbd.update_mb_segmentation_map)
+                write_mb_features(w, mi, &cpi->mb.e_mbd);
+
+            if (pc->mb_no_coeff_skip)
+                vp8_encode_bool(w, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+            if (rf == INTRA_FRAME)
+            {
+                vp8_write(w, 0, cpi->prob_intra_coded);
+#ifdef ENTROPY_STATS
+                active_section = 6;
+#endif
+                write_ymode(w, mode, pc->fc.ymode_prob);
+
+                if (mode == B_PRED)
+                {
+                    int j = 0;
+
+                    do
+                        write_bmode(w, m->bmi[j].mode, pc->fc.bmode_prob);
+
+                    while (++j < 16);
+                }
+
+                write_uv_mode(w, mi->uv_mode, pc->fc.uv_mode_prob);
+            }
+            else    /* inter coded */
+            {
+                MV best_mv;
+                vp8_prob mv_ref_p [VP8_MVREFS-1];
+
+                vp8_write(w, 1, cpi->prob_intra_coded);
+
+                if (rf == LAST_FRAME)
+                    vp8_write(w, 0, prob_last_coded);
+                else
+                {
+                    vp8_write(w, 1, prob_last_coded);
+                    vp8_write(w, (rf == GOLDEN_FRAME) ? 0 : 1, prob_gf_coded);
+                }
+
+                {
+                    MV n1, n2;
+                    int ct[4];
+
+                    vp8_find_near_mvs(xd, m, &n1, &n2, &best_mv, ct, rf, cpi->common.ref_frame_sign_bias);
+                    vp8_mv_ref_probs(mv_ref_p, ct);
+
+#ifdef ENTROPY_STATS
+                    accum_mv_refs(mode, ct);
+#endif
+
+                }
+
+#ifdef ENTROPY_STATS
+                active_section = 3;
+#endif
+
+                write_mv_ref(w, mode, mv_ref_p);
+
+                switch (mode)   /* new, split require MVs */
+                {
+                case NEWMV:
+
+#ifdef ENTROPY_STATS
+                    active_section = 5;
+#endif
+
+                    write_mv(w, &mi->mv.as_mv, &best_mv, mvc);
+                    break;
+
+                case SPLITMV:
+                {
+                    int j = 0;
+
+#ifdef MODE_STATS
+                    ++count_mb_seg [mi->partitioning];
+#endif
+
+                    write_split(w, mi->partitioning);
+
+                    do
+                    {
+                        const B_MODE_INFO *const b = mi->partition_bmi + j;
+                        const int *const  L = vp8_mbsplits [mi->partitioning];
+                        int k = -1;  /* first block in subset j */
+                        int mv_contz;
+
+                        while (j != L[++k])
+                            if (k >= 16)
+                                assert(0);
+
+                        mv_contz = vp8_mv_cont
+                                   (&(vp8_left_bmi(m, k)->mv.as_mv),
+                                    &(vp8_above_bmi(m, k, mis)->mv.as_mv));
+                        write_sub_mv_ref(w, b->mode, vp8_sub_mv_ref_prob2 [mv_contz]); //pc->fc.sub_mv_ref_prob);
+
+                        if (b->mode == NEW4X4)
+                        {
+#ifdef ENTROPY_STATS
+                            active_section = 11;
+#endif
+                            write_mv(w, &b->mv.as_mv, &best_mv, (const MV_CONTEXT *) mvc);
+                        }
+                    }
+                    while (++j < mi->partition_count);
+                }
+                break;
+                default:
+                    break;
+                }
+            }
+
+            ++m;
+        }
+
+        ++m;  /* skip L prediction border */
+    }
+}
+
+
+static void write_kfmodes(VP8_COMP *cpi)
+{
+    vp8_writer *const bc = & cpi->bc;
+    const VP8_COMMON *const c = & cpi->common;
+    /* const */
+    MODE_INFO *m = c->mi;
+
+    int mb_row = -1;
+    int prob_skip_false = 0;
+
+    if (c->mb_no_coeff_skip)
+    {
+        prob_skip_false = cpi->skip_false_count * 256 / (cpi->skip_false_count + cpi->skip_true_count);
+
+        if (prob_skip_false <= 1)
+            prob_skip_false = 1;
+
+        if (prob_skip_false >= 255)
+            prob_skip_false = 255;
+
+        cpi->prob_skip_false = prob_skip_false;
+        vp8_write_literal(bc, prob_skip_false, 8);
+    }
+
+    while (++mb_row < c->mb_rows)
+    {
+        int mb_col = -1;
+
+        while (++mb_col < c->mb_cols)
+        {
+            const int ym = m->mbmi.mode;
+
+            if (cpi->mb.e_mbd.update_mb_segmentation_map)
+                write_mb_features(bc, &m->mbmi, &cpi->mb.e_mbd);
+
+            if (c->mb_no_coeff_skip)
+                vp8_encode_bool(bc, m->mbmi.mb_skip_coeff, prob_skip_false);
+
+            kfwrite_ymode(bc, ym, c->kf_ymode_prob);
+
+            if (ym == B_PRED)
+            {
+                const int mis = c->mode_info_stride;
+                int i = 0;
+
+                do
+                {
+                    const B_PREDICTION_MODE A = vp8_above_bmi(m, i, mis)->mode;
+                    const B_PREDICTION_MODE L = vp8_left_bmi(m, i)->mode;
+                    const int bm = m->bmi[i].mode;
+
+#ifdef ENTROPY_STATS
+                    ++intra_mode_stats [A] [L] [bm];
+#endif
+
+                    write_bmode(bc, bm, c->kf_bmode_prob [A] [L]);
+                }
+                while (++i < 16);
+            }
+
+            write_uv_mode(bc, (m++)->mbmi.uv_mode, c->kf_uv_mode_prob);
+        }
+
+        m++;    // skip L prediction border
+    }
+}
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi)
+{
+    int i = 0;
+    int savings = 0;
+
+    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+    int new_intra, new_last, gf_last, oldtotal, newtotal;
+    int ref_frame_cost[MAX_REF_FRAMES];
+
+    vp8_clear_system_state(); //__asm emms;
+
+    if (cpi->common.frame_type != KEY_FRAME)
+    {
+        if (!(new_intra = rf_intra * 255 / (rf_intra + rf_inter)))
+            new_intra = 1;
+
+        new_last = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+        gf_last = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                  ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+        // new costs
+        ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(new_intra);
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(new_intra)
+                                        + vp8_cost_zero(new_last);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(new_intra)
+                                        + vp8_cost_one(new_last)
+                                        + vp8_cost_zero(gf_last);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(new_intra)
+                                        + vp8_cost_one(new_last)
+                                        + vp8_cost_one(gf_last);
+
+        newtotal =
+            rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+            rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+            rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+            rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+
+        // old costs
+        ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_zero(cpi->prob_last_coded);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(cpi->prob_last_coded)
+                                        + vp8_cost_zero(cpi->prob_gf_coded);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(cpi->prob_last_coded)
+                                        + vp8_cost_one(cpi->prob_gf_coded);
+
+        oldtotal =
+            rfct[INTRA_FRAME] * ref_frame_cost[INTRA_FRAME] +
+            rfct[LAST_FRAME] * ref_frame_cost[LAST_FRAME] +
+            rfct[GOLDEN_FRAME] * ref_frame_cost[GOLDEN_FRAME] +
+            rfct[ALTREF_FRAME] * ref_frame_cost[ALTREF_FRAME];
+
+        savings += (oldtotal - newtotal) / 256;
+    }
+
+
+    do
+    {
+        int j = 0;
+
+        do
+        {
+            int k = 0;
+
+            do
+            {
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                //vp8_prob new_p           [vp8_coef_tokens-1];
+                //unsigned int branch_ct   [vp8_coef_tokens-1] [2];
+
+                int t = 0;      /* token/prob index */
+
+                vp8_tree_probs_from_distribution(
+                    vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
+                    cpi->frame_coef_probs [i][j][k], cpi->frame_branch_ct [i][j][k], cpi->coef_counts [i][j][k],
+                    256, 1
+                );
+
+                do
+                {
+                    const unsigned int *ct  = cpi->frame_branch_ct [i][j][k][t];
+                    const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+                    const vp8_prob old = cpi->common.fc.coef_probs [i][j][k][t];
+                    const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+                    const int old_b = vp8_cost_branch(ct, old);
+                    const int new_b = vp8_cost_branch(ct, newp);
+
+                    const int update_b = 8 +
+                                         ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+                    const int s = old_b - new_b - update_b;
+
+                    if (s > 0)
+                        savings += s;
+
+
+                }
+                while (++t < vp8_coef_tokens - 1);
+
+
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+
+    return savings;
+}
+
+static void update_coef_probs(VP8_COMP *cpi)
+{
+    int i = 0;
+    vp8_writer *const w = & cpi->bc;
+    int savings = 0;
+
+    vp8_clear_system_state(); //__asm emms;
+
+
+    do
+    {
+        int j = 0;
+
+        do
+        {
+            int k = 0;
+
+            do
+            {
+                //note: use result from vp8_estimate_entropy_savings, so no need to call vp8_tree_probs_from_distribution here.
+                /* at every context */
+
+                /* calc probs and branch cts for this frame only */
+                //vp8_prob new_p           [vp8_coef_tokens-1];
+                //unsigned int branch_ct   [vp8_coef_tokens-1] [2];
+
+                int t = 0;      /* token/prob index */
+
+                //vp8_tree_probs_from_distribution(
+                //    vp8_coef_tokens, vp8_coef_encodings, vp8_coef_tree,
+                //    new_p, branch_ct, (unsigned int *)cpi->coef_counts [i][j][k],
+                //    256, 1
+                //    );
+
+                do
+                {
+                    const unsigned int *ct  = cpi->frame_branch_ct [i][j][k][t];
+                    const vp8_prob newp = cpi->frame_coef_probs [i][j][k][t];
+
+                    vp8_prob *Pold = cpi->common.fc.coef_probs [i][j][k] + t;
+                    const vp8_prob old = *Pold;
+                    const vp8_prob upd = vp8_coef_update_probs [i][j][k][t];
+
+                    const int old_b = vp8_cost_branch(ct, old);
+                    const int new_b = vp8_cost_branch(ct, newp);
+
+                    const int update_b = 8 +
+                                         ((vp8_cost_one(upd) - vp8_cost_zero(upd)) >> 8);
+
+                    const int s = old_b - new_b - update_b;
+                    const int u = s > 0 ? 1 : 0;
+
+                    vp8_write(w, u, upd);
+
+
+#ifdef ENTROPY_STATS
+                    ++ tree_update_hist [i][j][k][t] [u];
+#endif
+
+                    if (u)
+                    {
+                        /* send/use new probability */
+
+                        *Pold = newp;
+                        vp8_write_literal(w, newp, 8);
+
+                        savings += s;
+
+                    }
+
+                }
+                while (++t < vp8_coef_tokens - 1);
+
+                /* Accum token counts for generation of default statistics */
+#ifdef ENTROPY_STATS
+                t = 0;
+
+                do
+                {
+                    context_counters [i][j][k][t] += cpi->coef_counts [i][j][k][t];
+                }
+                while (++t < vp8_coef_tokens);
+
+#endif
+
+            }
+            while (++k < PREV_COEF_CONTEXTS);
+        }
+        while (++j < COEF_BANDS);
+    }
+    while (++i < BLOCK_TYPES);
+
+}
+#ifdef PACKET_TESTING
+FILE *vpxlogc = 0;
+#endif
+
+static void put_delta_q(vp8_writer *bc, int delta_q)
+{
+    if (delta_q != 0)
+    {
+        vp8_write_bit(bc, 1);
+        vp8_write_literal(bc, abs(delta_q), 4);
+
+        if (delta_q < 0)
+            vp8_write_bit(bc, 1);
+        else
+            vp8_write_bit(bc, 0);
+    }
+    else
+        vp8_write_bit(bc, 0);
+}
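+
+/* Delta-q values are sent as: a one-bit "present" flag, then a 4-bit
+ * magnitude, then a sign bit (1 = negative).  For example delta_q == -3
+ * is coded as 1, 0011, 1, and delta_q == 0 as the single bit 0. */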
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size)
+{
+    int i, j;
+    VP8_HEADER oh;
+    VP8_COMMON *const pc = & cpi->common;
+    vp8_writer *const bc = & cpi->bc;
+    MACROBLOCKD *const xd = & cpi->mb.e_mbd;
+    int extra_bytes_packed = 0;
+
+    unsigned char *cx_data = dest;
+    const int *mb_feature_data_bits;
+
+    oh.show_frame = (int) pc->show_frame;
+    oh.type = (int)pc->frame_type;
+    oh.version = pc->version;
+
+    mb_feature_data_bits = vp8_mb_feature_data_bits;
+    cx_data += 3;
+
+#if defined(SECTIONBITS_OUTPUT)
+    Sectionbits[active_section = 1] += sizeof(VP8_HEADER) * 8 * 256;
+#endif
+
+    // vp8_kf_default_bmode_probs() is called in vp8_setup_key_frame() once
+    // per key frame, before the frame is encoded.  pc->kf_bmode_prob does
+    // not change anywhere else, so there is no need to call it again here. --yw
+    // vp8_kf_default_bmode_probs( pc->kf_bmode_prob);
+
+    // Every key frame sends the start code, width, height, scale factors, clamp type and color type
+    if (oh.type == KEY_FRAME)
+    {
+        int w, h, hs, vs;
+
+        // Start / synch code
+        cx_data[0] = 0x9D;
+        cx_data[1] = 0x01;
+        cx_data[2] = 0x2a;
+
+        *((unsigned short *)(cx_data + 3)) = make_endian_16((pc->horiz_scale << 14) | pc->Width);
+        *((unsigned short *)(cx_data + 5)) = make_endian_16((pc->vert_scale << 14) | pc->Height);
+
+        extra_bytes_packed = 7;
+        cx_data += extra_bytes_packed;
+
+        vp8_start_encode(bc, cx_data);
+
+        // Signal the color type and clamping type
+        vp8_write_bit(bc, pc->clr_type);
+        vp8_write_bit(bc, pc->clamp_type);
+
+    }
+    else
+        vp8_start_encode(bc, cx_data);
+
+
+    // Signal whether or not Segmentation is enabled
+    vp8_write_bit(bc, (xd->segmentation_enabled) ? 1 : 0);
+
+    // Indicate which features are enabled
+    if (xd->segmentation_enabled)
+    {
+        // Signal whether or not the segmentation map is being updated.
+        vp8_write_bit(bc, (xd->update_mb_segmentation_map) ? 1 : 0);
+        vp8_write_bit(bc, (xd->update_mb_segmentation_data) ? 1 : 0);
+
+        if (xd->update_mb_segmentation_data)
+        {
+            signed char Data;
+
+            vp8_write_bit(bc, (xd->mb_segement_abs_delta) ? 1 : 0);
+
+            // For each segmentation feature (Quant and loop filter level)
+            for (i = 0; i < MB_LVL_MAX; i++)
+            {
+                // For each of the segments
+                for (j = 0; j < MAX_MB_SEGMENTS; j++)
+                {
+                    Data = xd->segment_feature_data[i][j];
+
+                    // Frame level data
+                    if (Data)
+                    {
+                        vp8_write_bit(bc, 1);
+
+                        if (Data < 0)
+                        {
+                            Data = - Data;
+                            vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+                            vp8_write_bit(bc, 1);
+                        }
+                        else
+                        {
+                            vp8_write_literal(bc, Data, mb_feature_data_bits[i]);
+                            vp8_write_bit(bc, 0);
+                        }
+                    }
+                    else
+                        vp8_write_bit(bc, 0);
+                }
+            }
+        }
+
+        if (xd->update_mb_segmentation_map)
+        {
+            // Write the probs used to decode the segment id for each macro block.
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i++)
+            {
+                int Data = xd->mb_segment_tree_probs[i];
+
+                if (Data != 255)
+                {
+                    vp8_write_bit(bc, 1);
+                    vp8_write_literal(bc, Data, 8);
+                }
+                else
+                    vp8_write_bit(bc, 0);
+            }
+        }
+    }
+
+    // Encode the loop filter type, level and sharpness
+    vp8_write_bit(bc, pc->filter_type);
+    vp8_write_literal(bc, pc->filter_level, 6);
+    vp8_write_literal(bc, pc->sharpness_level, 3);
+
+    // Write out loop filter deltas applied at the MB level based on mode or ref frame (if they are enabled).
+    vp8_write_bit(bc, (xd->mode_ref_lf_delta_enabled) ? 1 : 0);
+
+    if (xd->mode_ref_lf_delta_enabled)
+    {
+        // Do the deltas need to be updated
+        vp8_write_bit(bc, (xd->mode_ref_lf_delta_update) ? 1 : 0);
+
+        if (xd->mode_ref_lf_delta_update)
+        {
+            int Data;
+
+            // Send update
+            for (i = 0; i < MAX_REF_LF_DELTAS; i++)
+            {
+                Data = xd->ref_lf_deltas[i];
+
+                // Frame level data
+                if (Data)
+                {
+                    vp8_write_bit(bc, 1);
+
+                    if (Data > 0)
+                    {
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 0);    // sign
+                    }
+                    else
+                    {
+                        Data = -Data;
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 1);    // sign
+                    }
+                }
+                else
+                    vp8_write_bit(bc, 0);
+            }
+
+            // Send update
+            for (i = 0; i < MAX_MODE_LF_DELTAS; i++)
+            {
+                Data = xd->mode_lf_deltas[i];
+
+                if (Data)
+                {
+                    vp8_write_bit(bc, 1);
+
+                    if (Data > 0)
+                    {
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 0);    // sign
+                    }
+                    else
+                    {
+                        Data = -Data;
+                        vp8_write_literal(bc, (Data & 0x3F), 6);
+                        vp8_write_bit(bc, 1);    // sign
+                    }
+                }
+                else
+                    vp8_write_bit(bc, 0);
+            }
+        }
+    }
+
+    // Signal the token partitioning in use (log2 of the number of partitions)
+    vp8_write_literal(bc, pc->multi_token_partition, 2);
+
+    // Frame base quantizer index (Qbaseline)
+    vp8_write_literal(bc, pc->base_qindex, 7);
+
+    // Transmit DC, second-order and UV quantizer delta information
+    put_delta_q(bc, pc->y1dc_delta_q);
+    put_delta_q(bc, pc->y2dc_delta_q);
+    put_delta_q(bc, pc->y2ac_delta_q);
+    put_delta_q(bc, pc->uvdc_delta_q);
+    put_delta_q(bc, pc->uvac_delta_q);
+
+    // When there is a key frame, all reference buffers are updated using the new key frame
+    if (pc->frame_type != KEY_FRAME)
+    {
+        // Should the GF or ARF be updated using the transmitted frame or buffer
+        vp8_write_bit(bc, pc->refresh_golden_frame);
+        vp8_write_bit(bc, pc->refresh_alt_ref_frame);
+
+        // If not being updated from the current frame, should either the GF or ARF be updated from another buffer?
+        if (!pc->refresh_golden_frame)
+            vp8_write_literal(bc, pc->copy_buffer_to_gf, 2);
+
+        if (!pc->refresh_alt_ref_frame)
+            vp8_write_literal(bc, pc->copy_buffer_to_arf, 2);
+
+        // Indicate reference frame sign bias for Golden and ARF frames (always 0 for last frame buffer)
+        vp8_write_bit(bc, pc->ref_frame_sign_bias[GOLDEN_FRAME]);
+        vp8_write_bit(bc, pc->ref_frame_sign_bias[ALTREF_FRAME]);
+    }
+
+    vp8_write_bit(bc, pc->refresh_entropy_probs);
+
+    if (pc->frame_type != KEY_FRAME)
+        vp8_write_bit(bc, pc->refresh_last_frame);
+
+#ifdef ENTROPY_STATS
+
+    if (pc->frame_type == INTER_FRAME)
+        active_section = 0;
+    else
+        active_section = 7;
+
+#endif
+
+    vp8_clear_system_state();  //__asm emms;
+
+    //************************************************
+    // save a copy for later refresh
+    {
+        vpx_memcpy(&cpi->common.lfc, &cpi->common.fc, sizeof(cpi->common.fc));
+    }
+
+    update_coef_probs(cpi);
+
+#ifdef ENTROPY_STATS
+    active_section = 2;
+#endif
+
+    // Write out the mb_no_coeff_skip flag
+    vp8_write_bit(bc, pc->mb_no_coeff_skip);
+
+    if (pc->frame_type == KEY_FRAME)
+    {
+        write_kfmodes(cpi);
+
+#ifdef ENTROPY_STATS
+        active_section = 8;
+#endif
+    }
+    else
+    {
+        pack_inter_mode_mvs(cpi);
+
+#ifdef ENTROPY_STATS
+        active_section = 1;
+#endif
+    }
+
+    vp8_stop_encode(bc);
+
+
+    if (pc->multi_token_partition != ONE_PARTITION)
+    {
+        int num_part;
+        int asize;
+        num_part = 1 << pc->multi_token_partition;
+
+        pack_tokens_into_partitions(cpi, cx_data + bc->pos, num_part, &asize);
+
+        oh.first_partition_length_in_bytes = cpi->bc.pos;
+
+        *size = cpi->bc.pos + VP8_HEADER_SIZE + asize + extra_bytes_packed;
+    }
+    else
+    {
+        vp8_start_encode(&cpi->bc2, cx_data + bc->pos);
+
+        if (!cpi->b_multi_threaded)
+            pack_tokens(&cpi->bc2, cpi->tok, cpi->tok_count);
+        else
+            pack_mb_row_tokens(cpi, &cpi->bc2);
+
+        vp8_stop_encode(&cpi->bc2);
+        oh.first_partition_length_in_bytes = cpi->bc.pos;
+        *size = cpi->bc2.pos + cpi->bc.pos + VP8_HEADER_SIZE + extra_bytes_packed;
+    }
+
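+    /* The three bytes written below form VP8's uncompressed frame tag:
+     * bit 0 = frame type, bits 1-3 = version, bit 4 = show_frame, and
+     * bits 5-23 = length of the first partition in bytes.  The #else
+     * path relies on the little-endian bitfield layout of VP8_HEADER;
+     * the big-endian path builds the word explicitly and byte-swaps it. */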
+#if CONFIG_BIG_ENDIAN
+    {
+        int v = (oh.first_partition_length_in_bytes << 5) |
+                (oh.show_frame << 4) |
+                (oh.version << 1) |
+                oh.type;
+
+        v = make_endian_32(v);
+        vpx_memcpy(dest, &v, 3);
+    }
+#else
+    vpx_memcpy(dest, &oh, 3);
+#endif
+}
+
+#ifdef ENTROPY_STATS
+void print_tree_update_probs()
+{
+    int i, j, k, l;
+    FILE *f = fopen("context.c", "a");
+    int Sum;
+    fprintf(f, "\n/* Update probabilities for token entropy tree. */\n\n");
+    fprintf(f, "const vp8_prob tree_update_probs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1] = {\n");
+
+    for (i = 0; i < BLOCK_TYPES; i++)
+    {
+        fprintf(f, "  { \n");
+
+        for (j = 0; j < COEF_BANDS; j++)
+        {
+            fprintf(f, "    {\n");
+
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+            {
+                fprintf(f, "      {");
+
+                for (l = 0; l < MAX_ENTROPY_TOKENS - 1; l++)
+                {
+                    Sum = tree_update_hist[i][j][k][l][0] + tree_update_hist[i][j][k][l][1];
+
+                    if (Sum > 0)
+                    {
+                        if (((tree_update_hist[i][j][k][l][0] * 255) / Sum) > 0)
+                            fprintf(f, "%3d, ", (int)((tree_update_hist[i][j][k][l][0] * 255) / Sum));
+                        else
+                            fprintf(f, "%3d, ", 1);
+                    }
+                    else
+                        fprintf(f, "%3d, ", 128);
+                }
+
+                fprintf(f, "},\n");
+            }
+
+            fprintf(f, "    },\n");
+        }
+
+        fprintf(f, "  },\n");
+    }
+
+    fprintf(f, "};\n");
+    fclose(f);
+}
+#endif
diff --git a/vp8/encoder/bitstream.h b/vp8/encoder/bitstream.h
new file mode 100644
index 0000000..ee69f66
--- /dev/null
+++ b/vp8/encoder/bitstream.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BITSTREAM_H
+#define __INC_BITSTREAM_H
+
+#if HAVE_ARMV7
+void vp8cx_pack_tokens_armv7(vp8_writer *w, const TOKENEXTRA *p, int xcount,
+                             vp8_token *,
+                             vp8_extra_bit_struct *,
+                             const vp8_tree_index *);
+void vp8cx_pack_tokens_into_partitions_armv7(VP8_COMP *, unsigned char *, int , int *,
+        vp8_token *,
+        vp8_extra_bit_struct *,
+        const vp8_tree_index *);
+void vp8cx_pack_mb_row_tokens_armv7(VP8_COMP *cpi, vp8_writer *w,
+                                    vp8_token *,
+                                    vp8_extra_bit_struct *,
+                                    const vp8_tree_index *);
+# define pack_tokens(a,b,c)                  \
+    vp8cx_pack_tokens_armv7(a,b,c,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_tokens_into_partitions(a,b,c,d)  \
+    vp8cx_pack_tokens_into_partitions_armv7(a,b,c,d,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+# define pack_mb_row_tokens(a,b)               \
+    vp8cx_pack_mb_row_tokens_armv7(a,b,vp8_coef_encodings,vp8_extra_bits,vp8_coef_tree)
+#else
+# define pack_tokens(a,b,c)                  pack_tokens_c(a,b,c)
+# define pack_tokens_into_partitions(a,b,c,d)  pack_tokens_into_partitions_c(a,b,c,d)
+# define pack_mb_row_tokens(a,b)               pack_mb_row_tokens_c(a,b)
+#endif
+#endif
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
new file mode 100644
index 0000000..cc4cbe0
--- /dev/null
+++ b/vp8/encoder/block.h
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_BLOCK_H
+#define __INC_BLOCK_H
+
+#include "onyx.h"
+#include "blockd.h"
+#include "entropymv.h"
+#include "entropy.h"
+#include "vpx_ports/mem.h"
+
+// motion search site
+typedef struct
+{
+    MV mv;
+    int offset;
+} search_site;
+
+typedef struct
+{
+    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+    short *src_diff;
+    short *coeff;
+
+    // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries
+    short(*quant)[4];
+    short(*zbin)[4];
+    short *zrun_zbin_boost;
+    short(*round)[4];
+
+    // Zbin Over Quant value
+    short zbin_extra;
+
+    unsigned char **base_src;
+    int src;
+    int src_stride;
+
+//  MV  enc_mv;
+    int force_empty;
+
+} BLOCK;
+
+typedef struct
+{
+    DECLARE_ALIGNED(16, short, src_diff[400]);       // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+    DECLARE_ALIGNED(16, short, coeff[400]);     // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+
+    // 16 Y blocks, 4 U blocks, 4 V blocks, 1 DC 2nd order block each with 16 entries
+    BLOCK block[25];
+
+    YV12_BUFFER_CONFIG src;
+
+    MACROBLOCKD e_mbd;
+
+    search_site *ss;
+    int ss_count;
+    int searches_per_step;
+
+    int errorperbit;
+    int sadperbit16;
+    int sadperbit4;
+    int errthresh;
+    int rddiv;
+    int rdmult;
+
+    int mvcosts[2][MVvals+1];
+    int *mvcost[2];
+    int mvsadcosts[2][MVvals+1];
+    int *mvsadcost[2];
+    int mbmode_cost[2][MB_MODE_COUNT];
+    int intra_uv_mode_cost[2][MB_MODE_COUNT];
+    unsigned int bmode_costs[10][10][10];
+    unsigned int inter_bmode_costs[B_MODE_COUNT];
+
+    // These define limits to motion vector components to prevent them from extending outside the UMV borders
+    int mv_col_min;
+    int mv_col_max;
+    int mv_row_min;
+    int mv_row_max;
+
+    int vector_range;    // Used to monitor limiting range of recent vectors to guide search.
+    int skip;
+
+    int encode_breakout;
+
+    unsigned char *active_ptr;
+    MV_CONTEXT *mvc;
+
+    unsigned int token_costs[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
+    int optimize;
+
+    void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+    void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+    void (*short_fdct4x4rd)(short *input, short *output, int pitch);
+    void (*short_fdct8x4rd)(short *input, short *output, int pitch);
+    void (*vp8_short_fdct4x4_ptr)(short *input, short *output, int pitch);
+    void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+    void (*quantize_b)(BLOCK *b, BLOCKD *d);
+    void (*quantize_brd)(BLOCK *b, BLOCKD *d);
+
+
+
+} MACROBLOCK;
+
+
+#endif
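
The 25 BLOCK entries above follow VP8's fixed macroblock layout: indices 0-15
are the 4x4 luma subblocks, 16-19 the U chroma blocks, 20-23 the V chroma
blocks, and 24 the second-order (Y2) block holding the 16 luma DC values.
A small illustrative helper (hypothetical, not part of this header):

    /* Illustrative: map a block index to its plane in the 25-block layout. */
    typedef enum { PLANE_Y, PLANE_U, PLANE_V, PLANE_Y2 } block_plane;

    static block_plane block_index_to_plane(int i)
    {
        if (i < 16) return PLANE_Y;   /* 16 4x4 luma blocks      */
        if (i < 20) return PLANE_U;   /* 4 4x4 U blocks          */
        if (i < 24) return PLANE_V;   /* 4 4x4 V blocks          */
        return PLANE_Y2;              /* 1 second-order DC block */
    }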
diff --git a/vp8/encoder/boolhuff.c b/vp8/encoder/boolhuff.c
new file mode 100644
index 0000000..c101384
--- /dev/null
+++ b/vp8/encoder/boolhuff.c
@@ -0,0 +1,147 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "boolhuff.h"
+#include "blockd.h"
+
+
+
+#if defined(SECTIONBITS_OUTPUT)
+unsigned __int64 Sectionbits[500];
+
+#endif
+
+#ifdef ENTROPY_STATS
+unsigned int active_section = 0;
+#endif
+
+const unsigned int vp8_prob_cost[256] =
+{
+    2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161, 1129, 1099, 1072, 1046,
+    1023, 1000,  979,  959,  940,  922,  905,  889,  873,  858,  843,  829,  816,  803,  790,  778,
+    767,  755,  744,  733,  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
+    617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,  534,  528,  522,  516,
+    511,  505,  499,  494,  488,  483,  477,  472,  467,  462,  457,  452,  447,  442,  437,  433,
+    428,  424,  419,  415,  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
+    361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,  317,  314,  311,  307,
+    304,  301,  297,  294,  291,  288,  285,  281,  278,  275,  272,  269,  266,  263,  260,  257,
+    255,  252,  249,  246,  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
+    211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,  181,  179,  177,  174,
+    172,  170,  168,  165,  163,  161,  159,  156,  154,  152,  150,  148,  145,  143,  141,  139,
+    137,  135,  133,  131,  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
+    105,  103,  101,   99,   97,   95,   93,   92,   90,   88,   86,   84,   82,   81,   79,   77,
+    75,   73,   72,   70,   68,   66,   65,   63,   61,   60,   58,   56,   55,   53,   51,   50,
+    48,   46,   45,   43,   41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
+    22,   21,   19,   18,   16,   15,   13,   12,   10,    9,    7,    6,    4,    3,    1,   1
+};
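
Each table entry is approximately -log2(p/256) expressed in 1/256-bit units,
the same "256 rate units to the bit" convention used by the rate estimates
elsewhere in the encoder. A sketch of the derivation (not bit-exact: the
shipped table rounds slightly differently, e.g. 2047 rather than 2048 at p = 1):

    #include <math.h>

    /* Approximate reconstruction of vp8_prob_cost[]. */
    static void build_prob_cost(unsigned int cost[256])
    {
        int p;

        cost[0] = 2047;  /* p = 0 cannot occur; mirrors the p = 1 entry */

        for (p = 1; p < 256; p++)
        {
            unsigned int c = (unsigned int)(-log2((double)p / 256.0) * 256.0);
            cost[p] = c ? c : 1;  /* keep a minimum cost of 1 */
        }
    }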
+
+void vp8_start_encode(BOOL_CODER *br, unsigned char *source)
+{
+
+    br->lowvalue = 0;
+    br->range    = 255;
+    br->value    = 0;
+    br->count    = -24;
+    br->buffer   = source;
+    br->pos      = 0;
+}
+
+void vp8_stop_encode(BOOL_CODER *br)
+{
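+    // Writing 32 more zero bits (at probability 128) flushes every
+    // significant bit still held in lowvalue out to the buffer, so the
+    // decoder, which reads ahead to fill its value window, never needs
+    // data beyond br->pos.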
+    int i;
+
+    for (i = 0; i < 32; i++)
+        vp8_encode_bool(br, 0, 128);
+}
+
+DECLARE_ALIGNED(16, static const unsigned int, norm[256]) =
+{
+    0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+void vp8_encode_bool(BOOL_CODER *br, int bit, int probability)
+{
+    unsigned int split;
+    int count = br->count;
+    unsigned int range = br->range;
+    unsigned int lowvalue = br->lowvalue;
+    register unsigned int shift;
+
+#ifdef ENTROPY_STATS
+#if defined(SECTIONBITS_OUTPUT)
+
+    if (bit)
+        Sectionbits[active_section] += vp8_prob_cost[255-probability];
+    else
+        Sectionbits[active_section] += vp8_prob_cost[probability];
+
+#endif
+#endif
+
+    split = 1 + (((range - 1) * probability) >> 8);
+
+    range = split;
+
+    if (bit)
+    {
+        lowvalue += split;
+        range = br->range - split;
+    }
+
+    shift = norm[range];
+
+    range <<= shift;
+    count += shift;
+
+    if (count >= 0)
+    {
+        int offset = shift - count;
+
+        if ((lowvalue << (offset - 1)) & 0x80000000)
+        {
+            int x = br->pos - 1;
+
+            while (x >= 0 && br->buffer[x] == 0xff)
+            {
+                br->buffer[x] = (unsigned char)0;
+                x--;
+            }
+
+            br->buffer[x] += 1;
+        }
+
+        br->buffer[br->pos++] = (lowvalue >> (24 - offset));
+        lowvalue <<= offset;
+        shift = count;
+        lowvalue &= 0xffffff;
+        count -= 8;
+    }
+
+    lowvalue <<= shift;
+    br->count = count;
+    br->lowvalue = lowvalue;
+    br->range = range;
+}
+
+void vp8_encode_value(BOOL_CODER *br, int data, int bits)
+{
+    int bit;
+
+    for (bit = bits - 1; bit >= 0; bit--)
+        vp8_encode_bool(br, (1 & (data >> bit)), 0x80);
+
+}
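
Putting the pieces together: vp8_start_encode() binds the coder to an output
buffer, vp8_encode_bool() codes one decision against an 8-bit probability of
zero, and vp8_stop_encode() flushes. A minimal usage sketch (buffer size
arbitrary):

    #include "boolhuff.h"

    /* Minimal usage sketch for the boolean coder defined above. */
    void bool_coder_demo(void)
    {
        unsigned char buf[64];
        BOOL_CODER bc;
        int i;

        vp8_start_encode(&bc, buf);

        /* ten 1-bits, each coded with P(zero) = 30/256, so a 1 is cheap */
        for (i = 0; i < 10; i++)
            vp8_encode_bool(&bc, 1, 30);

        vp8_encode_value(&bc, 0x2A, 6);  /* a raw 6-bit literal, MSB first */

        vp8_stop_encode(&bc);
        /* bc.pos now holds the number of bytes written into buf */
    }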
diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h
new file mode 100644
index 0000000..0d929f0
--- /dev/null
+++ b/vp8/encoder/boolhuff.h
@@ -0,0 +1,42 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     boolhuff.h
+*
+*   Description  :     Bool Coder header file.
+*
+****************************************************************************/
+#ifndef __INC_BOOLHUFF_H
+#define __INC_BOOLHUFF_H
+
+
+typedef struct
+{
+    unsigned int lowvalue;
+    unsigned int range;
+    unsigned int value;
+    int count;
+    unsigned int pos;
+    unsigned char *buffer;
+
+    // Variables used to track bit costs without outputting to the bitstream
+    unsigned int  measure_cost;
+    unsigned long bit_counter;
+} BOOL_CODER;
+
+extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer);
+extern void vp8_encode_bool(BOOL_CODER *bc, int x, int context);
+extern void vp8_encode_value(BOOL_CODER *br, int data, int bits);
+extern void vp8_stop_encode(BOOL_CODER *bc);
+extern const unsigned int vp8_prob_cost[256];
+
+#endif
diff --git a/vp8/encoder/dct.c b/vp8/encoder/dct.c
new file mode 100644
index 0000000..5207e39
--- /dev/null
+++ b/vp8/encoder/dct.c
@@ -0,0 +1,223 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <math.h>
+
+
+static const short dct_matrix2[4][4] =
+{
+    { 23170,  30274,  23170, 12540 },
+    { 23170,  12540, -23170, -30274 },
+    { 23170, -12540, -23170, 30274 },
+    { 23170, -30274,  23170, -12540 }
+};
+
+static const short dct_matrix1[4][4] =
+{
+    { 23170,  23170,  23170,  23170 },
+    { 30274,  12540, -12540, -30274 },
+    { 23170, -23170, -23170,  23170 },
+    { 12540, -30274,  30274, -12540 }
+};
+
+
+#define _1STSTAGESHIFT           14
+#define _1STSTAGEROUNDING        (1<<( _1STSTAGESHIFT-1))
+#define _2NDSTAGESHIFT           16
+#define _2NDSTAGEROUNDING        (1<<( _2NDSTAGESHIFT-1))
+
+// using matrix multiply
+void vp8_short_fdct4x4_c(short *input, short *output, int pitch)
+{
+    int i, j, k;
+    short temp[4][4];
+    int sumtemp;
+    pitch >>= 1;
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            sumtemp = 0;
+
+            for (k = 0; k < 4; k++)
+            {
+                sumtemp += input[i*pitch+k] * dct_matrix2[k][j];
+
+            }
+
+            temp[i][j] = (short)((sumtemp + _1STSTAGEROUNDING) >> _1STSTAGESHIFT);
+        }
+    }
+
+
+    for (i = 0; i < 4; i++)
+    {
+        for (j = 0; j < 4; j++)
+        {
+            sumtemp = 0;
+
+            for (k = 0; k < 4; k++)
+            {
+                sumtemp += dct_matrix1[i][ k] * temp[k][ j];
+            }
+
+            output[i*4+j] = (short)((sumtemp + _2NDSTAGEROUNDING) >> _2NDSTAGESHIFT);
+        }
+    }
+
+}
+
+
+void vp8_short_fdct8x4_c(short *input, short *output, int pitch)
+{
+    vp8_short_fdct4x4_c(input,   output,    pitch);
+    vp8_short_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+
+static const signed short x_c1 = 60547;
+static const signed short x_c2 = 46341;
+static const signed short x_c3 = 25080;
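+// Note: these are Q16 fixed-point cosines: 60547 ~= cos(pi/8) * 65536,
+// 46341 ~= cos(pi/4) * 65536, 25080 ~= sin(pi/8) * 65536. The first two do
+// not fit in a signed short; the code below relies on 16-bit two's-complement
+// wraparound (60547 becomes -4989) so that ((x * x_c1) >> 16) + x still
+// evaluates to x * 60547 / 65536. This is implementation-defined behavior.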
+
+void vp8_fast_fdct4x4_c(short *input, short *output, int pitch)
+{
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+
+    short *op = output;
+    int temp1, temp2;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = (ip[0] + ip[3]) * 2;
+        b1 = (ip[1] + ip[2]) * 2;
+        c1 = (ip[1] - ip[2]) * 2;
+        d1 = (ip[0] - ip[3]) * 2;
+
+        temp1 = a1 + b1;
+        temp2 = a1 - b1;
+
+        op[0] = ((temp1 * x_c2) >> 16) + temp1;
+        op[2] = ((temp2 * x_c2) >> 16) + temp2;
+
+        temp1 = (c1 * x_c3) >> 16;
+        temp2 = ((d1 * x_c1) >> 16) + d1;
+
+        op[1] = temp1 + temp2;
+
+        temp1 = (d1 * x_c3) >> 16;
+        temp2 = ((c1 * x_c1) >> 16) + c1;
+
+        op[3] = temp1 - temp2;
+
+        ip += pitch / 2;
+        op += 4;
+    }
+
+    ip = output;
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+
+        temp1 = a1 + b1;
+        temp2 = a1 - b1;
+
+        a2 = ((temp1 * x_c2) >> 16) + temp1;
+        c2 = ((temp2 * x_c2) >> 16) + temp2;
+
+        temp1 = (c1 * x_c3) >> 16;
+        temp2 = ((d1 * x_c1) >> 16) + d1;
+
+        b2 = temp1 + temp2;
+
+        temp1 = (d1 * x_c3) >> 16;
+        temp2 = ((c1 * x_c1) >> 16) + c1;
+
+        d2 = temp1 - temp2;
+
+
+        op[0]   = (a2 + 1) >> 1;
+        op[4]   = (b2 + 1) >> 1;
+        op[8]   = (c2 + 1) >> 1;
+        op[12]  = (d2 + 1) >> 1;
+
+        ip++;
+        op++;
+    }
+}
+
+void vp8_fast_fdct8x4_c(short *input, short *output, int pitch)
+{
+    vp8_fast_fdct4x4_c(input,   output,    pitch);
+    vp8_fast_fdct4x4_c(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_c(short *input, short *output, int pitch)
+{
+    int i;
+    int a1, b1, c1, d1;
+    int a2, b2, c2, d2;
+    short *ip = input;
+    short *op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[3];
+        b1 = ip[1] + ip[2];
+        c1 = ip[1] - ip[2];
+        d1 = ip[0] - ip[3];
+
+        op[0] = a1 + b1;
+        op[1] = c1 + d1;
+        op[2] = a1 - b1;
+        op[3] = d1 - c1;
+        ip += pitch / 2;
+        op += 4;
+    }
+
+    ip = output;
+    op = output;
+
+    for (i = 0; i < 4; i++)
+    {
+        a1 = ip[0] + ip[12];
+        b1 = ip[4] + ip[8];
+        c1 = ip[4] - ip[8];
+        d1 = ip[0] - ip[12];
+
+        a2 = a1 + b1;
+        b2 = c1 + d1;
+        c2 = a1 - b1;
+        d2 = d1 - c1;
+
+        a2 += (a2 > 0);
+        b2 += (b2 > 0);
+        c2 += (c2 > 0);
+        d2 += (d2 > 0);
+
+        op[0] = (a2) >> 1;
+        op[4] = (b2) >> 1;
+        op[8] = (c2) >> 1;
+        op[12] = (d2) >> 1;
+
+        ip++;
+        op++;
+    }
+}
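
A quick way to sanity-check the Walsh-Hadamard transform above: with a constant
input c in all 16 positions, only output[0] is nonzero and equals 8 * c (the
unnormalized 4x4 WHT gain of 16, halved by the final >> 1). A sketch, noting
that pitch is in bytes, so a packed 4x4 block of shorts uses pitch 8:

    /* Sketch: exercise vp8_short_walsh4x4_c on a flat block. */
    void walsh_demo(void)
    {
        short in[16], out[16];
        int i;

        for (i = 0; i < 16; i++)
            in[i] = 5;

        vp8_short_walsh4x4_c(in, out, 8);  /* pitch: 4 shorts = 8 bytes */

        /* out[0] == 40, i.e. (16 * 5 + 1) >> 1; out[1..15] == 0 */
    }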
diff --git a/vp8/encoder/dct.h b/vp8/encoder/dct.h
new file mode 100644
index 0000000..fb307cf
--- /dev/null
+++ b/vp8/encoder/dct.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_DCT_H
+#define __INC_DCT_H
+
+#define prototype_fdct(sym) void (sym)(short *input, short *output, int pitch)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/dct_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/dct_arm.h"
+#endif
+
+#ifndef vp8_fdct_short4x4
+#define vp8_fdct_short4x4  vp8_short_fdct4x4_c
+#endif
+extern prototype_fdct(vp8_fdct_short4x4);
+
+#ifndef vp8_fdct_short8x4
+#define vp8_fdct_short8x4  vp8_short_fdct8x4_c
+#endif
+extern prototype_fdct(vp8_fdct_short8x4);
+
+#ifndef vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4  vp8_fast_fdct4x4_c
+#endif
+extern prototype_fdct(vp8_fdct_fast4x4);
+
+#ifndef vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4  vp8_fast_fdct8x4_c
+#endif
+extern prototype_fdct(vp8_fdct_fast8x4);
+
+#ifndef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_c
+#endif
+extern prototype_fdct(vp8_fdct_walsh_short4x4);
+
+typedef prototype_fdct(*vp8_fdct_fn_t);
+typedef struct
+{
+    vp8_fdct_fn_t    short4x4;
+    vp8_fdct_fn_t    short8x4;
+    vp8_fdct_fn_t    fast4x4;
+    vp8_fdct_fn_t    fast8x4;
+    vp8_fdct_fn_t    walsh_short4x4;
+} vp8_fdct_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define FDCT_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define FDCT_INVOKE(ctx,fn) vp8_fdct_##fn
+#endif
+
+#endif
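
The vtable plus FDCT_INVOKE gives a single call syntax for both build modes:
with CONFIG_RUNTIME_CPU_DETECT the function pointer stored in the context is
called, otherwise the macro collapses to the compile-time alias selected by
the vp8_fdct_* defines above. A usage sketch (the `fdct` parameter stands in
for whatever vp8_fdct_rtcd_vtable_t instance the encoder carries):

    /* Sketch of the dispatch pattern used throughout the encoder. */
    static void transform_block(const vp8_fdct_rtcd_vtable_t *fdct,
                                short *src_diff, short *coeff, int pitch)
    {
        /* Runtime build: fdct->short4x4(...).
           Static build:  vp8_fdct_short4x4(...), an alias for
           vp8_short_fdct4x4_c or an arch-specific override. */
        FDCT_INVOKE(fdct, short4x4)(src_diff, coeff, pitch);
    }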
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
new file mode 100644
index 0000000..a4e3772
--- /dev/null
+++ b/vp8/encoder/encodeframe.c
@@ -0,0 +1,1223 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "encodemv.h"
+#include "common.h"
+#include "onyx_int.h"
+#include "extend.h"
+#include "entropymode.h"
+#include "quant_common.h"
+#include "segmentation_common.h"
+#include "setupintrarecon.h"
+#include "encodeintra.h"
+#include "reconinter.h"
+#include "rdopt.h"
+#include "pickinter.h"
+#include "findnearmv.h"
+#include "reconintra.h"
+#include <stdio.h>
+#include <limits.h>
+#include "subpixel.h"
+#include "vpx_ports/vpx_timer.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define RTCD(x)     &cpi->common.rtcd.x
+#define IF_RTCD(x)  (x)
+#else
+#define RTCD(x)     NULL
+#define IF_RTCD(x)  NULL
+#endif
+extern void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+
+extern void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex);
+extern void vp8_auto_select_speed(VP8_COMP *cpi);
+extern void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+                                      MACROBLOCK *x,
+                                      MB_ROW_COMP *mbr_ei,
+                                      int mb_row,
+                                      int count);
+void vp8_build_block_offsets(MACROBLOCK *x);
+void vp8_setup_block_ptrs(MACROBLOCK *x);
+int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+
+#ifdef MODE_STATS
+unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int inter_uv_modes[4] = {0, 0, 0, 0};
+unsigned int inter_b_modes[15]  = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int y_modes[5]   = {0, 0, 0, 0, 0};
+unsigned int uv_modes[4]  = {0, 0, 0, 0};
+unsigned int b_modes[14]  = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+#endif
+
+// The first four entries are dummy values
+static const int qrounding_factors[129] =
+{
+    56, 56, 56, 56, 56, 56, 56, 56,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    48,
+};
+
+static const int qzbin_factors[129] =
+{
+    64, 64, 64, 64, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80, 80, 80, 80, 80, 80, 80, 80,
+    80,
+};
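+// Both factor tables are Q7 fixed point: (factor * quant_val) >> 7 scales the
+// quantizer step size, so at most Q levels the rounding offset is
+// 48/128 = 0.375 of a step and the zero bin is 80/128 = 0.625 of a step.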
+
+void vp8cx_init_quantizer(VP8_COMP *cpi)
+{
+    int r, c;
+    int i;
+    int quant_val;
+    int Q;
+
+    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
+
+    for (Q = 0; Q < QINDEX_RANGE; Q++)
+    {
+        // dc values
+        quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
+        cpi->Y1quant[Q][0][0] = (1 << 16) / quant_val;
+        cpi->Y1zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->Y1round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.Y1dequant[Q][0][0] = quant_val;
+        cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+        quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
+        cpi->Y2quant[Q][0][0] = (1 << 16) / quant_val;
+        cpi->Y2zbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->Y2round[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.Y2dequant[Q][0][0] = quant_val;
+        cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+        quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
+        cpi->UVquant[Q][0][0] = (1 << 16) / quant_val;
+        cpi->UVzbin[Q][0][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+        cpi->UVround[Q][0][0] = (qrounding_factors[Q] * quant_val) >> 7;
+        cpi->common.UVdequant[Q][0][0] = quant_val;
+        cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
+
+        // all the ac values
+        for (i = 1; i < 16; i++)
+        {
+            int rc = vp8_default_zig_zag1d[i];
+            r = (rc >> 2);
+            c = (rc & 3);
+
+            quant_val = vp8_ac_yquant(Q);
+            cpi->Y1quant[Q][r][c] = (1 << 16) / quant_val;
+            cpi->Y1zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+            cpi->Y1round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+            cpi->common.Y1dequant[Q][r][c] = quant_val;
+            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
+            cpi->Y2quant[Q][r][c] = (1 << 16) / quant_val;
+            cpi->Y2zbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+            cpi->Y2round[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+            cpi->common.Y2dequant[Q][r][c] = quant_val;
+            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+
+            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
+            cpi->UVquant[Q][r][c] = (1 << 16) / quant_val;
+            cpi->UVzbin[Q][r][c] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
+            cpi->UVround[Q][r][c] = (qrounding_factors[Q] * quant_val) >> 7;
+            cpi->common.UVdequant[Q][r][c] = quant_val;
+            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
+        }
+    }
+}
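
At quantize time the tables built above combine as follows: zbin is a dead
zone (smaller coefficients are forced to zero), round is added before the
division, and quant holds (1 << 16) / quant_val so the division becomes a
multiply and shift. A simplified per-coefficient sketch (the real quantizer
also applies zbin_extra and the position-dependent zrun_zbin_boost, both
omitted here):

    /* Simplified sketch of how quant/zbin/round are consumed. */
    static short quantize_coeff(short z, short zbin, short round, short quant)
    {
        int sign = (z < 0);
        int x = sign ? -z : z;
        short q = 0;

        if (x >= zbin)                            /* dead-zone test  */
        {
            int y = ((x + round) * quant) >> 16;  /* ~ x / quant_val */
            q = (short)(sign ? -y : y);
        }

        return q;  /* reconstruction multiplies by the dequant table entry */
    }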
+
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x)
+{
+    int i;
+    int QIndex;
+    MACROBLOCKD *xd = &x->e_mbd;
+    MB_MODE_INFO *mbmi = &xd->mbmi;
+    int zbin_extra;
+
+    // Select the baseline MB Q index.
+    if (xd->segmentation_enabled)
+    {
+        // Abs Value
+        if (xd->mb_segement_abs_delta == SEGMENT_ABSDATA)
+            QIndex = xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+
+        // Delta Value
+        else
+        {
+            QIndex = cpi->common.base_qindex + xd->segment_feature_data[MB_LVL_ALT_Q][mbmi->segment_id];
+            QIndex = (QIndex >= 0) ? ((QIndex <= MAXQ) ? QIndex : MAXQ) : 0;    // Clamp to valid range
+        }
+    }
+    else
+        QIndex = cpi->common.base_qindex;
+
+    // Y
+    zbin_extra = (cpi->common.Y1dequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+
+    for (i = 0; i < 16; i++)
+    {
+        x->block[i].quant = cpi->Y1quant[QIndex];
+        x->block[i].zbin = cpi->Y1zbin[QIndex];
+        x->block[i].round = cpi->Y1round[QIndex];
+        x->e_mbd.block[i].dequant = cpi->common.Y1dequant[QIndex];
+        x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_y1[QIndex];
+        x->block[i].zbin_extra = (short)zbin_extra;
+    }
+
+    // UV
+    zbin_extra = (cpi->common.UVdequant[QIndex][0][1] * (cpi->zbin_over_quant + cpi->zbin_mode_boost)) >> 7;
+
+    for (i = 16; i < 24; i++)
+    {
+        x->block[i].quant = cpi->UVquant[QIndex];
+        x->block[i].zbin = cpi->UVzbin[QIndex];
+        x->block[i].round = cpi->UVround[QIndex];
+        x->e_mbd.block[i].dequant = cpi->common.UVdequant[QIndex];
+        x->block[i].zrun_zbin_boost = cpi->zrun_zbin_boost_uv[QIndex];
+        x->block[i].zbin_extra = (short)zbin_extra;
+    }
+
+    // Y2
+    zbin_extra = (cpi->common.Y2dequant[QIndex][0][1] * ((cpi->zbin_over_quant / 2) + cpi->zbin_mode_boost)) >> 7;
+    x->block[24].quant = cpi->Y2quant[QIndex];
+    x->block[24].zbin = cpi->Y2zbin[QIndex];
+    x->block[24].round = cpi->Y2round[QIndex];
+    x->e_mbd.block[24].dequant = cpi->common.Y2dequant[QIndex];
+    x->block[24].zrun_zbin_boost = cpi->zrun_zbin_boost_y2[QIndex];
+    x->block[24].zbin_extra = (short)zbin_extra;
+}
+
+void vp8cx_frame_init_quantizer(VP8_COMP *cpi)
+{
+    // vp8cx_init_quantizer() is first called in vp8_create_compressor(). The check here
+    // ensures it is only called again when one of the per-frame delta-q values is nonzero.
+    if (cpi->common.y1dc_delta_q | cpi->common.y2dc_delta_q | cpi->common.uvdc_delta_q | cpi->common.y2ac_delta_q | cpi->common.uvac_delta_q)
+    {
+        vp8cx_init_quantizer(cpi);
+    }
+
+    // MB level quantizer setup
+    vp8cx_mb_init_quantizer(cpi, &cpi->mb);
+}
+
+
+
+static
+void encode_mb_row(VP8_COMP *cpi,
+                   VP8_COMMON *cm,
+                   int mb_row,
+                   MACROBLOCK  *x,
+                   MACROBLOCKD *xd,
+                   TOKENEXTRA **tp,
+                   int *segment_counts,
+                   int *totalrate)
+{
+    int i;
+    int recon_yoffset, recon_uvoffset;
+    int mb_col;
+    int recon_y_stride = cm->last_frame.y_stride;
+    int recon_uv_stride = cm->last_frame.uv_stride;
+    int seg_map_index = (mb_row * cpi->common.mb_cols);
+
+
+    // reset above block coeffs
+    xd->above_context[Y1CONTEXT] = cm->above_context[Y1CONTEXT];
+    xd->above_context[UCONTEXT ] = cm->above_context[UCONTEXT ];
+    xd->above_context[VCONTEXT ] = cm->above_context[VCONTEXT ];
+    xd->above_context[Y2CONTEXT] = cm->above_context[Y2CONTEXT];
+
+    xd->up_available = (mb_row != 0);
+    recon_yoffset = (mb_row * recon_y_stride * 16);
+    recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+    cpi->tplist[mb_row].start = *tp;
+    //printf("Main mb_row = %d\n", mb_row);
+
+    // for each macroblock col in image
+    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+    {
+        // Distance of MB from the various image edges.
+        // These are specified in 1/8th-pel units because they are always compared to values in 1/8th-pel units.
+        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+        // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+        xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset;
+        xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset;
+        xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset;
+        xd->left_available = (mb_col != 0);
+
+        // Is segmentation enabled?
+        // MB level adjustment to quantizer
+        if (xd->segmentation_enabled)
+        {
+            // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+            if (cpi->segmentation_map[seg_map_index+mb_col] <= 3)
+                xd->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col];
+            else
+                xd->mbmi.segment_id = 0;
+
+            vp8cx_mb_init_quantizer(cpi, x);
+        }
+        else
+            xd->mbmi.segment_id = 0;         // Set to Segment 0 by default
+
+        x->active_ptr = cpi->active_map + seg_map_index + mb_col;
+
+        if (cm->frame_type == KEY_FRAME)
+        {
+            *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+#ifdef MODE_STATS
+            y_modes[xd->mbmi.mode] ++;
+#endif
+        }
+        else
+        {
+            *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+
+#ifdef MODE_STATS
+            inter_y_modes[xd->mbmi.mode] ++;
+
+            if (xd->mbmi.mode == SPLITMV)
+            {
+                int b;
+
+                for (b = 0; b < xd->mbmi.partition_count; b++)
+                {
+                    inter_b_modes[xd->mbmi.partition_bmi[b].mode] ++;
+                }
+            }
+
+#endif
+
+            // Count of last ref frame 0,0 usage
+            if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME))
+                cpi->inter_zz_count ++;
+
+            // Special case code for cyclic refresh
+            // If cyclic update enabled then copy xd->mbmi.segment_id (which may have been updated based on mode
+            // during vp8cx_encode_inter_macroblock()) back into the global segmentation map
+            if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
+            {
+                cpi->segmentation_map[seg_map_index+mb_col] = xd->mbmi.segment_id;
+
+                // If the block has been refreshed mark it as clean (the magnitude of the negative value
+                // influences how long it will be before we consider another refresh);
+                // else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a
+                // candidate for cleanup next time (marked 0); else mark it as dirty (1).
+                if (xd->mbmi.segment_id)
+                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = -1;
+                else if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME))
+                {
+                    if (cpi->cyclic_refresh_map[seg_map_index+mb_col] == 1)
+                        cpi->cyclic_refresh_map[seg_map_index+mb_col] = 0;
+                }
+                else
+                    cpi->cyclic_refresh_map[seg_map_index+mb_col] = 1;
+
+            }
+        }
+
+        cpi->tplist[mb_row].stop = *tp;
+
+        xd->gf_active_ptr++;      // Increment pointer into gf usage flags structure for next mb
+
+        // store macroblock mode info into context array
+        vpx_memcpy(&xd->mode_info_context->mbmi, &xd->mbmi, sizeof(xd->mbmi));
+
+        for (i = 0; i < 16; i++)
+            vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+
+        // adjust to the next column of macroblocks
+        x->src.y_buffer += 16;
+        x->src.u_buffer += 8;
+        x->src.v_buffer += 8;
+
+        recon_yoffset += 16;
+        recon_uvoffset += 8;
+
+        // Keep track of segment usage
+        segment_counts[xd->mbmi.segment_id] ++;
+
+        // skip to next mb
+        xd->mode_info_context++;
+
+        xd->above_context[Y1CONTEXT] += 4;
+        xd->above_context[UCONTEXT ] += 2;
+        xd->above_context[VCONTEXT ] += 2;
+        xd->above_context[Y2CONTEXT] ++;
+        cpi->current_mb_col_main = mb_col;
+    }
+
+    //extend the recon for intra prediction
+    vp8_extend_mb_row(
+        &cm->new_frame,
+        xd->dst.y_buffer + 16,
+        xd->dst.u_buffer + 8,
+        xd->dst.v_buffer + 8);
+
+    // this is to account for the border
+    xd->mode_info_context++;
+}
+
+
+
+
+
+void vp8_encode_frame(VP8_COMP *cpi)
+{
+    int mb_row;
+    MACROBLOCK *const x = & cpi->mb;
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+
+    int i;
+    TOKENEXTRA *tp = cpi->tok;
+    int segment_counts[MAX_MB_SEGMENTS];
+    int totalrate;
+
+    if (cm->frame_type != KEY_FRAME)
+    {
+        if (cm->mcomp_filter_type == SIXTAP)
+        {
+            xd->subpixel_predict     = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap4x4);
+            xd->subpixel_predict8x4      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x4);
+            xd->subpixel_predict8x8      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap8x8);
+            xd->subpixel_predict16x16    = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, sixtap16x16);
+        }
+        else
+        {
+            xd->subpixel_predict     = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear4x4);
+            xd->subpixel_predict8x4      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x4);
+            xd->subpixel_predict8x8      = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear8x8);
+            xd->subpixel_predict16x16    = SUBPIX_INVOKE(&cpi->common.rtcd.subpix, bilinear16x16);
+        }
+    }
+
+    //else  // Key Frame
+    //{
+    // For key frames make sure the intra ref frame probability value
+    // is set to "all intra"
+    //cpi->prob_intra_coded = 255;
+    //}
+
+
+    xd->gf_active_ptr = (signed char *)cm->gf_active_flags;     // Point to base of GF active flags data structure
+
+    x->vector_range = 32;
+
+    // Count of MBs using the alternate Q if any
+    cpi->alt_qcount = 0;
+
+    // Reset frame count of inter 0,0 motion vector usage.
+    cpi->inter_zz_count = 0;
+
+    vpx_memset(segment_counts, 0, sizeof(segment_counts));
+
+    cpi->prediction_error = 0;
+    cpi->intra_error = 0;
+    cpi->skip_true_count = 0;
+    cpi->skip_false_count = 0;
+
+#if 0
+    // Experimental code
+    cpi->frame_distortion = 0;
+    cpi->last_mb_distortion = 0;
+#endif
+
+    totalrate = 0;
+
+    xd->mode_info = cm->mi - 1;
+
+    xd->mode_info_context = cm->mi;
+    xd->mode_info_stride = cm->mode_info_stride;
+
+    xd->frame_type = cm->frame_type;
+
+    xd->frames_since_golden = cm->frames_since_golden;
+    xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+    vp8_zero(cpi->MVcount);
+    // vp8_zero( Contexts)
+    vp8_zero(cpi->coef_counts);
+
+    // reset intra mode contexts
+    if (cm->frame_type == KEY_FRAME)
+        vp8_init_mbmode_probs(cm);
+
+
+    vp8cx_frame_init_quantizer(cpi);
+
+    if (cpi->compressor_speed == 2)
+    {
+        if (cpi->oxcf.cpu_used < 0)
+            cpi->Speed = -(cpi->oxcf.cpu_used);
+        else
+            vp8_auto_select_speed(cpi);
+    }
+
+    vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+    //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) );
+    vp8cx_initialize_me_consts(cpi, cm->base_qindex);
+    //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex);
+
+    // Copy data over into macro block data structures.
+
+    x->src = * cpi->Source;
+    xd->pre = cm->last_frame;
+    xd->dst = cm->new_frame;
+
+    // set up the new frame for intra-coded blocks
+
+    vp8_setup_intra_recon(&cm->new_frame);
+
+    vp8_build_block_offsets(x);
+
+    vp8_setup_block_dptrs(&x->e_mbd);
+
+    vp8_setup_block_ptrs(x);
+
+    x->rddiv = cpi->RDDIV;
+    x->rdmult = cpi->RDMULT;
+
+#if 0
+    // Experimental rd code
+    // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
+    // such as cpi->rate_correction_factor that indicate relative complexity.
+    /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
+    {
+        //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
+        x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
+    }
+    else
+        x->rdmult = cpi->RDMULT; */
+    //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
+#endif
+
+    xd->mbmi.mode = DC_PRED;
+    xd->mbmi.uv_mode = DC_PRED;
+
+    xd->left_context = cm->left_context;
+
+    vp8_zero(cpi->count_mb_ref_frame_usage)
+    vp8_zero(cpi->ymode_count)
+    vp8_zero(cpi->uv_mode_count)
+
+    x->mvc = cm->fc.mvc;
+
+    // vp8_zero( entropy_stats)
+    {
+        ENTROPY_CONTEXT **p = cm->above_context;
+        const size_t L = cm->mb_cols;
+
+        vp8_zero_array(p [Y1CONTEXT], L * 4)
+        vp8_zero_array(p [ UCONTEXT], L * 2)
+        vp8_zero_array(p [ VCONTEXT], L * 2)
+        vp8_zero_array(p [Y2CONTEXT], L)
+    }
+
+
+    {
+        struct vpx_usec_timer  emr_timer;
+        vpx_usec_timer_start(&emr_timer);
+
+        if (!cpi->b_multi_threaded)
+        {
+            // for each macroblock row in image
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+            {
+
+                vp8_zero(cm->left_context)
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                // adjust to the next row of mbs
+                x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+                x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+                x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+            }
+
+            cpi->tok_count = tp - cpi->tok;
+
+        }
+        else
+        {
+#if CONFIG_MULTITHREAD
+            vp8cx_init_mbrthread_data(cpi, x, cpi->mb_row_ei, 1,  cpi->encoding_thread_count);
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
+            {
+                int i;
+                cpi->current_mb_col_main = -1;
+
+                for (i = 0; i < cpi->encoding_thread_count; i++)
+                {
+                    if ((mb_row + i + 1) >= cm->mb_rows)
+                        break;
+
+                    cpi->mb_row_ei[i].mb_row = mb_row + i + 1;
+                    cpi->mb_row_ei[i].tp  = cpi->tok + (mb_row + i + 1) * (cm->mb_cols * 16 * 24);
+                    cpi->mb_row_ei[i].current_mb_col = -1;
+                    //SetEvent(cpi->h_event_mbrencoding[i]);
+                    sem_post(&cpi->h_event_mbrencoding[i]);
+                }
+
+                vp8_zero(cm->left_context)
+
+                tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+
+                encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+
+                // adjust to the next row of mbs
+                x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+
+                if (mb_row < cm->mb_rows - 1)
+                    //WaitForSingleObject(cpi->h_event_main, INFINITE);
+                    sem_wait(&cpi->h_event_main);
+            }
+
+            /*
+            for( ;mb_row<cm->mb_rows; mb_row ++)
+            {
+            vp8_zero( cm->left_context)
+
+            tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+
+            encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
+            // adjust to the next row of mbs
+            x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+            x->src.u_buffer +=  8 * x->src.uv_stride - 8 * cm->mb_cols;
+            x->src.v_buffer +=  8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+            }
+            */
+            cpi->tok_count = 0;
+
+            for (mb_row = 0; mb_row < cm->mb_rows; mb_row ++)
+            {
+                cpi->tok_count += cpi->tplist[mb_row].stop - cpi->tplist[mb_row].start;
+            }
+
+            if (xd->segmentation_enabled)
+            {
+                int i, j;
+
+                for (i = 0; i < cpi->encoding_thread_count; i++)
+                {
+                    for (j = 0; j < 4; j++)
+                        segment_counts[j] += cpi->mb_row_ei[i].segment_counts[j];
+                }
+            }
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                totalrate += cpi->mb_row_ei[i].totalrate;
+            }
+
+#endif
+
+        }
+
+        vpx_usec_timer_mark(&emr_timer);
+        cpi->time_encode_mb_row += vpx_usec_timer_elapsed(&emr_timer);
+
+    }
+
+
+    // Work out the segment probabilities if segmentation is enabled
+    if (xd->segmentation_enabled)
+    {
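+        // The three probabilities describe a binary tree over the four
+        // segment ids: probs[0] splits {0,1} from {2,3}, probs[1] splits
+        // 0 from 1, and probs[2] splits 2 from 3. Each node probability is
+        // count(left subtree) * 255 / count(both subtrees).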
+        int tot_count;
+        int i;
+
+        // Set to defaults
+        vpx_memset(xd->mb_segment_tree_probs, 255 , sizeof(xd->mb_segment_tree_probs));
+
+        tot_count = segment_counts[0] + segment_counts[1] + segment_counts[2] + segment_counts[3];
+
+        if (tot_count)
+        {
+            xd->mb_segment_tree_probs[0] = ((segment_counts[0] + segment_counts[1]) * 255) / tot_count;
+
+            tot_count = segment_counts[0] + segment_counts[1];
+
+            if (tot_count > 0)
+            {
+                xd->mb_segment_tree_probs[1] = (segment_counts[0] * 255) / tot_count;
+            }
+
+            tot_count = segment_counts[2] + segment_counts[3];
+
+            if (tot_count > 0)
+                xd->mb_segment_tree_probs[2] = (segment_counts[2] * 255) / tot_count;
+
+            // Zero probabilities not allowed
+            for (i = 0; i < MB_FEATURE_TREE_PROBS; i ++)
+            {
+                if (xd->mb_segment_tree_probs[i] == 0)
+                    xd->mb_segment_tree_probs[i] = 1;
+            }
+        }
+    }
+
+    // 256 rate units to the bit
+    cpi->projected_frame_size = totalrate >> 8;   // projected_frame_size in units of bits
+
+    // Make a note of the percentage MBs coded Intra.
+    if (cm->frame_type == KEY_FRAME)
+    {
+        cpi->this_frame_percent_intra = 100;
+    }
+    else
+    {
+        int tot_modes;
+
+        tot_modes = cpi->count_mb_ref_frame_usage[INTRA_FRAME]
+                    + cpi->count_mb_ref_frame_usage[LAST_FRAME]
+                    + cpi->count_mb_ref_frame_usage[GOLDEN_FRAME]
+                    + cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+
+        if (tot_modes)
+            cpi->this_frame_percent_intra = cpi->count_mb_ref_frame_usage[INTRA_FRAME] * 100 / tot_modes;
+
+    }
+
+#if 0
+    {
+        int cnt = 0;
+        int flag[2] = {0, 0};
+
+        for (cnt = 0; cnt < MVPcount; cnt++)
+        {
+            if (cm->fc.pre_mvc[0][cnt] != cm->fc.mvc[0][cnt])
+            {
+                flag[0] = 1;
+                vpx_memcpy(cm->fc.pre_mvc[0], cm->fc.mvc[0], MVPcount);
+                break;
+            }
+        }
+
+        for (cnt = 0; cnt < MVPcount; cnt++)
+        {
+            if (cm->fc.pre_mvc[1][cnt] != cm->fc.mvc[1][cnt])
+            {
+                flag[1] = 1;
+                vpx_memcpy(cm->fc.pre_mvc[1], cm->fc.mvc[1], MVPcount);
+                break;
+            }
+        }
+
+        if (flag[0] || flag[1])
+            vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+    }
+#endif
+
+    // Adjust the projected reference frame usage probability numbers to reflect
+    // what we have just seen. This may be useful when we make multiple iterations
+    // of the recode loop rather than continuing to use values from the previous frame.
+    if ((cm->frame_type != KEY_FRAME) && !cm->refresh_alt_ref_frame && !cm->refresh_golden_frame)
+    {
+        const int *const rfct = cpi->count_mb_ref_frame_usage;
+        const int rf_intra = rfct[INTRA_FRAME];
+        const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+        if ((rf_intra + rf_inter) > 0)
+        {
+            cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
+
+            if (cpi->prob_intra_coded < 1)
+                cpi->prob_intra_coded = 1;
+
+            if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
+            {
+                cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+                if (cpi->prob_last_coded < 1)
+                    cpi->prob_last_coded = 1;
+
+                cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                                     ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+                if (cpi->prob_gf_coded < 1)
+                    cpi->prob_gf_coded = 1;
+            }
+        }
+    }
+
+#if 0
+    // Keep record of the total distortion this time around for future use
+    cpi->last_frame_distortion = cpi->frame_distortion;
+#endif
+
+}
+void vp8_setup_block_ptrs(MACROBLOCK *x)
+{
+    int r, c;
+    int i;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            x->block[r*4+c].src_diff = x->src_diff + r * 4 * 16 + c * 4;
+        }
+    }
+
+    for (r = 0; r < 2; r++)
+    {
+        for (c = 0; c < 2; c++)
+        {
+            x->block[16 + r*2+c].src_diff = x->src_diff + 256 + r * 4 * 8 + c * 4;
+        }
+    }
+
+
+    for (r = 0; r < 2; r++)
+    {
+        for (c = 0; c < 2; c++)
+        {
+            x->block[20 + r*2+c].src_diff = x->src_diff + 320 + r * 4 * 8 + c * 4;
+        }
+    }
+
+    x->block[24].src_diff = x->src_diff + 384;
+
+
+    for (i = 0; i < 25; i++)
+    {
+        x->block[i].coeff = x->coeff + i * 16;
+    }
+}
+
+void vp8_build_block_offsets(MACROBLOCK *x)
+{
+    int block = 0;
+    int br, bc;
+
+    vp8_build_block_doffsets(&x->e_mbd);
+
+    // y blocks
+    for (br = 0; br < 4; br++)
+    {
+        for (bc = 0; bc < 4; bc++)
+        {
+            BLOCK *this_block = &x->block[block];
+            this_block->base_src = &x->src.y_buffer;
+            this_block->src_stride = x->src.y_stride;
+            this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+            ++block;
+        }
+    }
+
+    // u blocks
+    for (br = 0; br < 2; br++)
+    {
+        for (bc = 0; bc < 2; bc++)
+        {
+            BLOCK *this_block = &x->block[block];
+            this_block->base_src = &x->src.u_buffer;
+            this_block->src_stride = x->src.uv_stride;
+            this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+            ++block;
+        }
+    }
+
+    // v blocks
+    for (br = 0; br < 2; br++)
+    {
+        for (bc = 0; bc < 2; bc++)
+        {
+            BLOCK *this_block = &x->block[block];
+            this_block->base_src = &x->src.v_buffer;
+            this_block->src_stride = x->src.uv_stride;
+            this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+            ++block;
+        }
+    }
+}
+
+static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
+{
+    const MACROBLOCKD *xd = & x->e_mbd;
+    const MB_PREDICTION_MODE m = xd->mbmi.mode;
+    const MB_PREDICTION_MODE uvm = xd->mbmi.uv_mode;
+
+#ifdef MODE_STATS
+    const int is_key = cpi->common.frame_type == KEY_FRAME;
+
+    ++ (is_key ? uv_modes : inter_uv_modes)[uvm];
+
+    if (m == B_PRED)
+    {
+        unsigned int *const bct = is_key ? b_modes : inter_b_modes;
+
+        int b = 0;
+
+        do
+        {
+            ++ bct[xd->block[b].bmi.mode];
+        }
+        while (++b < 16);
+    }
+
+#endif
+
+    ++cpi->ymode_count[m];
+    ++cpi->uv_mode_count[uvm];
+
+}
+int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
+{
+    int Error4x4, Error16x16, error_uv;
+    B_PREDICTION_MODE intra_bmodes[16];
+    int rate4x4, rate16x16, rateuv;
+    int dist4x4, dist16x16, distuv;
+    int rate = 0;
+    int rate4x4_tokenonly = 0;
+    int rate16x16_tokenonly = 0;
+    int rateuv_tokenonly = 0;
+    int i;
+
+    x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+    if (cpi->sf.RD || cpi->compressor_speed != 2)
+    {
+        Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
+
+        //save the b modes for possible later use
+        for (i = 0; i < 16; i++)
+            intra_bmodes[i] = x->e_mbd.block[i].bmi.mode;
+
+        Error16x16 = vp8_rd_pick_intra16x16mby_mode(cpi, x, &rate16x16, &rate16x16_tokenonly, &dist16x16);
+
+        error_uv = vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
+
+        x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+
+        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+        rate += rateuv;
+
+        if (Error4x4 < Error16x16)
+        {
+            rate += rate4x4;
+            x->e_mbd.mbmi.mode = B_PRED;
+
+            // get back the intra block modes
+            for (i = 0; i < 16; i++)
+                x->e_mbd.block[i].bmi.mode = intra_bmodes[i];
+
+            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+            cpi->prediction_error += Error4x4 ;
+#if 0
+            // Experimental RD code
+            cpi->frame_distortion += dist4x4;
+#endif
+        }
+        else
+        {
+            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+            rate += rate16x16;
+
+#if 0
+            // Experimental RD code
+            cpi->prediction_error += Error16x16;
+            cpi->frame_distortion += dist16x16;
+#endif
+        }
+
+        sum_intra_stats(cpi, x);
+
+        vp8_tokenize_mb(cpi, &x->e_mbd, t);
+    }
+    else
+#endif
+    {
+
+        int rate2, distortion2;
+        MB_PREDICTION_MODE mode, best_mode = DC_PRED;
+        int this_rd;
+        Error16x16 = INT_MAX;
+
+        for (mode = DC_PRED; mode <= TM_PRED; mode ++)
+        {
+            x->e_mbd.mbmi.mode = mode;
+            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+            rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];
+            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+            if (Error16x16 > this_rd)
+            {
+                Error16x16 = this_rd;
+                best_mode = mode;
+            }
+        }
+
+        vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &distortion2);
+
+        if (distortion2 == INT_MAX)
+            Error4x4 = INT_MAX;
+        else
+            Error4x4 = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+        x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+
+        if (Error4x4 < Error16x16)
+        {
+            x->e_mbd.mbmi.mode = B_PRED;
+            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+            cpi->prediction_error += Error4x4;
+        }
+        else
+        {
+            x->e_mbd.mbmi.mode = best_mode;
+            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+            cpi->prediction_error += Error16x16;
+        }
+
+        vp8_pick_intra_mbuv_mode(x);
+        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+        sum_intra_stats(cpi, x);
+        vp8_tokenize_mb(cpi, &x->e_mbd, t);
+    }
+
+    return rate;
+}
+#ifdef SPEEDSTATS
+extern int cnt_pm;
+#endif
+
+extern void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x);
+
+int vp8cx_encode_inter_macroblock
+(
+    VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
+    int recon_yoffset, int recon_uvoffset
+)
+{
+    MACROBLOCKD *const xd = &x->e_mbd;
+    int inter_error;
+    int intra_error = 0;
+    int rate;
+    int distortion;
+
+    x->skip = 0;
+
+    if (xd->segmentation_enabled)
+        x->encode_breakout = cpi->segment_encode_breakout[xd->mbmi.segment_id];
+    else
+        x->encode_breakout = cpi->oxcf.encode_breakout;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+    if (cpi->sf.RD)
+    {
+        inter_error = vp8_rd_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+    }
+    else
+#endif
+        inter_error = vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error);
+
+
+    cpi->prediction_error += inter_error;
+    cpi->intra_error += intra_error;
+
+#if 0
+    // Experimental RD code
+    cpi->frame_distortion += distortion;
+    cpi->last_mb_distortion = distortion;
+#endif
+
+    // MB level adjustment to quantizer setup
+    if (xd->segmentation_enabled || cpi->zbin_mode_boost_enabled)
+    {
+        // If cyclic update enabled
+        if (cpi->cyclic_refresh_mode_enabled)
+        {
+            // Clear segment_id back to 0 if not coded (last frame 0,0)
+            if ((xd->mbmi.segment_id == 1) &&
+                ((xd->mbmi.ref_frame != LAST_FRAME) || (xd->mbmi.mode != ZEROMV)))
+            {
+                xd->mbmi.segment_id = 0;
+            }
+        }
+
+        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
+        if (cpi->zbin_mode_boost_enabled)
+        {
+            if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame != LAST_FRAME))
+                cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+            else
+                cpi->zbin_mode_boost = 0;
+        }
+
+        vp8cx_mb_init_quantizer(cpi,  x);
+    }
+
+    cpi->count_mb_ref_frame_usage[xd->mbmi.ref_frame] ++;
+
+    if (xd->mbmi.ref_frame == INTRA_FRAME)
+    {
+        x->e_mbd.mbmi.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+
+        vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
+
+        if (xd->mbmi.mode == B_PRED)
+        {
+            vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+        }
+        else
+        {
+            vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+        }
+
+        sum_intra_stats(cpi, x);
+    }
+    else
+    {
+        MV best_ref_mv;
+        MV nearest, nearby;
+        int mdcounts[4];
+
+        vp8_find_near_mvs(xd, xd->mode_info_context,
+                          &nearest, &nearby, &best_ref_mv, mdcounts, xd->mbmi.ref_frame, cpi->common.ref_frame_sign_bias);
+
+        vp8_build_uvmvs(xd, cpi->common.full_pixel);
+
+        // store motion vectors in our motion vector list
+        if (xd->mbmi.ref_frame == LAST_FRAME)
+        {
+            // Set up pointers for this macro block into the previous frame recon buffer
+            xd->pre.y_buffer = cpi->common.last_frame.y_buffer + recon_yoffset;
+            xd->pre.u_buffer = cpi->common.last_frame.u_buffer + recon_uvoffset;
+            xd->pre.v_buffer = cpi->common.last_frame.v_buffer + recon_uvoffset;
+        }
+        else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
+        {
+            // Set up pointers for this macro block into the golden frame recon buffer
+            xd->pre.y_buffer = cpi->common.golden_frame.y_buffer + recon_yoffset;
+            xd->pre.u_buffer = cpi->common.golden_frame.u_buffer + recon_uvoffset;
+            xd->pre.v_buffer = cpi->common.golden_frame.v_buffer + recon_uvoffset;
+        }
+        else
+        {
+            // Set up pointers for this macroblock into the alternate reference frame recon buffer
+            xd->pre.y_buffer = cpi->common.alt_ref_frame.y_buffer + recon_yoffset;
+            xd->pre.u_buffer = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset;
+            xd->pre.v_buffer = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset;
+        }
+
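+        // Accumulate counts of the coded MV residuals (the offset from
+        // best_ref_mv, in the half-resolution units the bitstream uses) so
+        // the MV probability tables can be updated when the frame is packed.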
+        if (xd->mbmi.mode == SPLITMV)
+        {
+            int i;
+
+            for (i = 0; i < 16; i++)
+            {
+                if (xd->block[i].bmi.mode == NEW4X4)
+                {
+                    cpi->MVcount[0][mv_max+((xd->block[i].bmi.mv.as_mv.row - best_ref_mv.row) >> 1)]++;
+                    cpi->MVcount[1][mv_max+((xd->block[i].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++;
+                }
+            }
+        }
+        else if (xd->mbmi.mode == NEWMV)
+        {
+            cpi->MVcount[0][mv_max+((xd->block[0].bmi.mv.as_mv.row - best_ref_mv.row) >> 1)]++;
+            cpi->MVcount[1][mv_max+((xd->block[0].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++;
+        }
+
+        if (!x->skip && !x->e_mbd.mbmi.force_no_skip)
+        {
+            vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+
+            // Clear mb_skip_coeff if mb_no_coeff_skip is not set
+            if (!cpi->common.mb_no_coeff_skip)
+                xd->mbmi.mb_skip_coeff = 0;
+
+        }
+        else
+            vp8_stuff_inter16x16(x);
+    }
+
+    if (!x->skip)
+        vp8_tokenize_mb(cpi, xd, t);
+    else
+    {
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            if (xd->mbmi.mode != B_PRED && xd->mbmi.mode != SPLITMV)
+                xd->mbmi.dc_diff = 0;
+            else
+                xd->mbmi.dc_diff = 1;
+
+            xd->mbmi.mb_skip_coeff = 1;
+            cpi->skip_true_count ++;
+            vp8_fix_contexts(cpi, xd);
+        }
+        else
+        {
+            vp8_stuff_mb(cpi, xd, t);
+            xd->mbmi.mb_skip_coeff = 0;
+            cpi->skip_false_count ++;
+        }
+    }
+
+    return rate;
+}
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
new file mode 100644
index 0000000..403d020
--- /dev/null
+++ b/vp8/encoder/encodeintra.c
@@ -0,0 +1,236 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "idct.h"
+#include "quantize.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "encodemb.h"
+#include "invtrans.h"
+#include "recon.h"
+#include "dct.h"
+#include "g_common.h"
+#include "encodeintra.h"
+
+#define intra4x4ibias_rate    128
+#define intra4x4pbias_rate    256
+
+
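+// i indexes the 16 4x4 subblocks in raster order: the chosen mode becomes the
+// above-context of the block below (i + 4) and the left-context of the block
+// to its right (i + 1), skipping the bottom row and right edge respectively.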
+void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode)
+{
+    if (i < 12)
+    {
+        abmode[i+4] = best_mode;
+    }
+
+    if ((i & 3) != 3)
+    {
+        lbmode[i+1] = best_mode;
+    }
+
+}
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
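+// Encode one 4x4 intra block: predict, subtract, forward DCT, quantize, then
+// inverse transform and reconstruct so that subsequent blocks predict from
+// the same pixels the decoder will have.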
+void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
+{
+    vp8_predict_intra4x4(b, best_mode, b->predictor);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
+
+    x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32);
+
+    x->quantize_b(be, b);
+
+    x->e_mbd.mbmi.mb_skip_coeff &= (!b->eob);
+
+    vp8_inverse_transform_b(IF_RTCD(&rtcd->common->idct), b, 32);
+
+    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode)
+{
+    vp8_predict_intra4x4(b, best_mode, b->predictor);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16);
+
+    x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+
+    x->quantize_brd(be, b);
+
+    x->e_mbd.mbmi.mb_skip_coeff &= (!b->eob);
+
+    IDCT_INVOKE(&rtcd->common->idct, idct16)(b->dqcoeff, b->diff, 32);
+
+    RECON_INVOKE(&rtcd->common->recon, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+}
+
+void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
+{
+    int i;
+
+    MACROBLOCKD *x = &mb->e_mbd;
+    vp8_intra_prediction_down_copy(x);
+
+    for (i = 0; i < 16; i++)
+    {
+        BLOCK *be = &mb->block[i];
+        BLOCKD *b = &x->block[i];
+
+        vp8_encode_intra4x4block(rtcd, mb, be, b, b->bmi.mode);
+    }
+
+    return;
+}
+
+void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    int b;
+
+    vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+
+    vp8_transform_intra_mby(x);
+
+    vp8_quantize_mby(x);
+
+#if !(CONFIG_REALTIME_ONLY)
+#if 1
+
+    if (x->optimize && x->rddiv > 1)
+        vp8_optimize_mby(x, rtcd);
+
+#endif
+#endif
+
+    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+
+    // make sure block modes are set the way we want them for context updates
+    for (b = 0; b < 16; b++)
+    {
+        BLOCKD *d = &x->e_mbd.block[b];
+
+        switch (x->e_mbd.mbmi.mode)
+        {
+
+        case DC_PRED:
+            d->bmi.mode = B_DC_PRED;
+            break;
+        case V_PRED:
+            d->bmi.mode = B_VE_PRED;
+            break;
+        case H_PRED:
+            d->bmi.mode = B_HE_PRED;
+            break;
+        case TM_PRED:
+            d->bmi.mode = B_TM_PRED;
+            break;
+        default:
+            d->bmi.mode = B_DC_PRED;
+            break;
+
+        }
+    }
+}
+
+void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    int b;
+
+    vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+
+    vp8_transform_intra_mbyrd(x);
+
+    x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+    vp8_quantize_mbyrd(x);
+
+
+    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+
+    // make sure block modes are set the way we want them for context updates
+    for (b = 0; b < 16; b++)
+    {
+        BLOCKD *d = &x->e_mbd.block[b];
+
+        switch (x->e_mbd.mbmi.mode)
+        {
+
+        case DC_PRED:
+            d->bmi.mode = B_DC_PRED;
+            break;
+        case V_PRED:
+            d->bmi.mode = B_VE_PRED;
+            break;
+        case H_PRED:
+            d->bmi.mode = B_HE_PRED;
+            break;
+        case TM_PRED:
+            d->bmi.mode = B_TM_PRED;
+            break;
+        default:
+            d->bmi.mode = B_DC_PRED;
+            break;
+
+        }
+    }
+}
+
+void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    vp8_build_intra_predictors_mbuv(&x->e_mbd);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+    vp8_transform_mbuv(x);
+
+    vp8_quantize_mbuv(x);
+
+#if !(CONFIG_REALTIME_ONLY)
+#if 1
+
+    if (x->optimize && x->rddiv > 1)
+        vp8_optimize_mbuv(x, rtcd);
+
+#endif
+#endif
+
+    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    vp8_build_intra_predictors_mbuv(&x->e_mbd);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+    vp8_transform_mbuvrd(x);
+
+    vp8_quantize_mbuvrd(x);
+
+
+
+    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
diff --git a/vp8/encoder/encodeintra.h b/vp8/encoder/encodeintra.h
new file mode 100644
index 0000000..4a43ab2
--- /dev/null
+++ b/vp8/encoder/encodeintra.h
@@ -0,0 +1,24 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef _ENCODEINTRA_H_
+#define _ENCODEINTRA_H_
+#include "onyx_int.h"
+
+void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *, MACROBLOCK *mb);
+void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
+void vp8_update_mode_context(int *abmode, int *lbmode, int i, int best_mode);
+void vp8_encode_intra4x4block_rd(const VP8_ENCODER_RTCD *, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode);
+void vp8_encode_intra16x16mbyrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+void vp8_encode_intra16x16mbuvrd(const VP8_ENCODER_RTCD *, MACROBLOCK *x);
+
+#endif
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
new file mode 100644
index 0000000..d825133
--- /dev/null
+++ b/vp8/encoder/encodemb.c
@@ -0,0 +1,1129 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "encodemb.h"
+#include "reconinter.h"
+#include "quantize.h"
+#include "invtrans.h"
+#include "recon.h"
+#include "reconintra.h"
+#include "dct.h"
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
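+// Compute the 4x4 prediction residual: diff = source - predictor, row by row.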
+void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
+{
+    unsigned char *src_ptr = (*(be->base_src) + be->src);
+    short *diff_ptr = be->src_diff;
+    unsigned char *pred_ptr = bd->predictor;
+    int src_stride = be->src_stride;
+
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
+        }
+
+        diff_ptr += pitch;
+        pred_ptr += pitch;
+        src_ptr  += src_stride;
+    }
+}
+
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+{
+    short *udiff = diff + 256;
+    short *vdiff = diff + 320;
+    unsigned char *upred = pred + 256;
+    unsigned char *vpred = pred + 320;
+
+    int r, c;
+
+    for (r = 0; r < 8; r++)
+    {
+        for (c = 0; c < 8; c++)
+        {
+            udiff[c] = usrc[c] - upred[c];
+        }
+
+        udiff += 8;
+        upred += 8;
+        usrc  += stride;
+    }
+
+    for (r = 0; r < 8; r++)
+    {
+        for (c = 0; c < 8; c++)
+        {
+            vdiff[c] = vsrc[c] - vpred[c];
+        }
+
+        vdiff += 8;
+        vpred += 8;
+        vsrc  += stride;
+    }
+}
+
+void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
+{
+    int r, c;
+
+    for (r = 0; r < 16; r++)
+    {
+        for (c = 0; c < 16; c++)
+        {
+            diff[c] = src[c] - pred[c];
+        }
+
+        diff += 16;
+        pred += 16;
+        src  += stride;
+    }
+}
+
+static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+}
+
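+// Each of the 16 luma 4x4 DCTs leaves its DC coefficient at position 0; those
+// DCs are gathered into block 24 and coded separately through a 4x4 Walsh-
+// Hadamard transform (the "Y2" block), except in SPLITMV mode.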
+void vp8_build_dcblock(MACROBLOCK *x)
+{
+    short *src_diff_ptr = &x->src_diff[384];
+    int i;
+
+    for (i = 0; i < 16; i++)
+    {
+        src_diff_ptr[i] = x->coeff[i * 16];
+    }
+}
+
+void vp8_transform_mbuv(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 16; i < 24; i += 2)
+    {
+        x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+    }
+}
+
+void vp8_transform_mbuvrd(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 16; i < 24; i += 2)
+    {
+        x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+    }
+}
+
+void vp8_transform_intra_mby(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 2)
+    {
+        x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+    }
+
+    // build dc block from 16 y dc values
+    vp8_build_dcblock(x);
+
+    // do 2nd order transform on the dc block
+    x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+
+}
+
+void vp8_transform_intra_mbyrd(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 2)
+    {
+        x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+    }
+
+    // build dc block from 16 y dc values
+    vp8_build_dcblock(x);
+
+    // do 2nd order transform on the dc block
+    x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+}
+
+void vp8_transform_mb(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 2)
+    {
+        x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+    }
+
+    // build dc block from 16 y dc values
+    if (x->e_mbd.mbmi.mode != SPLITMV)
+        vp8_build_dcblock(x);
+
+    for (i = 16; i < 24; i += 2)
+    {
+        x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+    }
+
+    // do 2nd order transform on the dc block
+    if (x->e_mbd.mbmi.mode != SPLITMV)
+        x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+
+}
+
+void vp8_transform_mby(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 2)
+    {
+        x->vp8_short_fdct8x4(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+    }
+
+    // build dc block from 16 y dc values
+    if (x->e_mbd.mbmi.mode != SPLITMV)
+    {
+        vp8_build_dcblock(x);
+        x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+    }
+}
+
+void vp8_transform_mbrd(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 0; i < 16; i += 2)
+    {
+        x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 32);
+    }
+
+    // build dc block from 16 y dc values
+    if (x->e_mbd.mbmi.mode != SPLITMV)
+        vp8_build_dcblock(x);
+
+    for (i = 16; i < 24; i += 2)
+    {
+        x->short_fdct8x4rd(&x->block[i].src_diff[0], &x->block[i].coeff[0], 16);
+    }
+
+    // do 2nd order transform on the dc block
+    if (x->e_mbd.mbmi.mode != SPLITMV)
+        x->short_walsh4x4(&x->block[24].src_diff[0], &x->block[24].coeff[0], 8);
+}
+
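+// For a skipped macroblock no residual is coded, so the inter prediction is
+// built straight into the reconstruction buffer.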
+void vp8_stuff_inter16x16(MACROBLOCK *x)
+{
+    vp8_build_inter_predictors_mb_s(&x->e_mbd);
+    /*
+        // recon = copy from predictors to destination
+        {
+            BLOCKD *b = &x->e_mbd.block[0];
+            unsigned char *pred_ptr = b->predictor;
+            unsigned char *dst_ptr = *(b->base_dst) + b->dst;
+            int stride = b->dst_stride;
+
+            int i;
+            for(i=0;i<16;i++)
+                vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16);
+
+            b = &x->e_mbd.block[16];
+            pred_ptr = b->predictor;
+            dst_ptr = *(b->base_dst) + b->dst;
+            stride = b->dst_stride;
+
+            for(i=0;i<8;i++)
+                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
+
+            b = &x->e_mbd.block[20];
+            pred_ptr = b->predictor;
+            dst_ptr = *(b->base_dst) + b->dst;
+            stride = b->dst_stride;
+
+            for(i=0;i<8;i++)
+                vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8);
+        }
+    */
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+extern const TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+extern const TOKENEXTRA *vp8_dct_value_tokens_ptr;
+extern int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+extern int *vp8_dct_value_cost_ptr;
+
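+// Token cost of one block: walk the quantized coefficients in zigzag order,
+// pricing each token under the context (block type, coefficient band,
+// previous token class) the arithmetic coder will see, plus an end-of-block
+// token whenever the block ends before coefficient 16.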
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+    int c = !type;              /* start at coef 0, unless Y with Y2 */
+    int eob = b->eob;
+    int pt ;    /* surrounding block/prev coef predictor */
+    int cost = 0;
+    short *qcoeff_ptr = b->qcoeff;
+
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+# define QC( I)  ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+
+    for (; c < eob; c++)
+    {
+        int v = QC(c);
+        int t = vp8_dct_value_tokens_ptr[v].Token;
+        cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+        cost += vp8_dct_value_cost_ptr[v];
+        pt = vp8_prev_token_class[t];
+    }
+
+# undef QC
+
+    if (c < 16)
+        cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+
+    return cost;
+}
+
+static int mbycost_coeffs(MACROBLOCK *mb)
+{
+    int cost = 0;
+    int b;
+    TEMP_CONTEXT t;
+    int type = 0;
+
+    MACROBLOCKD *x = &mb->e_mbd;
+
+    vp8_setup_temp_context(&t, x->above_context[Y1CONTEXT], x->left_context[Y1CONTEXT], 4);
+
+    if (x->mbmi.mode == SPLITMV)
+        type = 3;
+
+    for (b = 0; b < 16; b++)
+        cost += cost_coeffs(mb, x->block + b, type,
+                            t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+
+    return cost;
+}
+
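+// Rate-distortion cost: the rate term is scaled by rdmult (with rounding, in
+// 1/256th units) and added to the distortion weighted by rddiv; the target_rd
+// argument is currently unused.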
+#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+
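+// Coefficient-level optimization: find quantized coefficients that landed
+// just above a dequantization boundary, then exhaustively try every
+// combination of nudging them one step toward zero and keep the combination
+// with the lowest rate-distortion cost.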
+void vp8_optimize_b(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd)
+{
+    BLOCK *b = &x->block[i];
+    BLOCKD *bd = &x->e_mbd.block[i];
+    short *dequant_ptr = &bd->dequant[0][0];
+    int nzpos[16] = {0};
+    short saved_qcoefs[16];
+    short saved_dqcoefs[16];
+    int baserate, baseerror, baserd;
+    int rate, error, thisrd;
+    int k;
+    int nzcoefcount = 0;
+    int nc, bestnc = 0;
+    int besteob;
+
+    // count potential coefficients to be optimized
+    for (k = !type; k < 16; k++)
+    {
+        int qcoef = abs(bd->qcoeff[k]);
+        int coef = abs(b->coeff[k]);
+        int dq   = dequant_ptr[k];
+
+        if (qcoef && (qcoef * dq > coef) && (qcoef * dq < coef + dq))
+        {
+            nzpos[nzcoefcount] = k;
+            nzcoefcount++;
+        }
+    }
+
+    // if nothing here, do nothing for this block.
+    if (!nzcoefcount)
+    {
+        *a = *l = (bd->eob != !type);
+        return;
+    }
+
+    // save a copy of quantized coefficients
+    vpx_memcpy(saved_qcoefs, bd->qcoeff, 32);
+    vpx_memcpy(saved_dqcoefs, bd->dqcoeff, 32);
+
+    besteob   = bd->eob;
+    baserate  = cost_coeffs(x, bd, type, a, l);
+    baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+    baserd    = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+    for (nc = 1; nc < (1 << nzcoefcount); nc++)
+    {
+        //reset coefficients
+        vpx_memcpy(bd->qcoeff,  saved_qcoefs,  32);
+        vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+        for (k = 0; k < nzcoefcount; k++)
+        {
+            int pos = nzpos[k];
+
+            if ((nc & (1 << k)))
+            {
+                int cur_qcoef = bd->qcoeff[pos];
+
+                if (cur_qcoef < 0)
+                {
+                    bd->qcoeff[pos]++;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+                else
+                {
+                    bd->qcoeff[pos]--;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+            }
+        }
+
+        {
+            int eob = -1;
+            int rc;
+            int m;
+
+            for (m = 0; m < 16; m++)
+            {
+                rc   = vp8_default_zig_zag1d[m];
+
+                if (bd->qcoeff[rc])
+                    eob = m;
+            }
+
+            bd->eob = eob + 1;
+        }
+
+        rate  = cost_coeffs(x, bd, type, a, l);
+        error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+        thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+        if (thisrd < baserd)
+        {
+            baserd = thisrd;
+            bestnc = nc;
+            besteob = bd->eob;
+        }
+    }
+
+    //reset coefficients
+    vpx_memcpy(bd->qcoeff,  saved_qcoefs, 32);
+    vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+    if (bestnc)
+    {
+        for (k = 0; k < nzcoefcount; k++)
+        {
+            int pos = nzpos[k];
+
+            if (bestnc & (1 << k))
+            {
+                int cur_qcoef = bd->qcoeff[pos];
+
+                if (cur_qcoef < 0)
+                {
+                    bd->qcoeff[pos]++;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+                else
+                {
+                    bd->qcoeff[pos]--;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+            }
+        }
+
+#if 0
+        {
+            int eob = -1;
+            int rc;
+            int m;
+
+            for (m = 0; m < 16; m++)
+            {
+                rc   = vp8_default_zig_zag1d[m];
+
+                if (bd->qcoeff[rc])
+                    eob = m;
+            }
+
+            bd->eob = eob + 1;
+        }
+#endif
+    }
+
+#if 1
+    bd->eob = besteob;
+#endif
+#if 0
+    {
+        int eob = -1;
+        int rc;
+        int m;
+
+        for (m = 0; m < 16; m++)
+        {
+            rc   = vp8_default_zig_zag1d[m];
+
+            if (bd->qcoeff[rc])
+                eob = m;
+        }
+
+        bd->eob = eob + 1;
+    }
+
+#endif
+    *a = *l = (bd->eob != !type);
+    return;
+}
+
+void vp8_optimize_bplus(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd)
+{
+    BLOCK *b = &x->block[i];
+    BLOCKD *bd = &x->e_mbd.block[i];
+    short *dequant_ptr = &bd->dequant[0][0];
+    int nzpos[16] = {0};
+    short saved_qcoefs[16];
+    short saved_dqcoefs[16];
+    int baserate, baseerror, baserd;
+    int rate, error, thisrd;
+    int k;
+    int nzcoefcount = 0;
+    int nc, bestnc = 0;
+    int besteob;
+
+    // count potential coefficients to be optimized
+    for (k = !type; k < 16; k++)
+    {
+        int qcoef = abs(bd->qcoeff[k]);
+        int coef = abs(b->coeff[k]);
+        int dq   = dequant_ptr[k];
+
+        if (qcoef && (qcoef * dq < coef) && (coef < (qcoef * dq + dq)))
+        {
+            nzpos[nzcoefcount] = k;
+            nzcoefcount++;
+        }
+    }
+
+    // if nothing here, do nothing for this block.
+    if (!nzcoefcount)
+    {
+        // do not update the context; the other half still needs to be done.
+        //*a = *l = (bd->eob != !type);
+        return;
+    }
+
+    // save a copy of quantized coefficients
+    vpx_memcpy(saved_qcoefs, bd->qcoeff, 32);
+    vpx_memcpy(saved_dqcoefs, bd->dqcoeff, 32);
+
+    besteob   = bd->eob;
+    baserate  = cost_coeffs(x, bd, type, a, l);
+    baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+    baserd    = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+    for (nc = 1; nc < (1 << nzcoefcount); nc++)
+    {
+        //reset coefficients
+        vpx_memcpy(bd->qcoeff, saved_qcoefs, 32);
+        vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+        for (k = 0; k < nzcoefcount; k++)
+        {
+            int pos = nzpos[k];
+
+            if ((nc & (1 << k)))
+            {
+                int cur_qcoef = bd->qcoeff[pos];
+
+                if (cur_qcoef < 0)
+                {
+                    bd->qcoeff[pos]--;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+                else
+                {
+                    bd->qcoeff[pos]++;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+            }
+        }
+
+        {
+            int eob = -1;
+            int rc;
+            int m;
+
+            for (m = 0; m < 16; m++)
+            {
+                rc   = vp8_default_zig_zag1d[m];
+
+                if (bd->qcoeff[rc])
+                    eob = m;
+            }
+
+            bd->eob = eob + 1;
+        }
+
+        rate  = cost_coeffs(x, bd, type, a, l);
+        error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 2;
+        thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+        if (thisrd < baserd)
+        {
+            baserd = thisrd;
+            bestnc = nc;
+            besteob = bd->eob;
+        }
+    }
+
+    //reset coefficients
+    vpx_memcpy(bd->qcoeff,  saved_qcoefs, 32);
+    vpx_memcpy(bd->dqcoeff, saved_dqcoefs, 32);
+
+    if (bestnc)
+    {
+        for (k = 0; k < nzcoefcount; k++)
+        {
+            int pos = nzpos[k];
+
+            if (bestnc & (1 << k))
+            {
+                int cur_qcoef = bd->qcoeff[pos];
+
+                if (cur_qcoef < 0)
+                {
+                    bd->qcoeff[pos]--;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+                else
+                {
+                    bd->qcoeff[pos]++;
+                    bd->dqcoeff[pos] = bd->qcoeff[pos] * dequant_ptr[pos];
+                }
+            }
+        }
+    }
+
+    bd->eob = besteob;
+    // do not update the context; the other half still needs to be done.
+    //*a = *l = (bd->eob != !type);
+    return;
+}
+
+void vp8_optimize_y2b(MACROBLOCK *x, int i, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, const VP8_ENCODER_RTCD *rtcd)
+{
+
+    BLOCK *b = &x->block[i];
+    BLOCKD *bd = &x->e_mbd.block[i];
+    short *dequant_ptr = &bd->dequant[0][0];
+
+    int baserate, baseerror, baserd;
+    int rate, error, thisrd;
+    int k;
+
+    if (bd->eob == 0)
+        return;
+
+    baserate  = cost_coeffs(x, bd, type, a, l);
+    baseerror = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 4;
+    baserd = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+    for (k = 0; k < 16; k++)
+    {
+        int cur_qcoef = bd->qcoeff[k];
+
+        if (!cur_qcoef)
+            continue;
+
+        if (cur_qcoef < 0)
+        {
+            bd->qcoeff[k]++;
+            bd->dqcoeff[k] = bd->qcoeff[k] * dequant_ptr[k];
+        }
+        else
+        {
+            bd->qcoeff[k]--;
+            bd->dqcoeff[k] = bd->qcoeff[k] * dequant_ptr[k];
+        }
+
+        if (bd->qcoeff[k] == 0)
+        {
+            int eob = -1;
+            int rc;
+            int l;
+
+            for (l = 0; l < 16; l++)
+            {
+                rc   = vp8_default_zig_zag1d[l];
+
+                if (bd->qcoeff[rc])
+                    eob = l;
+            }
+
+            bd->eob = eob + 1;
+        }
+
+        rate  =   cost_coeffs(x, bd, type, a, l);
+        error = ENCODEMB_INVOKE(&rtcd->encodemb, berr)(b->coeff, bd->dqcoeff) >> 4;
+        thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+        if (thisrd > baserd)
+        {
+            bd->qcoeff[k] = cur_qcoef;
+            bd->dqcoeff[k] = cur_qcoef * dequant_ptr[k];
+        }
+        else
+        {
+            baserd = thisrd;
+        }
+
+    }
+
+    {
+        int eob = -1;
+        int rc;
+
+        for (k = 0; k < 16; k++)
+        {
+            rc   = vp8_default_zig_zag1d[k];
+
+            if (bd->qcoeff[rc])
+                eob = k;
+        }
+
+        bd->eob = eob + 1;
+    }
+
+    return;
+}
+
+
+void vp8_optimize_mb(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    int cost = 0;
+    int b;
+    TEMP_CONTEXT t, t2;
+    int type = 0;
+
+    vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4);
+
+    if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED)
+        type = 3;
+
+    for (b = 0; b < 16; b++)
+    {
+        //vp8_optimize_bplus(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+        vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+    }
+
+    vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2);
+    vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2);
+
+    for (b = 16; b < 20; b++)
+    {
+        //vp8_optimize_bplus(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+        vp8_optimize_b(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+    }
+
+    for (b = 20; b < 24; b++)
+    {
+        //vp8_optimize_bplus(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b]);
+        vp8_optimize_b(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd);
+    }
+}
+
+
+
+void vp8_super_slow_yquant_optimization(MACROBLOCK *x, int type, const VP8_ENCODER_RTCD *rtcd)
+{
+    BLOCK  *b = &x->block[0];
+    BLOCKD *bd = &x->e_mbd.block[0];
+    short *dequant_ptr = &bd->dequant[0][0];
+    struct
+    {
+        int block;
+        int pos;
+    } nzpos[256];
+    short saved_qcoefs[256];
+    short saved_dqcoefs[256];
+    short *coef_ptr   = x->coeff;
+    short *qcoef_ptr  = x->e_mbd.qcoeff;
+    short *dqcoef_ptr = x->e_mbd.dqcoeff;
+
+    int baserate, baseerror, baserd;
+    int rate, error, thisrd;
+    int i, k;
+    int nzcoefcount = 0;
+    int nc, bestnc = 0;
+    int besteob;
+
+    // this code assumes a particular layout of the macroblock coeff buffer
+    for (i = 0; i < 16; i++)
+    {
+        // count potential coefficients to be optimized
+        for (k = !type; k < 16; k++)
+        {
+            int qcoef = abs(qcoef_ptr[i*16 + k]);
+            int coef = abs(coef_ptr[i*16 + k]);
+            int dq   = dequant_ptr[k];
+
+            if (qcoef && (qcoef * dq > coef) && (qcoef * dq < coef + dq))
+            {
+                nzpos[nzcoefcount].block = i;
+                nzpos[nzcoefcount].pos   = k;
+                nzcoefcount++;
+            }
+        }
+    }
+
+    // if there is nothing to optimize, or too many candidates to search exhaustively, do nothing for this macroblock.
+    if (!nzcoefcount || nzcoefcount > 15)
+    {
+        return;
+    }
+
+    /******************************************************************************
+    Looking from each coefficient's perspective, each identified coefficient above
+    can take 2 values: rounded-down(x) and rounded-up(x). Therefore the total
+    number of different states is at most 2**nzcoefcount.
+    ******************************************************************************/
+    // save the quantized and dequantized coefficients
+    vpx_memcpy(saved_qcoefs, x->e_mbd.qcoeff,  256);
+    vpx_memcpy(saved_dqcoefs, x->e_mbd.dqcoeff, 256);
+
+    baserate    = mbycost_coeffs(x);
+    baseerror   = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, !type);
+    baserd      = RDFUNC(x->rdmult, x->rddiv, baserate, baseerror, 100);
+
+    for (nc = 1; nc < (1 << nzcoefcount); nc++)
+    {
+        //reset coefficients
+        vpx_memcpy(x->e_mbd.qcoeff,  saved_qcoefs, 256);
+        vpx_memcpy(x->e_mbd.dqcoeff, saved_dqcoefs, 256);
+
+        for (k = 0; k < nzcoefcount; k++)
+        {
+            int bk  = nzpos[k].block;
+            int pos = nzpos[k].pos;
+            int mbkpos  = bk * 16 + pos;
+
+            if ((nc & (1 << k)))
+            {
+                int cur_qcoef = x->e_mbd.qcoeff[mbkpos];
+
+                if (cur_qcoef < 0)
+                {
+                    x->e_mbd.qcoeff[mbkpos]++;
+                    x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+                }
+                else
+                {
+                    x->e_mbd.qcoeff[mbkpos]--;
+                    x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+                }
+            }
+        }
+
+        for (i = 0; i < 16; i++)
+        {
+            BLOCKD *bd = &x->e_mbd.block[i];
+            {
+                int eob = -1;
+                int rc;
+                int l;
+
+                for (l = 0; l < 16; l++)
+                {
+                    rc   = vp8_default_zig_zag1d[l];
+
+                    if (bd->qcoeff[rc])
+                        eob = l;
+                }
+
+                bd->eob = eob + 1;
+            }
+        }
+
+        rate  = mbycost_coeffs(x);
+        error = ENCODEMB_INVOKE(&rtcd->encodemb, mberr)(x, !type);
+        thisrd = RDFUNC(x->rdmult, x->rddiv, rate, error, 100);
+
+        if (thisrd < baserd)
+        {
+            baserd = thisrd;
+            bestnc = nc;
+            besteob = bd->eob;
+        }
+    }
+
+    //reset coefficients
+    vpx_memcpy(x->e_mbd.qcoeff,  saved_qcoefs, 256);
+    vpx_memcpy(x->e_mbd.dqcoeff, saved_dqcoefs, 256);
+
+    if (bestnc)
+    {
+        for (k = 0; k < nzcoefcount; k++)
+        {
+            int bk  = nzpos[k].block;
+            int pos = nzpos[k].pos;
+            int mbkpos  = bk * 16 + pos;
+
+            if ((bestnc & (1 << k)))
+            {
+                int cur_qcoef = x->e_mbd.qcoeff[mbkpos];
+
+                if (cur_qcoef < 0)
+                {
+                    x->e_mbd.qcoeff[mbkpos]++;
+                    x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+                }
+                else
+                {
+                    x->e_mbd.qcoeff[mbkpos]--;
+                    x->e_mbd.dqcoeff[mbkpos] = x->e_mbd.qcoeff[mbkpos] * dequant_ptr[pos];
+                }
+            }
+        }
+    }
+
+    for (i = 0; i < 16; i++)
+    {
+        BLOCKD *bd = &x->e_mbd.block[i];
+        {
+            int eob = -1;
+            int rc;
+            int l;
+
+            for (l = 0; l < 16; l++)
+            {
+                rc   = vp8_default_zig_zag1d[l];
+
+                if (bd->qcoeff[rc])
+                    eob = l;
+            }
+
+            bd->eob = eob + 1;
+        }
+    }
+
+    return;
+}
+
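+// A macroblock may be flagged as skipped only if no block codes any
+// coefficients. When the Y2 block is present the per-block DC is not coded in
+// the Y1 blocks, so a Y1 eob of 1 (DC only) still counts as empty.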
+static void vp8_find_mb_skip_coef(MACROBLOCK *x)
+{
+    int i;
+
+    x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+    if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+        }
+
+        for (i = 16; i < 25; i++)
+        {
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+    else
+    {
+        for (i = 0; i < 24; i++)
+        {
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+}
+
+
+void vp8_optimize_mb_slow(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    int cost = 0;
+    int b;
+    TEMP_CONTEXT t, t2;
+    int type = 0;
+
+
+    vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4);
+
+    if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED)
+        type = 3;
+
+    vp8_super_slow_yquant_optimization(x, type, rtcd);
+    /*
+    for(b=0;b<16;b++)
+    {
+        vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+    }
+    */
+
+    vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2);
+
+    for (b = 16; b < 20; b++)
+    {
+        vp8_optimize_b(x, b, vp8_block2type[b], t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+    }
+
+    vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2);
+
+    for (b = 20; b < 24; b++)
+    {
+        vp8_optimize_b(x, b, vp8_block2type[b], t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd);
+    }
+}
+
+
+void vp8_optimize_mby(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    int cost = 0;
+    int b;
+    TEMP_CONTEXT t;
+    int type = 0;
+
+    if (!x->e_mbd.above_context[Y1CONTEXT])
+        return;
+
+    if (!x->e_mbd.left_context[Y1CONTEXT])
+        return;
+
+    vp8_setup_temp_context(&t, x->e_mbd.above_context[Y1CONTEXT], x->e_mbd.left_context[Y1CONTEXT], 4);
+
+    if (x->e_mbd.mbmi.mode == SPLITMV || x->e_mbd.mbmi.mode == B_PRED)
+        type = 3;
+
+    for (b = 0; b < 16; b++)
+    {
+        vp8_optimize_b(x, b, type, t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+    }
+
+}
+
+void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd)
+{
+    int cost = 0;
+    int b;
+    TEMP_CONTEXT t, t2;
+    int type = 0;
+
+    if (!x->e_mbd.above_context[UCONTEXT])
+        return;
+
+    if (!x->e_mbd.left_context[UCONTEXT])
+        return;
+
+    if (!x->e_mbd.above_context[VCONTEXT])
+        return;
+
+    if (!x->e_mbd.left_context[VCONTEXT])
+        return;
+
+
+    vp8_setup_temp_context(&t, x->e_mbd.above_context[UCONTEXT], x->e_mbd.left_context[UCONTEXT], 2);
+    vp8_setup_temp_context(&t2, x->e_mbd.above_context[VCONTEXT], x->e_mbd.left_context[VCONTEXT], 2);
+
+    for (b = 16; b < 20; b++)
+    {
+        vp8_optimize_b(x, b, vp8_block2type[b],
+                       t.a + vp8_block2above[b], t.l + vp8_block2left[b], rtcd);
+
+    }
+
+    for (b = 20; b < 24; b++)
+    {
+        vp8_optimize_b(x, b, vp8_block2type[b],
+                       t2.a + vp8_block2above[b], t2.l + vp8_block2left[b], rtcd);
+    }
+
+}
+#endif
+
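+// Full inter macroblock encode: motion-compensated prediction, residual,
+// forward transform, quantization, optional coefficient-level optimization,
+// then inverse transform and reconstruction.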
+void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    vp8_build_inter_predictors_mb(&x->e_mbd);
+
+    vp8_subtract_mb(rtcd, x);
+
+    vp8_transform_mb(x);
+
+    vp8_quantize_mb(x);
+
+#if !(CONFIG_REALTIME_ONLY)
+#if 1
+
+    if (x->optimize && x->rddiv > 1)
+    {
+        vp8_optimize_mb(x, rtcd);
+        vp8_find_mb_skip_coef(x);
+    }
+
+#endif
+#endif
+
+    vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon16x16mb(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+
+/* this function is used by the first pass only */
+void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    vp8_build_inter_predictors_mby(&x->e_mbd);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+
+    vp8_transform_mby(x);
+
+    vp8_quantize_mby(x);
+
+    vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+
+void vp8_encode_inter16x16uv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    vp8_build_inter_predictors_mbuv(&x->e_mbd);
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+    vp8_transform_mbuv(x);
+
+    vp8_quantize_mbuv(x);
+
+    vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
+
+    vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+}
+
+
+void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
+{
+    vp8_build_inter_predictors_mbuv(&x->e_mbd);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+    vp8_transform_mbuvrd(x);
+
+    vp8_quantize_mbuvrd(x);
+
+}
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
new file mode 100644
index 0000000..91ca8f5
--- /dev/null
+++ b/vp8/encoder/encodemb.h
@@ -0,0 +1,112 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMB_H
+#define __INC_ENCODEMB_H
+
+#include "vpx_ports/config.h"
+#include "block.h"
+
+#define prototype_mberr(sym) \
+    int (sym)(MACROBLOCK *mb, int dc)
+
+#define prototype_berr(sym) \
+    int (sym)(short *coeff, short *dqcoeff)
+
+#define prototype_mbuverr(sym) \
+    int (sym)(MACROBLOCK *mb)
+
+#define prototype_subb(sym) \
+    void (sym)(BLOCK *be,BLOCKD *bd, int pitch)
+
+#define prototype_submby(sym) \
+    void (sym)(short *diff, unsigned char *src, unsigned char *pred, int stride)
+
+#define prototype_submbuv(sym) \
+    void (sym)(short *diff, unsigned char *usrc, unsigned char *vsrc,\
+               unsigned char *pred, int stride)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/encodemb_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/encodemb_arm.h"
+#endif
+
+#ifndef vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_c
+#endif
+extern prototype_berr(vp8_encodemb_berr);
+
+#ifndef vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_c
+#endif
+extern prototype_mberr(vp8_encodemb_mberr);
+
+#ifndef vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_c
+#endif
+extern prototype_mbuverr(vp8_encodemb_mbuverr);
+
+#ifndef vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_c
+#endif
+extern prototype_subb(vp8_encodemb_subb);
+
+#ifndef vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_c
+#endif
+extern prototype_submby(vp8_encodemb_submby);
+
+#ifndef vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_c
+#endif
+extern prototype_submbuv(vp8_encodemb_submbuv);
+
+
+typedef struct
+{
+    prototype_berr(*berr);
+    prototype_mberr(*mberr);
+    prototype_mbuverr(*mbuverr);
+    prototype_subb(*subb);
+    prototype_submby(*submby);
+    prototype_submbuv(*submbuv);
+} vp8_encodemb_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define ENCODEMB_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define ENCODEMB_INVOKE(ctx,fn) vp8_encodemb_##fn
+#endif
+
+
+
+#include "onyx_int.h"
+struct VP8_ENCODER_RTCD;
+void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
+extern void vp8_stuff_inter16x16(MACROBLOCK *x);
+
+void vp8_build_dcblock(MACROBLOCK *b);
+void vp8_transform_mb(MACROBLOCK *mb);
+void vp8_transform_mbuv(MACROBLOCK *x);
+void vp8_transform_mbuvrd(MACROBLOCK *x);
+void vp8_transform_intra_mby(MACROBLOCK *x);
+void vp8_transform_intra_mbyrd(MACROBLOCK *x);
+void Encode16x16Y(MACROBLOCK *x);
+void Encode16x16UV(MACROBLOCK *x);
+void vp8_encode_inter16x16uv(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
+void vp8_encode_inter16x16y(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+#endif
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
new file mode 100644
index 0000000..f287edc
--- /dev/null
+++ b/vp8/encoder/encodemv.c
@@ -0,0 +1,445 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "common.h"
+#include "encodemv.h"
+#include "entropymode.h"
+#include "systemdependent.h"
+
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+extern unsigned int active_section;
+#endif
+
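+// Code one MV component. Small magnitudes (below mvnum_short) use a tree;
+// larger ones are coded bitwise: bits 0-2 first, then the high bits down to
+// bit 4, with bit 3 written only when a higher bit is set (otherwise it must
+// be 1 and is left implicit). A sign bit follows unless the magnitude is 0.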
+static void encode_mvcomponent(
+    vp8_writer *const w,
+    const int v,
+    const struct mv_context *mvc
+)
+{
+    const vp8_prob *p = mvc->prob;
+    const int x = v < 0 ? -v : v;
+
+    if (x < mvnum_short)     // Small
+    {
+        vp8_write(w, 0, p [mvpis_short]);
+        vp8_treed_write(w, vp8_small_mvtree, p + MVPshort, x, 3);
+
+        if (!x)
+            return;         // no sign bit
+    }
+    else                    // Large
+    {
+        int i = 0;
+
+        vp8_write(w, 1, p [mvpis_short]);
+
+        do
+            vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+        while (++i < 3);
+
+        i = mvlong_width - 1;  /* Skip bit 3, which is sometimes implicit */
+
+        do
+            vp8_write(w, (x >> i) & 1, p [MVPbits + i]);
+
+        while (--i > 3);
+
+        if (x & 0xFFF0)
+            vp8_write(w, (x >> 3) & 1, p [MVPbits + 3]);
+    }
+
+    vp8_write(w, v < 0, p [MVPsign]);
+}
+#if 0
+static int max_mv_r = 0;
+static int max_mv_c = 0;
+#endif
+void vp8_encode_motion_vector(vp8_writer *w, const MV *mv, const MV_CONTEXT *mvc)
+{
+
+#if 0
+    {
+        if (abs(mv->row >> 1) > max_mv_r)
+        {
+            FILE *f = fopen("maxmv.stt", "a");
+            max_mv_r = abs(mv->row >> 1);
+            fprintf(f, "New Mv Row Max %6d\n", (mv->row >> 1));
+
+            if ((abs(mv->row) / 2) != max_mv_r)
+                fprintf(f, "MV Row conversion error %6d\n", abs(mv->row) / 2);
+
+            fclose(f);
+        }
+
+        if (abs(mv->col >> 1) > max_mv_c)
+        {
+            FILE *f = fopen("maxmv.stt", "a");
+            fprintf(f, "New Mv Col Max %6d\n", (mv->col >> 1));
+            max_mv_c = abs(mv->col >> 1);
+            fclose(f);
+        }
+    }
+#endif
+
+    encode_mvcomponent(w, mv->row >> 1, &mvc[0]);
+    encode_mvcomponent(w, mv->col >> 1, &mvc[1]);
+}
+
+
+static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
+{
+    const vp8_prob *p = mvc->prob;
+    const int x = v;   //v<0? -v:v;
+    unsigned int cost;
+
+    if (x < mvnum_short)
+    {
+        cost = vp8_cost_zero(p [mvpis_short])
+               + vp8_treed_cost(vp8_small_mvtree, p + MVPshort, x, 3);
+
+        if (!x)
+            return cost;
+    }
+    else
+    {
+        int i = 0;
+        cost = vp8_cost_one(p [mvpis_short]);
+
+        do
+            cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+        while (++i < 3);
+
+        i = mvlong_width - 1;  /* Skip bit 3, which is sometimes implicit */
+
+        do
+            cost += vp8_cost_bit(p [MVPbits + i], (x >> i) & 1);
+
+        while (--i > 3);
+
+        if (x & 0xFFF0)
+            cost += vp8_cost_bit(p [MVPbits + 3], (x >> 3) & 1);
+    }
+
+    return cost;   // + vp8_cost_bit( p [MVPsign], v < 0);
+}
+//#define M_LOG2_E 0.693147180559945309417
+//#define log2f(x) (log (x) / (float) M_LOG2_E)
+
+void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+{
+    int i = 1;   //-mv_max;
+    unsigned int cost0 = 0;
+    unsigned int cost1 = 0;
+
+    vp8_clear_system_state();
+#if 0
+    mvsadcost [0] [0] = 300;
+    mvsadcost [1] [0] = 300;
+
+    do
+    {
+        double z = 256 * (2 * (log2f(2 * i) + .6));
+        mvsadcost [0][i] = (int) z;
+        mvsadcost [1][i] = (int) z;
+        mvsadcost [0][-i] = (int) z;
+        mvsadcost [1][-i] = (int) z;
+    }
+    while (++i <= mv_max);
+
+#endif
+
+    i = 1;
+
+    if (mvc_flag[0])
+    {
+        mvcost [0] [0] = cost_mvcomponent(0, &mvc[0]);
+
+        do
+        {
+            //mvcost [0] [i] = cost_mvcomponent( i, &mvc[0]);
+            cost0 = cost_mvcomponent(i, &mvc[0]);
+
+            mvcost [0] [i] = cost0 + vp8_cost_zero(mvc[0].prob[MVPsign]);
+            mvcost [0] [-i] = cost0 + vp8_cost_one(mvc[0].prob[MVPsign]);
+        }
+        while (++i <= mv_max);
+    }
+
+    i = 1;
+
+    if (mvc_flag[1])
+    {
+        mvcost [1] [0] = cost_mvcomponent(0, &mvc[1]);
+
+        do
+        {
+            //mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
+            cost1 = cost_mvcomponent(i, &mvc[1]);
+
+            mvcost [1] [i] = cost1 + vp8_cost_zero(mvc[1].prob[MVPsign]);
+            mvcost [1] [-i] = cost1 + vp8_cost_one(mvc[1].prob[MVPsign]);
+        }
+        while (++i <= mv_max);
+    }
+
+    /*
+        i=-mv_max;
+        do
+        {
+            mvcost [0] [i] = cost_mvcomponent( i, mvc[0]);
+            mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
+        }
+        while( ++i <= mv_max);
+    */
+}
+
+
+// Motion vector probability table update depends on benefit.
+// Small correction allows for the fact that an update to an MV probability
+// may have benefit in subsequent frames as well as the current one.
+
+#define MV_PROB_UPDATE_CORRECTION   -1
+
+
+__inline static void calc_prob(vp8_prob *p, const unsigned int ct[2])
+{
+    const unsigned int tot = ct[0] + ct[1];
+
+    if (tot)
+    {
+        const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+        *p = x ? x : 1;
+    }
+}
+
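+// Signal a probability update only when the bits saved on this frame's
+// counts exceed the cost of the update itself (an update flag plus a 7-bit
+// literal), biased by MV_PROB_UPDATE_CORRECTION to credit future frames.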
+static void update(
+    vp8_writer *const w,
+    const unsigned int ct[2],
+    vp8_prob *const cur_p,
+    const vp8_prob new_p,
+    const vp8_prob update_p,
+    int *updated
+)
+{
+    const int cur_b = vp8_cost_branch(ct, *cur_p);
+    const int new_b = vp8_cost_branch(ct, new_p);
+    const int cost = 7 + MV_PROB_UPDATE_CORRECTION + ((vp8_cost_one(update_p) - vp8_cost_zero(update_p) + 128) >> 8);
+
+    if (cur_b - new_b > cost)
+    {
+        *cur_p = new_p;
+        vp8_write(w, 1, update_p);
+        vp8_write_literal(w, new_p >> 1, 7);
+        *updated = 1;
+
+    }
+    else
+        vp8_write(w, 0, update_p);
+}
+
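+// Convert the frame's histogram of coded MV values into branch counts for
+// each node of the coding tree (short/long, sign, short-vector tree, long-
+// vector bits), derive new probabilities, and conditionally update each one.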
+static void write_component_probs(
+    vp8_writer *const w,
+    struct mv_context *cur_mvc,
+    const struct mv_context *default_mvc_,
+    const struct mv_context *update_mvc,       
+    const unsigned int events [MVvals],
+    unsigned int rc,
+    int *updated
+)
+{
+    vp8_prob *Pcur = cur_mvc->prob;
+    const vp8_prob *default_mvc = default_mvc_->prob;
+    const vp8_prob *Pupdate = update_mvc->prob;
+    unsigned int is_short_ct[2], sign_ct[2];
+
+    unsigned int bit_ct [mvlong_width] [2];
+
+    unsigned int short_ct  [mvnum_short];
+    unsigned int short_bct [mvnum_short-1] [2];
+
+    vp8_prob Pnew [MVPcount];
+
+    (void) rc;
+    vp8_copy_array(Pnew, default_mvc, MVPcount);
+
+    vp8_zero(is_short_ct)
+    vp8_zero(sign_ct)
+    vp8_zero(bit_ct)
+    vp8_zero(short_ct)
+    vp8_zero(short_bct)
+
+
+    //j=0
+    {
+        int j = 0;
+
+        const int c = events [mv_max];
+
+        is_short_ct [0] += c;     // Short vector
+        short_ct [0] += c;       // Magnitude distribution
+    }
+
+    //j: 1 ~ mv_max (1023)
+    {
+        int j = 1;
+
+        do
+        {
+            const int c1 = events [mv_max + j];  //positive
+            const int c2 = events [mv_max - j];  //negative
+            const int c  = c1 + c2;
+            int a = j;
+
+            sign_ct [0] += c1;
+            sign_ct [1] += c2;
+
+            if (a < mvnum_short)
+            {
+                is_short_ct [0] += c;     // Short vector
+                short_ct [a] += c;       // Magnitude distribution
+            }
+            else
+            {
+                int k = mvlong_width - 1;
+                is_short_ct [1] += c;     // Long vector
+
+                /*  bit 3 not always encoded. */
+                do
+                    bit_ct [k] [(a >> k) & 1] += c;
+
+                while (--k >= 0);
+            }
+        }
+        while (++j <= mv_max);
+    }
+
+    /*
+    {
+        int j = -mv_max;
+        do
+        {
+
+            const int c = events [mv_max + j];
+            int a = j;
+
+            if( j < 0)
+            {
+                sign_ct [1] += c;
+                a = -j;
+            }
+            else if( j)
+                sign_ct [0] += c;
+
+            if( a < mvnum_short)
+            {
+                is_short_ct [0] += c;     // Short vector
+                short_ct [a] += c;       // Magnitude distribution
+            }
+            else
+            {
+                int k = mvlong_width - 1;
+                is_short_ct [1] += c;     // Long vector
+
+                //  bit 3 not always encoded.
+
+                do
+                    bit_ct [k] [(a >> k) & 1] += c;
+                while( --k >= 0);
+            }
+        } while( ++j <= mv_max);
+    }
+    */
+
+    calc_prob(Pnew + mvpis_short, is_short_ct);
+
+    calc_prob(Pnew + MVPsign, sign_ct);
+
+    {
+        vp8_prob p [mvnum_short - 1];    /* actually only need branch ct */
+        int j = 0;
+
+        vp8_tree_probs_from_distribution(
+            8, vp8_small_mvencodings, vp8_small_mvtree,
+            p, short_bct, short_ct,
+            256, 1
+        );
+
+        do
+            calc_prob(Pnew + MVPshort + j, short_bct[j]);
+
+        while (++j < mvnum_short - 1);
+    }
+
+    {
+        int j = 0;
+
+        do
+            calc_prob(Pnew + MVPbits + j, bit_ct[j]);
+
+        while (++j < mvlong_width);
+    }
+
+    update(w, is_short_ct, Pcur + mvpis_short, Pnew[mvpis_short], *Pupdate++, updated);
+
+    update(w, sign_ct, Pcur + MVPsign, Pnew[MVPsign], *Pupdate++, updated);
+
+    {
+        const vp8_prob *const new_p = Pnew + MVPshort;
+        vp8_prob *const cur_p = Pcur + MVPshort;
+
+        int j = 0;
+
+        do
+
+            update(w, short_bct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+
+        while (++j < mvnum_short - 1);
+    }
+
+    {
+        const vp8_prob *const new_p = Pnew + MVPbits;
+        vp8_prob *const cur_p = Pcur + MVPbits;
+
+        int j = 0;
+
+        do
+
+            update(w, bit_ct[j], cur_p + j, new_p[j], *Pupdate++, updated);
+
+        while (++j < mvlong_width);
+    }
+}
+
+void vp8_write_mvprobs(VP8_COMP *cpi)
+{
+    vp8_writer *const w  = & cpi->bc;
+    MV_CONTEXT *mvc = cpi->common.fc.mvc;
+    int flags[2] = {0, 0};
+#ifdef ENTROPY_STATS
+    active_section = 4;
+#endif
+    write_component_probs(
+        w, &mvc[0], &vp8_default_mv_context[0], &vp8_mv_update_probs[0], cpi->MVcount[0], 0, &flags[0]
+    );
+    write_component_probs(
+        w, &mvc[1], &vp8_default_mv_context[1], &vp8_mv_update_probs[1], cpi->MVcount[1], 1, &flags[1]
+    );
+
+    if (flags[0] || flags[1])
+        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+
+#ifdef ENTROPY_STATS
+    active_section = 5;
+#endif
+}
diff --git a/vp8/encoder/encodemv.h b/vp8/encoder/encodemv.h
new file mode 100644
index 0000000..1c1f450
--- /dev/null
+++ b/vp8/encoder/encodemv.h
@@ -0,0 +1,20 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_ENCODEMV_H
+#define __INC_ENCODEMV_H
+
+#include "onyx_int.h"
+
+void vp8_write_mvprobs(VP8_COMP *);
+void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
+void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+
+#endif
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
new file mode 100644
index 0000000..a0b50d2
--- /dev/null
+++ b/vp8/encoder/ethreading.c
@@ -0,0 +1,510 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyx_int.h"
+#include "threading.h"
+#include "common.h"
+#include "extend.h"
+
+
+extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
+extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+
+static
+THREAD_FUNCTION thread_encoding_proc(void *p_data)
+{
+#if CONFIG_MULTITHREAD
+    int ithread = ((ENCODETHREAD_DATA *)p_data)->ithread;
+    VP8_COMP *cpi   = (VP8_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr1);
+    MB_ROW_COMP *mbri = (MB_ROW_COMP *)(((ENCODETHREAD_DATA *)p_data)->ptr2);
+    ENTROPY_CONTEXT mb_row_left_context[4][4];
+
+    //printf("Started thread %d\n", ithread);
+
+    while (1)
+    {
+        if (cpi->b_multi_threaded == 0)
+            break;
+
+        //if(WaitForSingleObject(cpi->h_event_mbrencoding[ithread], INFINITE) == WAIT_OBJECT_0)
+        if (sem_wait(&cpi->h_event_mbrencoding[ithread]) == 0)
+        {
+            if (cpi->b_multi_threaded == FALSE) // we're shutting down
+                break;
+            else
+            {
+                VP8_COMMON *cm      = &cpi->common;
+                int mb_row           = mbri->mb_row;
+                MACROBLOCK  *x      = &mbri->mb;
+                MACROBLOCKD *xd     = &x->e_mbd;
+                TOKENEXTRA **tp     = &mbri->tp;
+                int *segment_counts  = mbri->segment_counts;
+                int *totalrate      = &mbri->totalrate;
+
+                {
+                    int i;
+                    int recon_yoffset, recon_uvoffset;
+                    int mb_col;
+                    int recon_y_stride = cm->last_frame.y_stride;
+                    int recon_uv_stride = cm->last_frame.uv_stride;
+                    volatile int *last_row_current_mb_col;
+
+                    if (ithread > 0)
+                        last_row_current_mb_col = &cpi->mb_row_ei[ithread-1].current_mb_col;
+                    else
+                        last_row_current_mb_col = &cpi->current_mb_col_main;
+
+                    // reset above block coeffs
+                    xd->above_context[Y1CONTEXT] = cm->above_context[Y1CONTEXT];
+                    xd->above_context[UCONTEXT ] = cm->above_context[UCONTEXT ];
+                    xd->above_context[VCONTEXT ] = cm->above_context[VCONTEXT ];
+                    xd->above_context[Y2CONTEXT] = cm->above_context[Y2CONTEXT];
+                    xd->left_context = mb_row_left_context;
+
+                    vp8_zero(mb_row_left_context);
+
+                    xd->up_available = (mb_row != 0);
+                    recon_yoffset = (mb_row * recon_y_stride * 16);
+                    recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+
+                    cpi->tplist[mb_row].start = *tp;
+
+                    //printf("Thread mb_row = %d\n", mb_row);
+
+                    // for each macroblock col in image
+                    for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+                    {
+                        int seg_map_index = (mb_row * cm->mb_cols);
+
+                        while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != cm->mb_cols - 1)
+                        {
+                            x86_pause_hint();
+                            thread_sleep(0);
+                        }
+
+                        // Distance of Mb to the various image edges.
+                        // These are specified to 1/8th pel precision as they are always compared to values that are in 1/8th pel units
+                        xd->mb_to_left_edge = -((mb_col * 16) << 3);
+                        xd->mb_to_right_edge = ((cm->mb_cols - 1 - mb_col) * 16) << 3;
+                        xd->mb_to_top_edge = -((mb_row * 16) << 3);
+                        xd->mb_to_bottom_edge = ((cm->mb_rows - 1 - mb_row) * 16) << 3;
+
+                        // Set up limit values for motion vectors used to prevent them extending outside the UMV borders
+                        x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+                        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+                        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+                        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+                        xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset;
+                        xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset;
+                        xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset;
+                        xd->left_available = (mb_col != 0);
+
+                        // Is segmentation enabled?
+                        // MB level adjustment to quantizer
+                        if (xd->segmentation_enabled)
+                        {
+                            // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
+                            if (cpi->segmentation_map[seg_map_index+mb_col] <= 3)
+                                xd->mbmi.segment_id = cpi->segmentation_map[seg_map_index+mb_col];
+                            else
+                                xd->mbmi.segment_id = 0;
+
+                            vp8cx_mb_init_quantizer(cpi, x);
+                        }
+                        else
+                            xd->mbmi.segment_id = 0;         // Set to Segment 0 by default
+
+
+                        if (cm->frame_type == KEY_FRAME)
+                        {
+                            *totalrate += vp8cx_encode_intra_macro_block(cpi, x, tp);
+#ifdef MODE_STATS
+                            y_modes[xd->mbmi.mode] ++;
+#endif
+                        }
+                        else
+                        {
+                            *totalrate += vp8cx_encode_inter_macroblock(cpi, x, tp, recon_yoffset, recon_uvoffset);
+
+#ifdef MODE_STATS
+                            inter_y_modes[xd->mbmi.mode] ++;
+
+                            if (xd->mbmi.mode == SPLITMV)
+                            {
+                                int b;
+
+                                for (b = 0; b < xd->mbmi.partition_count; b++)
+                                {
+                                    inter_b_modes[xd->mbmi.partition_bmi[b].mode] ++;
+                                }
+                            }
+
+#endif
+
+                            // Count of last ref frame 0,0 usage
+                            if ((xd->mbmi.mode == ZEROMV) && (xd->mbmi.ref_frame == LAST_FRAME))
+                                cpi->inter_zz_count ++;
+
+                        }
+
+                        cpi->tplist[mb_row].stop = *tp;
+
+                        xd->gf_active_ptr++;      // Increment pointer into gf usage flags structure for next mb
+
+                        // store macroblock mode info into context array
+                        vpx_memcpy(&xd->mode_info_context->mbmi, &xd->mbmi, sizeof(xd->mbmi));
+
+                        for (i = 0; i < 16; i++)
+                            vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+
+                        // adjust to the next column of macroblocks
+                        x->src.y_buffer += 16;
+                        x->src.u_buffer += 8;
+                        x->src.v_buffer += 8;
+
+                        recon_yoffset += 16;
+                        recon_uvoffset += 8;
+
+                        // Keep track of segment usage
+                        segment_counts[xd->mbmi.segment_id] ++;
+
+                        // skip to next mb
+                        xd->mode_info_context++;
+
+                        xd->above_context[Y1CONTEXT] += 4;
+                        xd->above_context[UCONTEXT ] += 2;
+                        xd->above_context[VCONTEXT ] += 2;
+                        xd->above_context[Y2CONTEXT] ++;
+
+                        cpi->mb_row_ei[ithread].current_mb_col = mb_col;
+
+                    }
+
+                    //extend the recon for intra prediction
+                    vp8_extend_mb_row(
+                        &cm->new_frame,
+                        xd->dst.y_buffer + 16,
+                        xd->dst.u_buffer + 8,
+                        xd->dst.v_buffer + 8);
+
+                    // this is to account for the border
+                    xd->mode_info_context++;
+
+                    x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
+                    x->src.u_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+                    x->src.v_buffer +=  8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
+
+                    xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
+
+                    if (ithread == (cpi->encoding_thread_count - 1) || mb_row == cm->mb_rows - 1)
+                    {
+                        //SetEvent(cpi->h_event_main);
+                        sem_post(&cpi->h_event_main);
+                    }
+
+                }
+
+            }
+        }
+    }
+
+#else
+    (void) p_data;
+#endif
+
+    //printf("exit thread %d\n", ithread);
+    return 0;
+}
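+
+// A minimal standalone sketch of the row-stagger synchronization used above
+// (illustrative only; wait_for_row_above is a hypothetical name, not part of
+// this file). A worker encoding row N spins until the thread on row N-1 has
+// advanced at least one macroblock column past it, which guarantees that the
+// above-context it is about to read has already been written.
+#if 0
+static void wait_for_row_above(volatile const int *above_col, int mb_col, int last_col)
+{
+    // Block until the row above has finished the MB diagonally up-right of
+    // us, or has completed the whole row (above_col == last_col).
+    while (mb_col > (*above_col - 1) && *above_col != last_col)
+    {
+        x86_pause_hint();   // tell the CPU this is a spin-wait loop
+        thread_sleep(0);    // yield the rest of this timeslice
+    }
+}
+#endif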
+
+static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
+{
+
+    MACROBLOCK *x = mbsrc;
+    MACROBLOCK *z = mbdst;
+    int i;
+
+    z->ss               = x->ss;
+    z->ss_count          = x->ss_count;
+    z->searches_per_step  = x->searches_per_step;
+    z->errorperbit      = x->errorperbit;
+
+    z->sadperbit16      = x->sadperbit16;
+    z->sadperbit4       = x->sadperbit4;
+    z->errthresh        = x->errthresh;
+    z->rddiv            = x->rddiv;
+    z->rdmult           = x->rdmult;
+
+    /*
+    z->mv_col_min    = x->mv_col_min;
+    z->mv_col_max    = x->mv_col_max;
+    z->mv_row_min    = x->mv_row_min;
+    z->mv_row_max    = x->mv_row_max;
+    z->vector_range = x->vector_range ;
+    */
+
+    z->vp8_short_fdct4x4     = x->vp8_short_fdct4x4;
+    z->vp8_short_fdct8x4     = x->vp8_short_fdct8x4;
+    z->short_fdct4x4rd   = x->short_fdct4x4rd;
+    z->short_fdct8x4rd   = x->short_fdct8x4rd;
+    z->vp8_short_fdct4x4_ptr = x->vp8_short_fdct4x4_ptr;
+    z->short_walsh4x4    = x->short_walsh4x4;
+    z->quantize_b        = x->quantize_b;
+    z->quantize_brd      = x->quantize_brd;
+
+    /*
+    z->mvc              = x->mvc;
+    z->src.y_buffer      = x->src.y_buffer;
+    z->src.u_buffer      = x->src.u_buffer;
+    z->src.v_buffer      = x->src.v_buffer;
+    */
+
+
+    vpx_memcpy(z->mvcosts,          x->mvcosts,         sizeof(x->mvcosts));
+    z->mvcost[0] = &z->mvcosts[0][mv_max+1];
+    z->mvcost[1] = &z->mvcosts[1][mv_max+1];
+    z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];
+    z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];
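+    // Note: the mvcosts tables are indexed from -mv_max .. +mv_max, so after
+    // copying the storage the mvcost/mvsadcost pointers must be re-centred on
+    // the destination arrays; copying the pointers themselves would leave
+    // this thread aliasing the source thread's tables.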
+
+
+    vpx_memcpy(z->token_costs,       x->token_costs,      sizeof(x->token_costs));
+    vpx_memcpy(z->inter_bmode_costs,  x->inter_bmode_costs, sizeof(x->inter_bmode_costs));
+    //memcpy(z->mvcosts,            x->mvcosts,         sizeof(x->mvcosts));
+    //memcpy(z->mvcost,         x->mvcost,          sizeof(x->mvcost));
+    vpx_memcpy(z->mbmode_cost,       x->mbmode_cost,      sizeof(x->mbmode_cost));
+    vpx_memcpy(z->intra_uv_mode_cost,  x->intra_uv_mode_cost, sizeof(x->intra_uv_mode_cost));
+    vpx_memcpy(z->bmode_costs,       x->bmode_costs,      sizeof(x->bmode_costs));
+
+    for (i = 0; i < 25; i++)
+    {
+        z->block[i].quant           = x->block[i].quant;
+        z->block[i].zbin            = x->block[i].zbin;
+        z->block[i].zrun_zbin_boost   = x->block[i].zrun_zbin_boost;
+        z->block[i].round           = x->block[i].round;
+        /*
+        z->block[i].src             = x->block[i].src;
+        */
+        z->block[i].src_stride       = x->block[i].src_stride;
+        z->block[i].force_empty      = x->block[i].force_empty;
+
+    }
+
+    {
+        MACROBLOCKD *xd = &x->e_mbd;
+        MACROBLOCKD *zd = &z->e_mbd;
+
+        /*
+        zd->mode_info_context = xd->mode_info_context;
+        zd->mode_info        = xd->mode_info;
+
+        zd->mode_info_stride  = xd->mode_info_stride;
+        zd->frame_type       = xd->frame_type;
+        zd->up_available     = xd->up_available   ;
+        zd->left_available   = xd->left_available;
+        zd->left_context     = xd->left_context;
+        zd->last_frame_dc     = xd->last_frame_dc;
+        zd->last_frame_dccons = xd->last_frame_dccons;
+        zd->gold_frame_dc     = xd->gold_frame_dc;
+        zd->gold_frame_dccons = xd->gold_frame_dccons;
+        zd->mb_to_left_edge    = xd->mb_to_left_edge;
+        zd->mb_to_right_edge   = xd->mb_to_right_edge;
+        zd->mb_to_top_edge     = xd->mb_to_top_edge   ;
+        zd->mb_to_bottom_edge  = xd->mb_to_bottom_edge;
+        zd->gf_active_ptr     = xd->gf_active_ptr;
+        zd->frames_since_golden       = xd->frames_since_golden;
+        zd->frames_till_alt_ref_frame   = xd->frames_till_alt_ref_frame;
+        */
+        zd->subpixel_predict         = xd->subpixel_predict;
+        zd->subpixel_predict8x4      = xd->subpixel_predict8x4;
+        zd->subpixel_predict8x8      = xd->subpixel_predict8x8;
+        zd->subpixel_predict16x16    = xd->subpixel_predict16x16;
+        zd->segmentation_enabled     = xd->segmentation_enabled;
+        zd->mb_segement_abs_delta      = xd->mb_segement_abs_delta;
+        vpx_memcpy(zd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
+
+        /*
+        memcpy(zd->above_context,        xd->above_context, sizeof(xd->above_context));
+        memcpy(zd->mb_segment_tree_probs,  xd->mb_segment_tree_probs, sizeof(xd->mb_segment_tree_probs));
+        memcpy(zd->segment_feature_data,  xd->segment_feature_data, sizeof(xd->segment_feature_data));
+        */
+        for (i = 0; i < 25; i++)
+        {
+            zd->block[i].dequant = xd->block[i].dequant;
+        }
+    }
+}
+
+
+void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
+                               MACROBLOCK *x,
+                               MB_ROW_COMP *mbr_ei,
+                               int mb_row,
+                               int count
+                              )
+{
+
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+    int i;
+    (void) mb_row;
+
+    for (i = 0; i < count; i++)
+    {
+        MACROBLOCK *mb = & mbr_ei[i].mb;
+        MACROBLOCKD *mbd = &mb->e_mbd;
+
+        mbd->subpixel_predict        = xd->subpixel_predict;
+        mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
+        mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
+        mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
+#if CONFIG_RUNTIME_CPU_DETECT
+        mbd->rtcd                   = xd->rtcd;
+#endif
+        mbd->gf_active_ptr            = xd->gf_active_ptr;
+
+        mb->vector_range             = 32;
+
+        vpx_memset(mbr_ei[i].segment_counts, 0, sizeof(mbr_ei[i].segment_counts));
+        mbr_ei[i].totalrate = 0;
+
+        mbd->mode_info        = cm->mi - 1;
+        mbd->mode_info_context = cm->mi   + x->e_mbd.mode_info_stride * (i + 1);
+        mbd->mode_info_stride  = cm->mode_info_stride;
+
+        mbd->frame_type = cm->frame_type;
+
+        mbd->frames_since_golden = cm->frames_since_golden;
+        mbd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+
+        mb->src = * cpi->Source;
+        mbd->pre = cm->last_frame;
+        mbd->dst = cm->new_frame;
+
+        mb->src.y_buffer += 16 * x->src.y_stride * (i + 1);
+        mb->src.u_buffer +=  8 * x->src.uv_stride * (i + 1);
+        mb->src.v_buffer +=  8 * x->src.uv_stride * (i + 1);
+
+
+        vp8_build_block_offsets(mb);
+
+        vp8_setup_block_dptrs(mbd);
+
+        vp8_setup_block_ptrs(mb);
+
+        mb->rddiv = cpi->RDDIV;
+        mb->rdmult = cpi->RDMULT;
+
+        mbd->mbmi.mode = DC_PRED;
+        mbd->mbmi.uv_mode = DC_PRED;
+
+        mbd->left_context = cm->left_context;
+        mb->mvc = cm->fc.mvc;
+
+        setup_mbby_copy(&mbr_ei[i].mb, x);
+
+    }
+}
+
+
+void vp8cx_create_encoder_threads(VP8_COMP *cpi)
+{
+    cpi->b_multi_threaded = 0;
+
+    cpi->processor_core_count = 32; //vp8_get_proc_core_count();
+
+    CHECK_MEM_ERROR(cpi->tplist, vpx_malloc(sizeof(TOKENLIST) * cpi->common.mb_rows));
+
+#if CONFIG_MULTITHREAD
+
+    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+    {
+        int ithread;
+
+        if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
+            cpi->encoding_thread_count = cpi->processor_core_count - 1;
+        else
+            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;
+
+
+        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
+        CHECK_MEM_ERROR(cpi->h_event_mbrencoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
+        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
+        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
+        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
+        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
+        sem_init(&cpi->h_event_main, 0, 0);
+
+        cpi->b_multi_threaded = 1;
+
+        //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
+
+        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
+        {
+            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
+            sem_init(&cpi->h_event_mbrencoding[ithread], 0, 0);
+            cpi->en_thread_data[ithread].ithread = ithread;
+            cpi->en_thread_data[ithread].ptr1 = (void *)cpi;
+            cpi->en_thread_data[ithread].ptr2 = (void *)&cpi->mb_row_ei[ithread];
+
+            //printf(" call begin thread %d \n", ithread);
+
+            //cpi->h_encoding_thread[ithread] =   (HANDLE)_beginthreadex(
+            //  NULL,           // security
+            //  0,              // stksize
+            //  thread_encoding_proc,
+            //  (&cpi->en_thread_data[ithread]),          // Thread data
+            //  0,
+            //  NULL);
+
+            pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, (&cpi->en_thread_data[ithread]));
+
+        }
+
+    }
+
+#endif
+}
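+
+// Worked example of the thread-count logic above: with oxcf.multi_threaded = 4
+// requested on a machine reported as having 32 cores, encoding_thread_count
+// becomes 4 - 1 = 3, i.e. the main thread encodes one macroblock row while
+// three worker threads encode the rows below it in staggered fashion.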
+
+void vp8cx_remove_encoder_threads(VP8_COMP *cpi)
+{
+#if CONFIG_MULTITHREAD
+
+    if (cpi->b_multi_threaded)
+    {
+        //shutdown other threads
+        cpi->b_multi_threaded = 0;
+        {
+            int i;
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+            {
+                //SetEvent(cpi->h_event_mbrencoding[i]);
+                sem_post(&cpi->h_event_mbrencoding[i]);
+                pthread_join(cpi->h_encoding_thread[i], 0);
+            }
+
+            for (i = 0; i < cpi->encoding_thread_count; i++)
+                sem_destroy(&cpi->h_event_mbrencoding[i]);
+        }
+        //free thread related resources
+        vpx_free(cpi->h_event_mbrencoding);
+        vpx_free(cpi->h_encoding_thread);
+        vpx_free(cpi->mb_row_ei);
+        vpx_free(cpi->en_thread_data);
+    }
+
+#endif
+    vpx_free(cpi->tplist);
+}
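+
+// A minimal standalone sketch of the shutdown handshake used above
+// (illustrative only; the names below are hypothetical). The running flag is
+// cleared first, then each worker's semaphore is posted so that a thread
+// blocked in sem_wait() wakes, observes the cleared flag and exits its loop,
+// at which point pthread_join() can reap it.
+#if 0
+#include <pthread.h>
+#include <semaphore.h>
+
+static volatile int keep_running = 1;
+static sem_t work_ready;
+
+static void *worker(void *unused)
+{
+    (void)unused;
+
+    while (keep_running)
+    {
+        sem_wait(&work_ready);   // blocks until work is posted...
+
+        if (!keep_running)       // ...or until shutdown is signalled
+            break;
+
+        // encode one macroblock row here
+    }
+
+    return 0;
+}
+
+static void shut_down(pthread_t tid)
+{
+    keep_running = 0;            // clear the flag before waking the worker
+    sem_post(&work_ready);       // wake it so it can observe the flag
+    pthread_join(tid, 0);        // then reap the thread
+    sem_destroy(&work_ready);
+}
+#endif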
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
new file mode 100644
index 0000000..c519080
--- /dev/null
+++ b/vp8/encoder/firstpass.c
@@ -0,0 +1,2512 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "math.h"
+#include "limits.h"
+#include "block.h"
+#include "onyx_int.h"
+#include "variance.h"
+#include "encodeintra.h"
+#include "setupintrarecon.h"
+#include "mcomp.h"
+#include "vpx_scale/vpxscale.h"
+#include "encodemb.h"
+#include "extend.h"
+#include "systemdependent.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_mem/vpx_mem.h"
+#include "swapyv12buffer.h"
+#include <stdio.h>
+#include "rdopt.h"
+#include "quant_common.h"
+#include "encodemv.h"
+
+//#define OUTPUT_FPF 1
+//#define FIRSTPASS_MM 1
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void vp8_build_block_offsets(MACROBLOCK *x);
+extern void vp8_setup_block_ptrs(MACROBLOCK *x);
+extern void vp8cx_frame_init_quantizer(VP8_COMP *cpi);
+extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
+extern void vp8_alloc_compressor_data(VP8_COMP *cpi);
+
+//#define GFQ_ADJUSTMENT (40 + ((15*Q)/10))
+//#define GFQ_ADJUSTMENT (80 + ((15*Q)/10))
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+extern int vp8_kf_boost_qadjustment[QINDEX_RANGE];
+
+extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
+
+#define IIFACTOR   1.4
+#define IIKFACTOR1 1.40
+#define IIKFACTOR2 1.5
+#define RMAX    14.0
+#define GF_RMAX 48.0        // 128.0
+
+#define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
+
+#define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
+#define POW2 (double)cpi->oxcf.two_pass_vbrbias/100.0
+
+static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
+static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
+
+
+void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
+int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
+
+int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+{
+
+    int i;
+    int intra_pred_var = 0;
+    (void) cpi;
+
+    if (use_dc_pred)
+    {
+        x->e_mbd.mbmi.mode = DC_PRED;
+        x->e_mbd.mbmi.uv_mode = DC_PRED;
+        x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+
+        vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+    }
+    else
+    {
+        for (i = 0; i < 16; i++)
+        {
+            BLOCKD *b = &x->e_mbd.block[i];
+            BLOCK  *be = &x->block[i];
+
+            vp8_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, be, b, B_DC_PRED);
+        }
+    }
+
+    intra_pred_var = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);
+
+    return intra_pred_var;
+}
+
+// Resets the first pass "file" to the given position; the stats are held in memory, so this is just a pointer assignment
+static void reset_fpf_position(VP8_COMP *cpi, FIRSTPASS_STATS *Position)
+{
+    cpi->stats_in = Position;
+}
+
+static int lookup_next_frame_stats(VP8_COMP *cpi, FIRSTPASS_STATS *next_frame)
+{
+    /*FIRSTPASS_STATS * start_pos;
+    int ret_val;
+
+    start_pos = cpi->stats_in;
+    ret_val = vp8_input_stats(cpi, next_frame);
+    reset_fpf_position(cpi, start_pos);
+
+    return ret_val;*/
+
+    if (cpi->stats_in >= cpi->stats_in_end)
+        return EOF;
+
+    *next_frame = *cpi->stats_in;
+    return 1;
+}
+
+// Calculate a modified Error used in distributing bits between easier and harder frames
+static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    double av_err = cpi->total_stats.ssim_weighted_pred_err;
+    double this_err = this_frame->ssim_weighted_pred_err;
+    double modified_err;
+
+    //double relative_next_iiratio;
+    //double next_iiratio;
+    //double sum_iiratio;
+    //int i;
+
+    //FIRSTPASS_STATS next_frame;
+    //FIRSTPASS_STATS *start_pos;
+
+    /*start_pos = cpi->stats_in;
+    sum_iiratio = 0.0;
+    i = 0;
+    while ( (i < 1) && vp8_input_stats(cpi,&next_frame) != EOF )
+    {
+
+        next_iiratio = next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error);
+        next_iiratio = ( next_iiratio < 1.0 ) ? 1.0 : (next_iiratio > 20.0) ? 20.0 : next_iiratio;
+        sum_iiratio += next_iiratio;
+        i++;
+    }
+    if ( i > 0 )
+    {
+        relative_next_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK(cpi->avg_iiratio * (double)i);
+    }
+    else
+    {
+        relative_next_iiratio = 1.0;
+    }
+    reset_fpf_position(cpi, start_pos);*/
+
+    if (this_err > av_err)
+        modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW1);
+    else
+        modified_err = av_err * pow((this_err / DOUBLE_DIVIDE_CHECK(av_err)), POW2);
+
+    /*
+    relative_next_iiratio = pow(relative_next_iiratio,0.25);
+    modified_err = modified_err * relative_next_iiratio;
+    */
+
+    return modified_err;
+}
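+
+// Worked example of the bias above: with two_pass_vbrbias = 50 the exponents
+// POW1 and POW2 are both 0.5, so a frame with four times the average error
+// (this_err = 400.0, av_err = 100.0) gets modified_err = 100 * 4^0.5 = 200
+// rather than 400. Errors are pulled toward the average, with the strength
+// of the pull controlled by the user's bias setting.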
+
+double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
+{
+    int i, j;
+    int Total = 0;
+
+    unsigned char *src = source->y_buffer;
+    unsigned char value;
+    double sum_weights = 0.0;
+    double Weight;
+
+    // Loop through the raw Y plane, examining levels and creating a weight for the image
+    for (i = 0; i < source->y_height; i++)
+    {
+        for (j = 0; j < source->y_width; j++)
+        {
+            value = src[j];
+
+            if (value >= 64)
+                Weight = 1.0;
+            else if (value > 32)
+                Weight = (value - 32.0f) / 32.0f;
+            else
+                Weight = 0.02;
+
+            sum_weights += Weight;
+        }
+
+        src += source->y_stride;
+    }
+
+    sum_weights /= (source->y_height * source->y_width);
+
+    return sum_weights;
+}
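+
+// Worked example of the weighting above: pixels at or above 64 weigh 1.0, a
+// pixel value of 40 weighs (40 - 32) / 32 = 0.25, and anything at 32 or below
+// weighs the 0.02 floor, so the coded error of dark frames is discounted
+// when forming ssim_weighted_pred_err later on.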
+
+// This function returns the current per frame maximum bitrate target
+int frame_max_bits(VP8_COMP *cpi)
+{
+    // Max allocation for a single frame based on the max section guidelines passed in and how many bits are left
+    int max_bits;
+
+    // For CBR we need to also consider buffer fullness.
+    // If we are running below the optimal level then we need to gradually tighten up on max_bits.
+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+    {
+        double buffer_fullness_ratio = (double)DOUBLE_DIVIDE_CHECK(cpi->buffer_level) / (double)cpi->oxcf.optimal_buffer_level;
+
+        // For CBR base this on the target average bits per frame plus the maximum section rate passed in by the user
+        max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+
+        // If our buffer is below the optimum level
+        if (buffer_fullness_ratio < 1.0)
+        {
+            // The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4.
+            int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
+
+            max_bits = (int)(max_bits * buffer_fullness_ratio);
+
+            if (max_bits < min_max_bits)
+                max_bits = min_max_bits;       // Lowest value we will set ... which should allow the buffer to refill.
+        }
+    }
+    // VBR
+    else
+    {
+        // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
+        max_bits = (int)(((double)cpi->bits_left / (cpi->total_stats.count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+    }
+
+    // Trap case where we are out of bits
+    if (max_bits < 0)
+        max_bits = 0;
+
+    return max_bits;
+}
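+
+// Worked example of the CBR path above (hypothetical numbers): with
+// av_per_frame_bandwidth = 10000 bits and two_pass_vbrmax_section = 400,
+// max_bits starts at 40000. A buffer only 30% full scales that to 12000;
+// were the buffer at 5%, the scaled value (2000) would be raised back up to
+// the floor of min(40000, 10000) / 4 = 2500.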
+
+void vp8_output_stats(struct vpx_codec_pkt_list *pktlist,
+                      FIRSTPASS_STATS            *stats)
+{
+    struct vpx_codec_cx_pkt pkt;
+    pkt.kind = VPX_CODEC_STATS_PKT;
+    pkt.data.twopass_stats.buf = stats;
+    pkt.data.twopass_stats.sz = sizeof(*stats);
+    vpx_codec_pkt_list_add(pktlist, &pkt);
+
+// TEMP debug code
+#ifdef OUTPUT_FPF
+    {
+        FILE *fpfile;
+        fpfile = fopen("firstpass.stt", "a");
+
+        fprintf(fpfile, "%12.0f %12.0f %12.0f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.4f %12.0f\n",
+                stats->frame,
+                stats->intra_error,
+                stats->coded_error,
+                stats->ssim_weighted_pred_err,
+                stats->pcnt_inter,
+                stats->pcnt_motion,
+                stats->pcnt_second_ref,
+                stats->MVr,
+                stats->mvr_abs,
+                stats->MVc,
+                stats->mvc_abs,
+                stats->MVrv,
+                stats->MVcv,
+                stats->mv_in_out_count,
+                stats->count);
+        fclose(fpfile);
+    }
+#endif
+}
+
+int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps)
+{
+    if (cpi->stats_in >= cpi->stats_in_end)
+        return EOF;
+
+    *fps = *cpi->stats_in++;
+    return 1;
+}
+
+void vp8_zero_stats(FIRSTPASS_STATS *section)
+{
+    section->frame      = 0.0;
+    section->intra_error = 0.0;
+    section->coded_error = 0.0;
+    section->ssim_weighted_pred_err = 0.0;
+    section->pcnt_inter  = 0.0;
+    section->pcnt_motion  = 0.0;
+    section->pcnt_second_ref = 0.0;
+    section->MVr        = 0.0;
+    section->mvr_abs     = 0.0;
+    section->MVc        = 0.0;
+    section->mvc_abs     = 0.0;
+    section->MVrv       = 0.0;
+    section->MVcv       = 0.0;
+    section->mv_in_out_count  = 0.0;
+    section->count      = 0.0;
+    section->duration   = 1.0;
+}
+void vp8_accumulate_stats(FIRSTPASS_STATS *section, FIRSTPASS_STATS *frame)
+{
+    section->frame += frame->frame;
+    section->intra_error += frame->intra_error;
+    section->coded_error += frame->coded_error;
+    section->ssim_weighted_pred_err += frame->ssim_weighted_pred_err;
+    section->pcnt_inter  += frame->pcnt_inter;
+    section->pcnt_motion += frame->pcnt_motion;
+    section->pcnt_second_ref += frame->pcnt_second_ref;
+    section->MVr        += frame->MVr;
+    section->mvr_abs     += frame->mvr_abs;
+    section->MVc        += frame->MVc;
+    section->mvc_abs     += frame->mvc_abs;
+    section->MVrv       += frame->MVrv;
+    section->MVcv       += frame->MVcv;
+    section->mv_in_out_count  += frame->mv_in_out_count;
+    section->count      += frame->count;
+    section->duration   += frame->duration;
+}
+void vp8_avg_stats(FIRSTPASS_STATS *section)
+{
+    if (section->count < 1.0)
+        return;
+
+    section->intra_error /= section->count;
+    section->coded_error /= section->count;
+    section->ssim_weighted_pred_err /= section->count;
+    section->pcnt_inter  /= section->count;
+    section->pcnt_second_ref /= section->count;
+    section->pcnt_motion /= section->count;
+    section->MVr        /= section->count;
+    section->mvr_abs     /= section->count;
+    section->MVc        /= section->count;
+    section->mvc_abs     /= section->count;
+    section->MVrv       /= section->count;
+    section->MVcv       /= section->count;
+    section->mv_in_out_count   /= section->count;
+    section->duration   /= section->count;
+}
+
+int vp8_fpmm_get_pos(VP8_COMP *cpi)
+{
+    return ftell(cpi->fp_motion_mapfile);
+}
+void vp8_fpmm_reset_pos(VP8_COMP *cpi, int target_pos)
+{
+    int Offset;
+
+    if (cpi->fp_motion_mapfile)
+    {
+        Offset = ftell(cpi->fp_motion_mapfile) - target_pos;
+        fseek(cpi->fp_motion_mapfile, (int)-Offset, SEEK_CUR);
+    }
+}
+
+void vp8_advance_fpmm(VP8_COMP *cpi, int count)
+{
+#ifdef FIRSTPASS_MM
+    fseek(cpi->fp_motion_mapfile, (int)(count * cpi->common.MBs), SEEK_CUR);
+#endif
+}
+
+void vp8_input_fpmm(VP8_COMP *cpi, int count)
+{
+#ifdef FIRSTPASS_MM
+
+    unsigned char *tmp_motion_map;
+    int i, j;
+
+    if (!cpi->fp_motion_mapfile)
+        return;                 // Error
+
+    // Create the first pass motion map structure and set to 0
+    CHECK_MEM_ERROR(tmp_motion_map, vpx_calloc(cpi->common.MBs, 1));
+
+    // Reset the state of the global map
+    vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
+
+    // Read the specified number of frame maps and set the global map to the highest value seen for each mb.
+    for (i = 0; i < count; i++)
+    {
+        if (fread(tmp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile) == cpi->common.MBs)
+        {
+            for (j = 0; j < cpi->common.MBs; j++)
+            {
+                if (tmp_motion_map[j] > 1)
+                    cpi->fp_motion_map[j] += 5;   // Intra is flagged
+                else
+                    cpi->fp_motion_map[j] += tmp_motion_map[j];
+            }
+        }
+        else
+            break;  // Read error
+
+    }
+
+    if (tmp_motion_map != 0)
+        vpx_free(tmp_motion_map);
+
+#endif
+
+}
+
+void vp8_init_first_pass(VP8_COMP *cpi)
+{
+    vp8_zero_stats(&cpi->total_stats);
+
+#ifdef FIRSTPASS_MM
+    cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "wb");
+#endif
+
+// TEMP debug code
+#ifdef OUTPUT_FPF
+    {
+        FILE *fpfile;
+        fpfile = fopen("firstpass.stt", "w");
+        fclose(fpfile);
+    }
+#endif
+
+}
+
+void vp8_end_first_pass(VP8_COMP *cpi)
+{
+    vp8_output_stats(cpi->output_pkt_list, &cpi->total_stats);
+
+#ifdef FIRSTPASS_MM
+
+    if (cpi->fp_motion_mapfile)
+        fclose(cpi->fp_motion_mapfile);
+
+#endif
+
+}
+void vp8_zz_motion_search(VP8_COMP *cpi, MACROBLOCK *x, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset)
+{
+    MACROBLOCKD *const xd = & x->e_mbd;
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+
+    unsigned char *src_ptr = (*(b->base_src) + b->src);
+    int src_stride = b->src_stride;
+    unsigned char *ref_ptr;
+    int ref_stride = d->pre_stride;
+
+    // Set up pointers for this macro block recon buffer
+    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+    ref_ptr = (unsigned char *)(*(d->base_pre) + d->pre);
+
+    VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16)(src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
+}
+
+
+void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
+{
+    MACROBLOCKD *const xd = & x->e_mbd;
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    int num00;
+
+    MV tmp_mv = {0, 0};
+
+    int tmp_err;
+    int step_param = 3;                                        // Don't search over the full range for the first pass
+    int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+    int n;
+    vp8_variance_fn_ptr_t v_fn_ptr;
+    int new_mv_mode_penalty = 256;
+
+    v_fn_ptr.vf    = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16);
+    v_fn_ptr.sdf   = cpi->fn_ptr.sdf;
+    v_fn_ptr.sdx4df = cpi->fn_ptr.sdx4df;
+
+    // Set up pointers for this macro block recon buffer
+    xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
+
+    // Initial step/diamond search centred on best mv
+    tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+    if (tmp_err < INT_MAX - new_mv_mode_penalty)
+        tmp_err += new_mv_mode_penalty;
+
+    if (tmp_err < *best_motion_err)
+    {
+        *best_motion_err = tmp_err;
+        best_mv->row = tmp_mv.row;
+        best_mv->col = tmp_mv.col;
+    }
+
+    // Further step/diamond searches as necessary
+    n = num00;
+    num00 = 0;
+
+    while (n < further_steps)
+    {
+        n++;
+
+        if (num00)
+            num00--;
+        else
+        {
+            tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost);
+            if (tmp_err < INT_MAX - new_mv_mode_penalty)
+                tmp_err += new_mv_mode_penalty;
+
+            if (tmp_err < *best_motion_err)
+            {
+                *best_motion_err = tmp_err;
+                best_mv->row = tmp_mv.row;
+                best_mv->col = tmp_mv.col;
+            }
+        }
+    }
+}
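+
+// Note on num00 above: the initial diamond search reports how many of the
+// subsequent (larger step_param) searches would land on the same point, so
+// the further-steps loop decrements num00 to skip those redundant searches
+// rather than repeating them.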
+
+void vp8_first_pass(VP8_COMP *cpi)
+{
+    int mb_row, mb_col;
+    MACROBLOCK *const x = & cpi->mb;
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+
+    int col_blocks = 4 * cm->mb_cols;
+    int recon_yoffset, recon_uvoffset;
+    int recon_y_stride = cm->last_frame.y_stride;
+    int recon_uv_stride = cm->last_frame.uv_stride;
+    int intra_error = 0;
+    int coded_error = 0;
+
+    int sum_mvr = 0, sum_mvc = 0;
+    int sum_mvr_abs = 0, sum_mvc_abs = 0;
+    int sum_mvrs = 0, sum_mvcs = 0;
+    int mvcount = 0;
+    int intercount = 0;
+    int second_ref_count = 0;
+    int intrapenalty = 256;
+
+    int sum_in_vectors = 0;
+
+    MV best_ref_mv = {0, 0};
+    MV zero_ref_mv = {0, 0};
+
+    unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
+
+    vp8_clear_system_state();  //__asm emms;
+
+    x->src = * cpi->Source;
+    xd->pre = cm->last_frame;
+    xd->dst = cm->new_frame;
+
+    vp8_build_block_offsets(x);
+
+    vp8_setup_block_dptrs(&x->e_mbd);
+
+    vp8_setup_block_ptrs(x);
+
+    // set up the new frame for intra coded blocks
+    vp8_setup_intra_recon(&cm->new_frame);
+    vp8cx_frame_init_quantizer(cpi);
+
+    // Initialise the MV cost table to the defaults
+    //if( cm->current_video_frame == 0)
+    //if ( 0 )
+    {
+        int flag[2] = {1, 1};
+        vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
+        vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+    }
+
+    // for each macroblock row in image
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        MV best_ref_mv = {0, 0};
+
+        // reset above block coeffs
+        xd->up_available = (mb_row != 0);
+        recon_yoffset = (mb_row * recon_y_stride * 16);
+        recon_uvoffset = (mb_row * recon_uv_stride * 8);
+
+        // for each macroblock col in image
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            int this_error;
+            int gf_motion_error = INT_MAX;
+            int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+            xd->dst.y_buffer = cm->new_frame.y_buffer + recon_yoffset;
+            xd->dst.u_buffer = cm->new_frame.u_buffer + recon_uvoffset;
+            xd->dst.v_buffer = cm->new_frame.v_buffer + recon_uvoffset;
+            xd->left_available = (mb_col != 0);
+
+            // do intra 16x16 prediction
+            this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+
+            // "intrapenalty" below deals with situations where the intra and inter error scores are very low (eg a plain black frame)
+            // We do not have special cases in first pass for 0,0 and nearest etc so all inter modes carry an overhead cost estimate for the mv.
+            // When the error score is very low this causes us to pick all or lots of INTRA modes and throw lots of key frames.
+            // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+            this_error += intrapenalty;
+
+            // Cumulative intra error total
+            intra_error += this_error;
+
+            // Indicate default assumption of intra in the motion map
+            *fp_motion_map_ptr = 2;
+
+            // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+            x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
+            x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
+            x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+            x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+            // Other than for the first frame do a motion search
+            if (cm->current_video_frame > 0)
+            {
+                BLOCK *b = &x->block[0];
+                BLOCKD *d = &x->e_mbd.block[0];
+                MV tmp_mv = {0, 0};
+                int tmp_err;
+                int motion_error = INT_MAX;
+
+                // Simple 0,0 motion with no mv overhead
+                vp8_zz_motion_search( cpi, x, &cm->last_frame, &motion_error, recon_yoffset );
+                d->bmi.mv.as_mv.row = 0;
+                d->bmi.mv.as_mv.col = 0;
+
+                // Test last reference frame using the previous best mv as the starting point (best reference) for the search
+                vp8_first_pass_motion_search(cpi, x, &best_ref_mv, &d->bmi.mv.as_mv, &cm->last_frame, &motion_error, recon_yoffset);
+
+                // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
+                if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0))
+                {
+                    tmp_err = INT_MAX;
+                    vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, &cm->last_frame, &tmp_err, recon_yoffset);
+
+                    if (tmp_err < motion_error)
+                    {
+                        motion_error = tmp_err;
+                        d->bmi.mv.as_mv.row = tmp_mv.row;
+                        d->bmi.mv.as_mv.col = tmp_mv.col;
+                    }
+
+                }
+
+                // Experimental search in a second reference frame ((0,0) based only)
+                if (cm->current_video_frame > 1)
+                {
+                    vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv, &cm->golden_frame, &gf_motion_error, recon_yoffset);
+
+                    if ((gf_motion_error < motion_error) && (gf_motion_error < this_error))
+                    {
+                        second_ref_count++;
+                        //motion_error = gf_motion_error;
+                        //d->bmi.mv.as_mv.row = tmp_mv.row;
+                        //d->bmi.mv.as_mv.col = tmp_mv.col;
+                    }
+                    /*else
+                    {
+                        xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset;
+                        xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset;
+                        xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset;
+                    }*/
+
+
+                    // Reset to last frame as reference buffer
+                    xd->pre.y_buffer = cm->last_frame.y_buffer + recon_yoffset;
+                    xd->pre.u_buffer = cm->last_frame.u_buffer + recon_uvoffset;
+                    xd->pre.v_buffer = cm->last_frame.v_buffer + recon_uvoffset;
+                }
+
+                if (motion_error <= this_error)
+                {
+                    d->bmi.mv.as_mv.row <<= 3;
+                    d->bmi.mv.as_mv.col <<= 3;
+                    this_error = motion_error;
+                    vp8_set_mbmode_and_mvs(x, NEWMV, &d->bmi.mv.as_mv);
+                    vp8_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+                    sum_mvr += d->bmi.mv.as_mv.row;
+                    sum_mvr_abs += abs(d->bmi.mv.as_mv.row);
+                    sum_mvc += d->bmi.mv.as_mv.col;
+                    sum_mvc_abs += abs(d->bmi.mv.as_mv.col);
+                    sum_mvrs += d->bmi.mv.as_mv.row * d->bmi.mv.as_mv.row;
+                    sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
+                    intercount++;
+
+                    best_ref_mv.row = d->bmi.mv.as_mv.row;
+                    best_ref_mv.col = d->bmi.mv.as_mv.col;
+                    //best_ref_mv.row = 0;
+                    //best_ref_mv.col = 0;
+
+                    // Was the vector non-zero?
+                    if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col)
+                    {
+                        mvcount++;
+
+                        *fp_motion_map_ptr = 1;
+
+                        // Does the Row vector point inwards or outwards
+                        if (mb_row < cm->mb_rows / 2)
+                        {
+                            if (d->bmi.mv.as_mv.row > 0)
+                                sum_in_vectors--;
+                            else if (d->bmi.mv.as_mv.row < 0)
+                                sum_in_vectors++;
+                        }
+                        else if (mb_row > cm->mb_rows / 2)
+                        {
+                            if (d->bmi.mv.as_mv.row > 0)
+                                sum_in_vectors++;
+                            else if (d->bmi.mv.as_mv.row < 0)
+                                sum_in_vectors--;
+                        }
+
+                        // Does the Column vector point inwards or outwards
+                        if (mb_col < cm->mb_cols / 2)
+                        {
+                            if (d->bmi.mv.as_mv.col > 0)
+                                sum_in_vectors--;
+                            else if (d->bmi.mv.as_mv.col < 0)
+                                sum_in_vectors++;
+                        }
+                        else if (mb_col > cm->mb_cols / 2)
+                        {
+                            if (d->bmi.mv.as_mv.col > 0)
+                                sum_in_vectors++;
+                            else if (d->bmi.mv.as_mv.col < 0)
+                                sum_in_vectors--;
+                        }
+                    }
+                    else
+                        *fp_motion_map_ptr = 0;    // 0,0 mv was best
+                }
+                else
+                {
+                    best_ref_mv.row = 0;
+                    best_ref_mv.col = 0;
+                }
+            }
+
+            coded_error += this_error;
+
+            // adjust to the next column of macroblocks
+            x->src.y_buffer += 16;
+            x->src.u_buffer += 8;
+            x->src.v_buffer += 8;
+
+            recon_yoffset += 16;
+            recon_uvoffset += 8;
+
+            // Update the motion map
+            fp_motion_map_ptr++;
+        }
+
+        // adjust to the next row of mbs
+        x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+        x->src.u_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+        x->src.v_buffer += 8 * x->src.uv_stride - 8 * cm->mb_cols;
+
+        //extend the recon for intra prediction
+        vp8_extend_mb_row(&cm->new_frame, xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+        vp8_clear_system_state();  //__asm emms;
+    }
+
+    vp8_clear_system_state();  //__asm emms;
+    {
+        double weight = 0.0;
+
+        FIRSTPASS_STATS fps;
+
+        fps.frame      = cm->current_video_frame;
+        fps.intra_error = intra_error >> 8;
+        fps.coded_error = coded_error >> 8;
+        weight = vp8_simple_weight(cpi->Source);
+
+        if (weight < 0.1)
+            weight = 0.1;
+
+        fps.ssim_weighted_pred_err = fps.coded_error * weight;
+
+        fps.pcnt_inter  = 0.0;
+        fps.pcnt_motion = 0.0;
+        fps.MVr        = 0.0;
+        fps.mvr_abs     = 0.0;
+        fps.MVc        = 0.0;
+        fps.mvc_abs     = 0.0;
+        fps.MVrv       = 0.0;
+        fps.MVcv       = 0.0;
+        fps.mv_in_out_count  = 0.0;
+        fps.count      = 1.0;
+
+        fps.pcnt_inter   = 1.0 * (double)intercount / cm->MBs;
+        fps.pcnt_second_ref = 1.0 * (double)second_ref_count / cm->MBs;
+
+        if (mvcount > 0)
+        {
+            fps.MVr = (double)sum_mvr / (double)mvcount;
+            fps.mvr_abs = (double)sum_mvr_abs / (double)mvcount;
+            fps.MVc = (double)sum_mvc / (double)mvcount;
+            fps.mvc_abs = (double)sum_mvc_abs / (double)mvcount;
+            fps.MVrv = ((double)sum_mvrs - (fps.MVr * fps.MVr / (double)mvcount)) / (double)mvcount;
+            fps.MVcv = ((double)sum_mvcs - (fps.MVc * fps.MVc / (double)mvcount)) / (double)mvcount;
+            fps.mv_in_out_count = (double)sum_in_vectors / (double)(mvcount * 2);
+
+            fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs;
+        }
+
+        // TODO: handle the case when duration is set to 0, or something less
+        // than the full time between subsequent cpi->source_time_stamps.
+        fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp;
+
+        // don't want to do outputstats with a stack variable!
+        cpi->this_frame_stats = fps;
+        vp8_output_stats(cpi->output_pkt_list, &cpi->this_frame_stats);
+        vp8_accumulate_stats(&cpi->total_stats, &fps);
+
+#ifdef FIRSTPASS_MM
+        fwrite(cpi->fp_motion_map, 1, cpi->common.MBs, cpi->fp_motion_mapfile);
+#endif
+    }
+
+    // Copy the previous Last Frame into the GF buffer if specific conditions for doing so are met
+    if ((cm->current_video_frame > 0) &&
+        (cpi->this_frame_stats.pcnt_inter > 0.20) &&
+        ((cpi->this_frame_stats.intra_error / cpi->this_frame_stats.coded_error) > 2.0))
+    {
+        vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+    }
+
+    // swap frame pointers so last frame refers to the frame we just compressed
+    vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+    vp8_yv12_extend_frame_borders(&cm->last_frame);
+
+    // Special case for the first frame. Copy into the GF buffer as a second reference.
+    if (cm->current_video_frame == 0)
+    {
+        vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+    }
+
+
+    // use this to see what the first pass reconstruction looks like
+    if (0)
+    {
+        char filename[512];
+        FILE *recon_file;
+        sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+
+        if (cm->current_video_frame == 0)
+            recon_file = fopen(filename, "wb");
+        else
+            recon_file = fopen(filename, "ab");
+
+        fwrite(cm->last_frame.buffer_alloc, cm->last_frame.frame_size, 1, recon_file);
+        fclose(recon_file);
+    }
+
+    cm->current_video_frame++;
+
+}
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+#define BASE_ERRPERMB   150
+static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+{
+    int Q;
+    int num_mbs = ((Height * Width) / (16 * 16));
+    int target_norm_bits_per_mb;
+
+    double err_per_mb = section_err / num_mbs;
+    double correction_factor;
+    double corr_high;
+    double speed_correction = 1.0;
+    double rolling_ratio;
+
+    double pow_highq = 0.90;
+    double pow_lowq = 0.40;
+
+    if (section_target_bandwitdh <= 0)
+        return MAXQ;
+
+    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
+
+    // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits
+    if ((cpi->rolling_target_bits > 0.0) && (cpi->active_worst_quality < cpi->worst_quality))
+    {
+        //double adjustment_rate = 0.985 + (0.00005 * cpi->active_worst_quality);
+        double adjustment_rate = 0.99;
+
+        rolling_ratio = (double)cpi->rolling_actual_bits / (double)cpi->rolling_target_bits;
+
+        //if ( cpi->est_max_qcorrection_factor > rolling_ratio )
+        if (rolling_ratio < 0.95)
+            //cpi->est_max_qcorrection_factor *= adjustment_rate;
+            cpi->est_max_qcorrection_factor -= 0.005;
+        //else if ( cpi->est_max_qcorrection_factor < rolling_ratio )
+        else if (rolling_ratio > 1.05)
+            cpi->est_max_qcorrection_factor += 0.005;
+
+        //cpi->est_max_qcorrection_factor /= adjustment_rate;
+
+        cpi->est_max_qcorrection_factor = (cpi->est_max_qcorrection_factor < 0.1) ? 0.1 : (cpi->est_max_qcorrection_factor > 10.0) ? 10.0 : cpi->est_max_qcorrection_factor;
+    }
+
+    // Corrections for higher compression speed settings (reduced compression expected)
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+
+    // Correction factor used for Q values >= 20
+    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+    // Try and pick a Q that should be high enough to encode the content at the given rate.
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        if (Q < 50)
+        {
+            correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+            correction_factor = (correction_factor < 0.05) ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
+        }
+        else
+            correction_factor = corr_high;
+
+        bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * cpi->section_max_qfactor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+        //bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    return Q;
+}
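+
+// The loop above is a linear search for the lowest Q whose corrected bits/MB
+// estimate fits under the target. Hypothetical numbers: if
+// target_norm_bits_per_mb is 300 (in the same 512-scaled units) and the
+// corrected curve predicts 450 at Q = 40 but 290 at Q = 41, the function
+// returns 41.
+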
+static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
+{
+    int Q;
+    int num_mbs = ((Height * Width) / (16 * 16));
+    int target_norm_bits_per_mb;
+
+    double err_per_mb = section_err / num_mbs;
+    double correction_factor;
+    double corr_high;
+    double speed_correction = 1.0;
+    double pow_highq = 0.90;
+    double pow_lowq = 0.40;
+
+    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
+
+    // Corrections for higher compression speed settings (reduced compression expected)
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+
+    // Correction factor used for Q values >= 20
+    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+    // Try and pick a Q that can encode the content at the given rate.
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        if (Q < 50)
+        {
+            correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+            correction_factor = (correction_factor < 0.05) ? 0.05 : (correction_factor > 5.0) ? 5.0 : correction_factor;
+        }
+        else
+            correction_factor = corr_high;
+
+        bits_per_mb_at_this_q = (int)(.5 + correction_factor * speed_correction * cpi->est_max_qcorrection_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    return Q;
+}
+
+// Estimate a worst case Q for a KF group
+static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width, double group_iiratio)
+{
+    int Q;
+    int num_mbs = ((Height * Width) / (16 * 16));
+    int target_norm_bits_per_mb = (512 * section_target_bandwitdh) / num_mbs;
+    int bits_per_mb_at_this_q;
+
+    double err_per_mb = section_err / num_mbs;
+    double err_correction_factor;
+    double corr_high;
+    double speed_correction = 1.0;
+    double current_spend_ratio = 1.0;
+
+    double pow_highq = (POW1 < 0.6) ? POW1 + 0.3 : 0.90;
+    double pow_lowq = (POW1 < 0.7) ? POW1 + 0.1 : 0.80;
+
+    double iiratio_correction_factor = 1.0;
+
+    double combined_correction_factor;
+
+    // Trap special case where the target is <= 0
+    if (target_norm_bits_per_mb <= 0)
+        return MAXQ * 2;
+
+    // Calculate a corrective factor based on a rolling ratio of bits spent vs target bits
+    // This is clamped to the range 0.1 to 10.0
+    if (cpi->long_rolling_target_bits <= 0)
+        current_spend_ratio = 10.0;
+    else
+    {
+        current_spend_ratio = (double)cpi->long_rolling_actual_bits / (double)cpi->long_rolling_target_bits;
+        current_spend_ratio = (current_spend_ratio > 10.0) ? 10.0 : (current_spend_ratio < 0.1) ? 0.1 : current_spend_ratio;
+    }
+
+    // Calculate a correction factor based on the quality of prediction in the sequence as indicated by intra_inter error score ratio (IIRatio)
+    // The idea here is to favour subsampling in the hardest sections vs the easiest.
+    iiratio_correction_factor = 1.0 - ((group_iiratio - 6.0) * 0.1);
+
+    if (iiratio_correction_factor < 0.5)
+        iiratio_correction_factor = 0.5;
+
+    // Corrections for higher compression speed settings (reduced compression expected)
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+
+    // Combine the various factors calculated above
+    combined_correction_factor = speed_correction * iiratio_correction_factor * current_spend_ratio;
+
+    // Correction factor used for Q values >= 20
+    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+    // Try and pick a Q that should be high enough to encode the content at the given rate.
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        // Q values < 20 treated as a special case
+        if (Q < 20)
+        {
+            err_correction_factor = pow(err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+            err_correction_factor = (err_correction_factor < 0.05) ? 0.05 : (err_correction_factor > 5.0) ? 5.0 : err_correction_factor;
+        }
+        else
+            err_correction_factor = corr_high;
+
+        bits_per_mb_at_this_q = (int)(.5 + err_correction_factor * combined_correction_factor * (double)vp8_bits_per_mb[INTER_FRAME][Q]);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    // If we could not hit the target even at Max Q then estimate what Q would have been required
+    while ((bits_per_mb_at_this_q > target_norm_bits_per_mb)  && (Q < (MAXQ * 2)))
+    {
+
+        bits_per_mb_at_this_q = (int)(0.96 * bits_per_mb_at_this_q);
+        Q++;
+    }
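+    // Each virtual Q step above MAXQ is assumed to cost ~4% fewer bits (the
+    // 0.96 factor); e.g. overshooting the target by 2x at MAXQ extrapolates
+    // roughly 17 extra steps, since 0.96^17 is about 0.5.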
+
+    if (0)
+    {
+        FILE *f = fopen("estkf_q.stt", "a");
+        fprintf(f, "%8d %8d %8d %8.2f %8.3f %8.2f %8.3f %8.3f %8.3f %8d\n", cpi->common.current_video_frame, bits_per_mb_at_this_q,
+                target_norm_bits_per_mb, err_per_mb, err_correction_factor,
+                current_spend_ratio, group_iiratio, iiratio_correction_factor,
+                (double)cpi->buffer_level / (double)cpi->oxcf.optimal_buffer_level, Q);
+        fclose(f);
+    }
+
+    return Q;
+}
+
+extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
+
+void vp8_init_second_pass(VP8_COMP *cpi)
+{
+    FIRSTPASS_STATS this_frame;
+    FIRSTPASS_STATS *start_pos;
+
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+    vp8_zero_stats(&cpi->total_stats);
+
+    if (!cpi->stats_in_end)
+        return;
+
+    cpi->total_stats = *cpi->stats_in_end;
+
+    cpi->total_error_left = cpi->total_stats.ssim_weighted_pred_err;
+    cpi->total_intra_error_left = cpi->total_stats.intra_error;
+    cpi->total_coded_error_left = cpi->total_stats.coded_error;
+    cpi->start_tot_err_left = cpi->total_error_left;
+
+    //cpi->bits_left = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+    //cpi->bits_left -= (long long)(cpi->total_stats.count * two_pass_min_rate / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+
+    // Each frame can have a different duration, as the frame rate in the source
+    // isn't guaranteed to be constant. The frame rate prior to the first frame
+    // encoded in the second pass is a guess. However, the sum duration is not;
+    // it is calculated based on the actual durations of all frames from the
+    // first pass.
+    vp8_new_frame_rate(cpi, 10000000.0 * cpi->total_stats.count / cpi->total_stats.duration);
+
+    cpi->output_frame_rate = cpi->oxcf.frame_rate;
+    cpi->bits_left = (long long)(cpi->total_stats.duration * cpi->oxcf.target_bandwidth / 10000000.0);
+    cpi->bits_left -= (long long)(cpi->total_stats.duration * two_pass_min_rate / 10000000.0);
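+    // Note: duration appears to be accumulated in 1/10,000,000 second units
+    // (as the frame rate calculation above implies), so duration / 10000000.0
+    // is the clip length in seconds and seconds * target_bandwidth is the
+    // total bit budget.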
+
+    vp8_avg_stats(&cpi->total_stats);
+
+    // Scan the first pass file and calculate an average Intra / Inter error score ratio for the sequence
+    {
+        double sum_iiratio = 0.0;
+        double IIRatio;
+
+        start_pos = cpi->stats_in;               // Note starting "file" position
+
+        while (vp8_input_stats(cpi, &this_frame) != EOF)
+        {
+            IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error);
+            IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 20.0 : IIRatio;
+            sum_iiratio += IIRatio;
+        }
+
+        cpi->avg_iiratio = sum_iiratio / DOUBLE_DIVIDE_CHECK((double)cpi->total_stats.count);
+
+        // Reset file position
+        reset_fpf_position(cpi, start_pos);
+    }
+
+    // Scan the first pass file and calculate a modified total error based upon the bias/power function
+    // used to allocate bits
+    {
+        start_pos = cpi->stats_in;               // Note starting "file" position
+
+        cpi->modified_total_error_left = 0.0;
+
+        while (vp8_input_stats(cpi, &this_frame) != EOF)
+        {
+            cpi->modified_total_error_left += calculate_modified_err(cpi, &this_frame);
+        }
+
+        reset_fpf_position(cpi, start_pos);            // Reset file position
+
+    }
+
+#ifdef FIRSTPASS_MM
+    cpi->fp_motion_mapfile = 0;
+    cpi->fp_motion_mapfile = fopen("fpmotionmap.stt", "rb");
+#endif
+
+}
+
+void vp8_end_second_pass(VP8_COMP *cpi)
+{
+#ifdef FIRSTPASS_MM
+
+    if (cpi->fp_motion_mapfile)
+        fclose(cpi->fp_motion_mapfile);
+
+#endif
+}
+
+// Analyse and define a gf/arf group.
+static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    FIRSTPASS_STATS next_frame;
+    FIRSTPASS_STATS *start_pos;
+    int i;
+    int count = 0;
+    int image_size = cpi->common.last_frame.y_width  * cpi->common.last_frame.y_height;
+    double boost_score = 0.0;
+    double old_boost_score = 0.0;
+    double gf_group_err = 0.0;
+    double gf_first_frame_err = 0.0;
+    double mod_frame_err = 0.0;
+
+    double mv_accumulator_rabs  = 0.0;
+    double mv_accumulator_cabs  = 0.0;
+    double this_mv_rabs;
+    double this_mv_cabs;
+    double mv_ratio_accumulator = 0.0;
+    double distance_factor = 0.0;
+    double decay_accumulator = 1.0;
+
+    double boost_factor = IIFACTOR;
+    double loop_decay_rate = 1.00;        // Starting decay rate
+
+    double this_frame_mv_in_out = 0.0;
+    double mv_in_out_accumulator = 0.0;
+    double abs_mv_in_out_accumulator = 0.0;
+    double mod_err_per_mb_accumulator = 0.0;
+
+    int max_bits = frame_max_bits(cpi);    // Max for a single frame
+
+#ifdef FIRSTPASS_MM
+    int fpmm_pos;
+#endif
+
+    cpi->gf_group_bits = 0;
+    cpi->gf_decay_rate = 0;
+
+    vp8_clear_system_state();  //__asm emms;
+
+#ifdef FIRSTPASS_MM
+    fpmm_pos = vp8_fpmm_get_pos(cpi);
+#endif
+
+    start_pos = cpi->stats_in;
+
+    // Preload the stats for the next frame.
+    mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+    // Note the error of the frame at the start of the group (this will be the GF frame error if we code a normal gf)
+    gf_first_frame_err = mod_frame_err;
+
+    // Special treatment if the current frame is a key frame (which is also a gf).
+    // If it is then its error score (and hence bit allocation) need to be subtracted out
+    // from the calculation for the GF group
+    if (cpi->common.frame_type == KEY_FRAME)
+        gf_group_err -= gf_first_frame_err;
+
+    // Scan forward to try and work out how many frames the next gf group should contain and
+    // what level of boost is appropriate for the GF or ARF that will be coded with the group
+    i = 0;
+
+    while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+    {
+        double r;
+        double motion_factor;
+        double this_frame_mvr_ratio;
+        double this_frame_mvc_ratio;
+
+        i++;                                                    // Increment the loop counter
+
+        // Accumulate error score of frames in this gf group
+        mod_frame_err = calculate_modified_err(cpi, this_frame);
+
+        gf_group_err += mod_frame_err;
+
+        mod_err_per_mb_accumulator += mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
+
+        if (EOF == vp8_input_stats(cpi, &next_frame))
+            break;
+
+        // Accumulate motion stats.
+        motion_factor = next_frame.pcnt_motion;
+        this_mv_rabs = fabs(next_frame.mvr_abs * motion_factor);
+        this_mv_cabs = fabs(next_frame.mvc_abs * motion_factor);
+
+        mv_accumulator_rabs += fabs(next_frame.mvr_abs * motion_factor);
+        mv_accumulator_cabs += fabs(next_frame.mvc_abs * motion_factor);
+
+        // Accumulate motion in/out of frame stats
+        this_frame_mv_in_out = next_frame.mv_in_out_count * next_frame.pcnt_motion;
+        mv_in_out_accumulator += next_frame.mv_in_out_count * next_frame.pcnt_motion;
+        abs_mv_in_out_accumulator += fabs(next_frame.mv_in_out_count * next_frame.pcnt_motion);
+
+        // If there is a significant amount of motion
+        if (motion_factor > 0.05)
+        {
+            this_frame_mvr_ratio = fabs(next_frame.mvr_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVr));
+            this_frame_mvc_ratio = fabs(next_frame.mvc_abs) / DOUBLE_DIVIDE_CHECK(fabs(next_frame.MVc));
+
+            mv_ratio_accumulator += (this_frame_mvr_ratio < next_frame.mvr_abs) ? (this_frame_mvr_ratio * motion_factor) : next_frame.mvr_abs * motion_factor;
+            mv_ratio_accumulator += (this_frame_mvc_ratio < next_frame.mvc_abs) ? (this_frame_mvc_ratio * motion_factor) : next_frame.mvc_abs * motion_factor;
+        }
+        else
+        {
+            this_frame_mvr_ratio = 1.0;
+            this_frame_mvc_ratio = 1.0;
+        }
+
+        // Underlying boost factor is based on the inter/intra error ratio
+        r = (boost_factor * (next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)));
+
+        // Increase boost for frames where new data is coming into the frame (e.g. zoom out).
+        // Slightly reduce boost if there is a net balance of motion out of the frame (zoom in).
+        // The range for this_frame_mv_in_out is -1.0 to +1.0.
+        if (this_frame_mv_in_out > 0.0)
+            r += r * (this_frame_mv_in_out * 2.0);
+        else
+            r += r * (this_frame_mv_in_out / 2.0);  // In extreme case boost is halved
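+        // e.g. this_frame_mv_in_out = +0.5 doubles r, while the extreme
+        // outflow case of -1.0 halves it.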
+
+        if (r > GF_RMAX)
+            r = GF_RMAX;
+
+        // Adjust loop decay rate
+        //if ( next_frame.pcnt_inter < loop_decay_rate )
+        loop_decay_rate = next_frame.pcnt_inter;
+
+        // High % motion -> somewhat higher decay rate
+        if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate)
+            loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0));
+
+        distance_factor = sqrt((this_mv_rabs * this_mv_rabs) + (this_mv_cabs * this_mv_cabs)) / 300.0;
+        distance_factor = ((distance_factor > 1.0) ? 0.0 : (1.0 - distance_factor));
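+        // e.g. row/col magnitudes of 90 and 120 give sqrt(8100 + 14400) / 300
+        // = 0.5, so a distance factor of 0.5; combined magnitudes of 300 or
+        // more zero the factor entirely.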
+
+        if (distance_factor < loop_decay_rate)
+            loop_decay_rate = distance_factor;
+
+        // Cumulative effect of decay
+        decay_accumulator = decay_accumulator * loop_decay_rate;
+        decay_accumulator = decay_accumulator < 0.1 ? 0.1 : decay_accumulator;
+        //decay_accumulator = ( loop_decay_rate < decay_accumulator ) ? loop_decay_rate : decay_accumulator;
+
+        boost_score += (decay_accumulator * r);
+
+        // Break out conditions.
+        if (   /* i>4 || */
+            (
+                (i > MIN_GF_INTERVAL) &&                            // Don't break out with a very short interval
+                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&      // Don't break out very close to a key frame
+                ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
+                ((mv_ratio_accumulator > 100.0) ||
+                 (abs_mv_in_out_accumulator > 3.0) ||
+                 (mv_in_out_accumulator < -2.0) ||
+                 ((boost_score - old_boost_score) < 2.0)
+                )
+            )
+        )
+        {
+            boost_score = old_boost_score;
+            break;
+        }
+
+        vpx_memcpy(this_frame, &next_frame, sizeof(*this_frame));
+
+        old_boost_score = boost_score;
+    }
+
+    cpi->gf_decay_rate = (i > 0) ? (int)(100.0 * (1.0 - decay_accumulator)) / i : 0;
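+    // gf_decay_rate is the average percentage loss of prediction quality per
+    // frame; e.g. decay_accumulator = 0.6 over i = 8 frames gives (100 * 0.4) / 8 = 5.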
+
+    // When using CBR apply additional buffer related upper limits
+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+    {
+        double max_boost;
+
+        // For cbr apply buffer related limits
+        if (cpi->drop_frames_allowed)
+        {
+            int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+
+            if (cpi->buffer_level > df_buffer_level)
+                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+            else
+                max_boost = 0.0;
+        }
+        else if (cpi->buffer_level > 0)
+        {
+            max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+        }
+        else
+        {
+            max_boost = 0.0;
+        }
+
+        if (boost_score > max_boost)
+            boost_score = max_boost;
+    }
+
+    cpi->gfu_boost = (int)(boost_score * 100.0) >> 4;
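+    // Scales the accumulated boost by 100/16; e.g. a boost_score of 16.0 maps
+    // to a gfu_boost of 100.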
+
+    // Should we use the alternate reference frame
+    if (cpi->oxcf.play_alternate &&
+        (i >= MIN_GF_INTERVAL) &&
+        (i <= (cpi->frames_to_key - MIN_GF_INTERVAL)) &&          // don't use an ARF very near the next kf
+        (((next_frame.pcnt_inter > 0.75) &&
+          ((mv_in_out_accumulator / (double)i > -0.2) || (mv_in_out_accumulator > -2.0)) &&
+          //(cpi->gfu_boost>150) &&
+          (cpi->gfu_boost > 100) &&
+          //(cpi->gfu_boost>AF_THRESH2) &&
+          //((cpi->gfu_boost/i)>AF_THRESH) &&
+          //(decay_accumulator > 0.5) &&
+          (cpi->gf_decay_rate <= (ARF_DECAY_THRESH + (cpi->gfu_boost / 200)))
+         )
+        )
+       )
+    {
+        int Boost;
+        int allocation_chunks;
+        int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+        int tmp_q;
+        int arf_frame_bits = 0;
+        int group_bits;
+
+        // Estimate the bits to be allocated to the group as a whole
+        if ((cpi->kf_group_bits > 0) && (cpi->kf_group_error_left > 0))
+            group_bits = (int)((double)cpi->kf_group_bits * (gf_group_err / (double)cpi->kf_group_error_left));
+        else
+            group_bits = 0;
+
+        // Boost for arf frame
+        Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+        Boost += (cpi->baseline_gf_interval * 50);
+        allocation_chunks = (i * 100) + Boost;
+
+        // Normalize Boost and allocation chunks down to prevent overflow
+        while (Boost > 1000)
+        {
+            Boost /= 2;
+            allocation_chunks /= 2;
+        }
+
+        // Calculate the number of bits to be spent on the arf based on the boost number
+        arf_frame_bits = (int)((double)Boost * (group_bits / (double)allocation_chunks));
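+        // Each frame in the group contributes 100 allocation chunks and the arf
+        // contributes Boost, so the arf share is Boost / (i * 100 + Boost);
+        // e.g. i = 10 and Boost = 400 give the arf 400/1400, about 29% of group_bits.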
+
+        // Estimate if there are enough bits available to make worthwhile use of an arf.
+        tmp_q = estimate_q(cpi, mod_frame_err, (int)arf_frame_bits, cpi->common.Height, cpi->common.Width);
+
+        // Only use an arf if it is likely we will be able to code it at a lower Q than the surrounding frames.
+        if (tmp_q < cpi->worst_quality)
+        {
+            cpi->source_alt_ref_pending = TRUE;
+
+            // For alt ref frames the error score for the end frame of the group (the alt ref frame) should not contribute to the group total and hence
+            // the number of bits allocated to the group. Rather it forms part of the next group (it is the GF at the start of the next group).
+            gf_group_err -= mod_frame_err;
+
+            // Set the interval till the next gf or arf. For ARFs this is the number of frames to be coded before the future frame that is coded as an ARF.
+            // The future frame itself is part of the next group
+            cpi->baseline_gf_interval = i - 1;
+
+#ifdef FIRSTPASS_MM
+            // Read through the motion map to load up the entry for the ARF
+            {
+                int j;
+
+                // Advance to the region of interest
+                // Current default: 2 frames before to 2 frames after the ARF frame itself
+                vp8_fpmm_reset_pos(cpi, cpi->fpmm_pos);
+
+                for (j = 0; j < cpi->baseline_gf_interval - 2; j++)
+                    vp8_advance_fpmm(cpi, 1);
+
+                // Read / create a motion map for the region of interest
+                vp8_input_fpmm(cpi, 5);
+            }
+#endif
+        }
+        else
+        {
+            cpi->source_alt_ref_pending = FALSE;
+            cpi->baseline_gf_interval = i;
+        }
+    }
+    else
+    {
+        cpi->source_alt_ref_pending = FALSE;
+        cpi->baseline_gf_interval = i;
+    }
+
+    // Conventional GF
+    if (!cpi->source_alt_ref_pending)
+    {
+        // Don't allow a conventional gf too near the next kf
+        if ((cpi->frames_to_key - cpi->baseline_gf_interval) < MIN_GF_INTERVAL)
+        {
+            while (cpi->baseline_gf_interval < cpi->frames_to_key)
+            {
+                if (EOF == vp8_input_stats(cpi, this_frame))
+                    break;
+
+                cpi->baseline_gf_interval++;
+
+                if (cpi->baseline_gf_interval < cpi->frames_to_key)
+                    gf_group_err += calculate_modified_err(cpi, this_frame);
+            }
+        }
+    }
+
+    // Now decide how many bits should be allocated to the GF group as a proportion of those remaining in the kf group.
+    // The final key frame group in the clip is treated as a special case where cpi->kf_group_bits is tied to cpi->bits_left.
+    // This is also important for short clips where there may only be one key frame.
+    if (cpi->frames_to_key >= (int)(cpi->total_stats.count - cpi->common.current_video_frame))
+    {
+        cpi->kf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;
+    }
+
+    // Calculate the bits to be allocated to the group as a whole
+    if ((cpi->kf_group_bits > 0) && (cpi->kf_group_error_left > 0))
+        cpi->gf_group_bits = (int)((double)cpi->kf_group_bits * (gf_group_err / (double)cpi->kf_group_error_left));
+    else
+        cpi->gf_group_bits = 0;
+
+    cpi->gf_group_bits = (cpi->gf_group_bits < 0) ? 0 : (cpi->gf_group_bits > cpi->kf_group_bits) ? cpi->kf_group_bits : cpi->gf_group_bits;
+
+    // Clip cpi->gf_group_bits based on user supplied data rate variability limit (cpi->oxcf.two_pass_vbrmax_section)
+    if (cpi->gf_group_bits > max_bits * cpi->baseline_gf_interval)
+        cpi->gf_group_bits = max_bits * cpi->baseline_gf_interval;
+
+    // Reset the file position
+    reset_fpf_position(cpi, start_pos);
+
+    // Assign bits to the arf or gf.
+    {
+        int Boost;
+        int frames_in_section;
+        int allocation_chunks;
+        int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+        // For ARF frames
+        if (cpi->source_alt_ref_pending)
+        {
+            Boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+            //Boost += (cpi->baseline_gf_interval * 25);
+            Boost += (cpi->baseline_gf_interval * 50);
+
+            // Set max and minimum boost and hence minimum allocation
+            if (Boost > ((cpi->baseline_gf_interval + 1) * 200))
+                Boost = ((cpi->baseline_gf_interval + 1) * 200);
+            else if (Boost < 125)
+                Boost = 125;
+
+            frames_in_section = cpi->baseline_gf_interval + 1;
+            allocation_chunks = (frames_in_section * 100) + Boost;
+        }
+        // Else for standard golden frames
+        else
+        {
+            // boost based on inter / intra ratio of subsequent frames
+            Boost = (cpi->gfu_boost * GFQ_ADJUSTMENT) / 100;
+
+            // Set max and minimum boost and hence minimum allocation
+            if (Boost > (cpi->baseline_gf_interval * 150))
+                Boost = (cpi->baseline_gf_interval * 150);
+            else if (Boost < 125)
+                Boost = 125;
+
+            frames_in_section = cpi->baseline_gf_interval;
+            allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+        }
+
+        // Normalize Boost and allocation chunks down to prevent overflow
+        while (Boost > 1000)
+        {
+            Boost /= 2;
+            allocation_chunks /= 2;
+        }
+
+        // Calculate the number of bits to be spent on the gf or arf based on the boost number
+        cpi->gf_bits = (int)((double)Boost * (cpi->gf_group_bits / (double)allocation_chunks));
+
+        // If the frame that is to be boosted is simpler than the average for the gf/arf group then use an alternative calculation
+        // based on the error score of the frame itself
+        if (mod_frame_err < gf_group_err / (double)cpi->baseline_gf_interval)
+        {
+            double  alt_gf_grp_bits;
+            int     alt_gf_bits;
+
+            alt_gf_grp_bits = ((double)cpi->kf_group_bits  * (mod_frame_err * (double)cpi->baseline_gf_interval) / (double)cpi->kf_group_error_left) ;
+            alt_gf_bits = (int)((double)Boost * (alt_gf_grp_bits / (double)allocation_chunks));
+
+            if (cpi->gf_bits > alt_gf_bits)
+            {
+                cpi->gf_bits = alt_gf_bits;
+            }
+        }
+        // Else if it is harder than other frames in the group make sure it at least receives an allocation in keeping with
+        // its relative error score, otherwise it may be worse off than an "un-boosted" frame
+        else
+        {
+            int alt_gf_bits = (int)((double)cpi->kf_group_bits * (mod_frame_err / (double)cpi->kf_group_error_left));
+
+            if (alt_gf_bits > cpi->gf_bits)
+            {
+                cpi->gf_bits = alt_gf_bits;
+            }
+        }
+
+        // Apply an additional limit for CBR
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            if (cpi->gf_bits > (cpi->buffer_level >> 1))
+                cpi->gf_bits = cpi->buffer_level >> 1;
+        }
+
+        // Don't allow a negative value for gf_bits
+        if (cpi->gf_bits < 0)
+            cpi->gf_bits = 0;
+
+        // Adjust KF group bits and error remaining
+        cpi->kf_group_error_left -= gf_group_err;
+        cpi->kf_group_bits -= cpi->gf_group_bits;
+
+        if (cpi->kf_group_bits < 0)
+            cpi->kf_group_bits = 0;
+
+        // Note the error score left in the remaining frames of the group.
+        // For normal GFs we want to remove the error score for the first frame of the group (except in Key frame case where this has already happened)
+        if (!cpi->source_alt_ref_pending && cpi->common.frame_type != KEY_FRAME)
+            cpi->gf_group_error_left = gf_group_err - gf_first_frame_err;
+        else
+            cpi->gf_group_error_left = gf_group_err;
+
+        cpi->gf_group_bits -= cpi->gf_bits;
+
+        if (cpi->gf_group_bits < 0)
+            cpi->gf_group_bits = 0;
+
+        // Set aside some bits for a mid gf sequence boost
+        if ((cpi->gfu_boost > 150) && (cpi->baseline_gf_interval > 5))
+        {
+            int pct_extra = (cpi->gfu_boost - 100) / 50;
+            pct_extra = (pct_extra > 10) ? 10 : pct_extra;
+
+            cpi->mid_gf_extra_bits = (cpi->gf_group_bits * pct_extra) / 100;
+            cpi->gf_group_bits -= cpi->mid_gf_extra_bits;
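+            // e.g. a gfu_boost of 250 gives pct_extra = 3, so 3% of the
+            // remaining group bits are held back for the mid-group frame.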
+        }
+        else
+            cpi->mid_gf_extra_bits = 0;
+
+        cpi->gf_bits += cpi->min_frame_bandwidth;                                              // Add in minimum for a frame
+    }
+
+    if (!cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))                  // Normal GF and not a KF
+    {
+        cpi->per_frame_bandwidth = cpi->gf_bits;                                               // Per frame bit target for this frame
+    }
+
+    // Adjustment to estimate_max_q based on a measure of complexity of the section
+    if (cpi->common.frame_type != KEY_FRAME)
+    {
+        FIRSTPASS_STATS sectionstats;
+        double Ratio;
+
+        vp8_zero_stats(&sectionstats);
+        reset_fpf_position(cpi, start_pos);
+
+        for (i = 0 ; i < cpi->baseline_gf_interval ; i++)
+        {
+            vp8_input_stats(cpi, &next_frame);
+            vp8_accumulate_stats(&sectionstats, &next_frame);
+        }
+
+        vp8_avg_stats(&sectionstats);
+
+        if (sectionstats.pcnt_motion < .17)
+            cpi->section_is_low_motion = 1;
+        else
+            cpi->section_is_low_motion = 0;
+
+        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
+            cpi->section_is_fast_motion = 1;
+        else
+            cpi->section_is_fast_motion = 0;
+
+        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
+        //{
+        cpi->section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+        if (cpi->section_max_qfactor < 0.80)
+            cpi->section_max_qfactor = 0.80;
+
+        //}
+        //else
+        //    cpi->section_max_qfactor = 1.0;
+
+        reset_fpf_position(cpi, start_pos);
+    }
+
+#ifdef FIRSTPASS_MM
+    // Reset the First pass motion map file position
+    vp8_fpmm_reset_pos(cpi, fpmm_pos);
+#endif
+}
+
+// Allocate bits to a normal frame that is neither a gf, an arf, nor a key frame.
+static void assign_std_frame_bits(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    int    target_frame_size;
+
+    double modified_err;
+    double err_fraction;                                                                 // What portion of the remaining GF group error is used by this frame
+
+    int max_bits = frame_max_bits(cpi);    // Max for a single frame
+
+    // The final few frames have special treatment
+    if (cpi->frames_till_gf_update_due >= (int)(cpi->total_stats.count - cpi->common.current_video_frame))
+    {
+        cpi->gf_group_bits = (cpi->bits_left > 0) ? cpi->bits_left : 0;
+    }
+
+    // Calculate modified prediction error used in bit allocation
+    modified_err = calculate_modified_err(cpi, this_frame);
+
+    if (cpi->gf_group_error_left > 0)
+        err_fraction = modified_err / cpi->gf_group_error_left;                              // What portion of the remaining GF group error is used by this frame
+    else
+        err_fraction = 0.0;
+
+    target_frame_size = (int)((double)cpi->gf_group_bits * err_fraction);                    // How many of those bits available for allocation should we give it?
+
+    // Clip the target size to 0 at the bottom and to max_bits (or cpi->gf_group_bits) at the top end.
+    if (target_frame_size < 0)
+        target_frame_size = 0;
+    else
+    {
+        if (target_frame_size > max_bits)
+            target_frame_size = max_bits;
+
+        if (target_frame_size > cpi->gf_group_bits)
+            target_frame_size = cpi->gf_group_bits;
+    }
+
+    cpi->gf_group_error_left -= modified_err;                                               // Adjust error remaining
+    cpi->gf_group_bits -= target_frame_size;                                                // Adjust bits remaining
+
+    if (cpi->gf_group_bits < 0)
+        cpi->gf_group_bits = 0;
+
+    target_frame_size += cpi->min_frame_bandwidth;                                          // Add in the minimum number of bits that is set aside for every frame.
+
+    // Special case for the frame that lies half way between two gfs
+    if (cpi->common.frames_since_golden == cpi->baseline_gf_interval / 2)
+        target_frame_size += cpi->mid_gf_extra_bits;
+
+    cpi->per_frame_bandwidth = target_frame_size;                                           // Per frame bit target for this frame
+}
+
+void vp8_second_pass(VP8_COMP *cpi)
+{
+    int tmp_q;
+    int frames_left = (int)(cpi->total_stats.count - cpi->common.current_video_frame);
+
+    FIRSTPASS_STATS this_frame;
+    FIRSTPASS_STATS this_frame_copy;
+
+    VP8_COMMON *cm = &cpi->common;
+
+    double this_frame_error;
+    double this_frame_intra_error;
+    double this_frame_coded_error;
+
+    FIRSTPASS_STATS *start_pos;
+
+    if (!cpi->stats_in)
+    {
+        return;
+    }
+
+    vp8_clear_system_state();
+
+    if (EOF == vp8_input_stats(cpi, &this_frame))
+        return;
+
+#ifdef FIRSTPASS_MM
+    vpx_memset(cpi->fp_motion_map, 0, cpi->common.MBs);
+    cpi->fpmm_pos = vp8_fpmm_get_pos(cpi);
+    vp8_advance_fpmm(cpi, 1);         // Read this frame's first pass motion map
+#endif
+
+    this_frame_error = this_frame.ssim_weighted_pred_err;
+    this_frame_intra_error = this_frame.intra_error;
+    this_frame_coded_error = this_frame.coded_error;
+
+    // Store information regarding level of motion etc. for use in mode decisions.
+    cpi->motion_speed = (int)(fabs(this_frame.MVr) + fabs(this_frame.MVc));
+    cpi->motion_var = (int)(fabs(this_frame.MVrv) + fabs(this_frame.MVcv));
+    cpi->inter_lvl = (int)(this_frame.pcnt_inter * 100);
+    cpi->intra_lvl = (int)((1.0 - this_frame.pcnt_inter) * 100);
+    cpi->motion_lvl = (int)(this_frame.pcnt_motion * 100);
+
+    start_pos = cpi->stats_in;
+
+    // Keyframe and section processing.
+    if (cpi->frames_to_key == 0)
+    {
+        // Define next KF group and assign bits to it
+        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        vp8_find_next_key_frame(cpi, &this_frame_copy);
+
+        // Special case: error_resilient_mode does not make much sense for two pass with its current meaning, but this code is designed to stop
+        // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
+        // This is temporary code until we decide what should really happen in this case.
+        if (cpi->oxcf.error_resilient_mode)
+        {
+            cpi->gf_group_bits = cpi->kf_group_bits;
+            cpi->gf_group_error_left = cpi->kf_group_error_left;
+            cpi->baseline_gf_interval = cpi->frames_to_key;
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+            cpi->source_alt_ref_pending = FALSE;
+        }
+
+    }
+
+    // Is this a GF / ARF (Note that a KF is always also a GF)
+    if (cpi->frames_till_gf_update_due == 0)
+    {
+        // Define next gf group and assign bits to it
+        vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+        define_gf_group(cpi, &this_frame_copy);
+
+        // If we are going to code an altref frame at the end of the group and the current frame is not a key frame....
+        // If the previous group used an arf this frame has already benefited from that arf boost and it should not be given extra bits
+        // If the previous group was NOT coded using arf we may want to apply some boost to this GF as well
+        if (cpi->source_alt_ref_pending && (cpi->common.frame_type != KEY_FRAME))
+        {
+            // Assign a standard frames worth of bits from those allocated to the GF group
+            vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+            assign_std_frame_bits(cpi, &this_frame_copy);
+
+            // If appropriate (we are switching into ARF active but it was not previously active) apply a boost for the gf at the start of the group.
+            //if ( !cpi->source_alt_ref_active && (cpi->gfu_boost > 150) )
+            if (FALSE)
+            {
+                int extra_bits;
+                int pct_extra = (cpi->gfu_boost - 100) / 50;
+
+                pct_extra = (pct_extra > 20) ? 20 : pct_extra;
+
+                extra_bits = (cpi->gf_group_bits * pct_extra) / 100;
+                cpi->gf_group_bits -= extra_bits;
+                cpi->per_frame_bandwidth += extra_bits;
+            }
+        }
+    }
+
+    // Otherwise this is an ordinary frame
+    else
+    {
+        // Special case: error_resilient_mode does not make much sense for two pass with its current meaning, but this code is designed to stop
+        // outlandish behaviour if someone does set it when using two pass. It effectively disables GF groups.
+        // This is temporary code until we decide what should really happen in this case.
+        if (cpi->oxcf.error_resilient_mode)
+        {
+            cpi->frames_till_gf_update_due = cpi->frames_to_key;
+
+            if (cpi->common.frame_type != KEY_FRAME)
+            {
+                // Assign bits from those allocated to the GF group
+                vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+                assign_std_frame_bits(cpi, &this_frame_copy);
+            }
+        }
+        else
+        {
+            // Assign bits from those allocated to the GF group
+            vpx_memcpy(&this_frame_copy, &this_frame, sizeof(this_frame));
+            assign_std_frame_bits(cpi, &this_frame_copy);
+        }
+    }
+
+    // Set nominal per second bandwidth for this frame
+    cpi->target_bandwidth = cpi->per_frame_bandwidth * cpi->output_frame_rate;
+    if (cpi->target_bandwidth < 0)
+        cpi->target_bandwidth = 0;
+
+    if (cpi->common.current_video_frame == 0)
+    {
+        // guess at 2nd pass q
+        cpi->est_max_qcorrection_factor = 1.0;
+        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+
+        if (tmp_q < cpi->worst_quality)
+        {
+            cpi->active_worst_quality         = tmp_q;
+            cpi->ni_av_qi                     = tmp_q;
+        }
+        else
+        {
+            cpi->active_worst_quality         = cpi->worst_quality;
+            cpi->ni_av_qi                     = cpi->worst_quality;
+        }
+    }
+    else
+    {
+        if (frames_left < 1)
+            frames_left = 1;
+
+        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
+
+        // Move active_worst_quality but in a damped way
+        if (tmp_q > cpi->active_worst_quality)
+            cpi->active_worst_quality ++;
+        else if (tmp_q < cpi->active_worst_quality)
+            cpi->active_worst_quality --;
+
+        cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
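+        // e.g. active_worst_quality = 41 (after the step above) and tmp_q = 60
+        // give (123 + 60 + 2) / 4 = 46, moving a quarter of the way toward tmp_q.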
+
+        // Clamp to user set limits
+        if (cpi->active_worst_quality > cpi->worst_quality)
+            cpi->active_worst_quality = cpi->worst_quality;
+        else if (cpi->active_worst_quality < cpi->best_quality)
+            cpi->active_worst_quality = cpi->best_quality;
+
+    }
+
+    cpi->frames_to_key --;
+    cpi->total_error_left      -= this_frame_error;
+    cpi->total_intra_error_left -= this_frame_intra_error;
+    cpi->total_coded_error_left -= this_frame_coded_error;
+}
+
+
+static BOOL test_candidate_kf(VP8_COMP *cpi, FIRSTPASS_STATS *last_frame, FIRSTPASS_STATS *this_frame, FIRSTPASS_STATS *next_frame)
+{
+    BOOL is_viable_kf = FALSE;
+
+    // Does the frame satisfy the primary criteria of a key frame
+    //      If so, then examine how well it predicts subsequent frames
+    if ((this_frame->pcnt_second_ref < 0.10) &&
+        (next_frame->pcnt_second_ref < 0.10) &&
+        ((this_frame->pcnt_inter < 0.05) ||
+         (
+             (this_frame->pcnt_inter < .25) &&
+             ((this_frame->intra_error / DOUBLE_DIVIDE_CHECK(this_frame->coded_error)) < 2.5) &&
+             ((fabs(last_frame->coded_error - this_frame->coded_error) / DOUBLE_DIVIDE_CHECK(this_frame->coded_error) > .40) ||
+              (fabs(last_frame->intra_error - this_frame->intra_error) / DOUBLE_DIVIDE_CHECK(this_frame->intra_error) > .40) ||
+              ((next_frame->intra_error / DOUBLE_DIVIDE_CHECK(next_frame->coded_error)) > 3.5)
+             )
+         )
+        )
+       )
+    {
+        int i;
+        FIRSTPASS_STATS *start_pos;
+
+        FIRSTPASS_STATS local_next_frame;
+
+        double boost_score = 0.0;
+        double old_boost_score = 0.0;
+        double decay_accumulator = 1.0;
+        double next_iiratio;
+
+        vpx_memcpy(&local_next_frame, next_frame, sizeof(*next_frame));
+
+        // Note the starting file position so we can reset to it
+        start_pos = cpi->stats_in;
+
+        // Examine how well the key frame predicts subsequent frames
+        for (i = 0 ; i < 16; i++)
+        {
+            next_iiratio = (IIKFACTOR1 * local_next_frame.intra_error / DOUBLE_DIVIDE_CHECK(local_next_frame.coded_error)) ;
+
+            if (next_iiratio > RMAX)
+                next_iiratio = RMAX;
+
+            // Cumulative effect of decay in prediction quality
+            if (local_next_frame.pcnt_inter > 0.85)
+                decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+            else
+                decay_accumulator = decay_accumulator * ((0.85 + local_next_frame.pcnt_inter) / 2.0);
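+            // Averaging with 0.85 softens the decay for poorly predicted
+            // frames; e.g. pcnt_inter = 0.65 decays by 0.75 rather than 0.65.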
+
+            //decay_accumulator = decay_accumulator * local_next_frame.pcnt_inter;
+
+            // Keep a running total
+            boost_score += (decay_accumulator * next_iiratio);
+
+            // Test various breakout clauses
+            if ((local_next_frame.pcnt_inter < 0.05) ||
+                (next_iiratio < 1.5) ||
+                ((local_next_frame.pcnt_inter < 0.20) && (next_iiratio < 3.0)) ||
+                ((boost_score - old_boost_score) < 0.5) ||
+                (local_next_frame.intra_error < 200)
+               )
+            {
+                break;
+            }
+
+            old_boost_score = boost_score;
+
+            // Get the next frame details
+            if (EOF == vp8_input_stats(cpi, &local_next_frame))
+                break;
+        }
+
+        // If there is tolerable prediction for at least the next 3 frames then accept it as a key frame, else discard this potential key frame and move on
+        if (boost_score > 5.0 && (i > 3))
+            is_viable_kf = TRUE;
+        else
+        {
+            // Reset the file position
+            reset_fpf_position(cpi, start_pos);
+
+            is_viable_kf = FALSE;
+        }
+    }
+
+    return is_viable_kf;
+}
+
+void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
+{
+    int i;
+    FIRSTPASS_STATS last_frame;
+    FIRSTPASS_STATS first_frame;
+    FIRSTPASS_STATS next_frame;
+    FIRSTPASS_STATS *start_position;
+
+    double decay_accumulator = 0;
+    double boost_score = 0;
+    double old_boost_score = 0.0;
+    double loop_decay_rate;
+
+    double kf_mod_err = 0.0;
+    double kf_group_err = 0.0;
+    double kf_group_intra_err = 0.0;
+    double kf_group_coded_err = 0.0;
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+    vp8_clear_system_state();  //__asm emms;
+    start_position = cpi->stats_in;
+
+    cpi->common.frame_type = KEY_FRAME;
+
+    // Clear the alt ref active flag as this can never be active on a key frame
+    cpi->source_alt_ref_active = FALSE;
+
+    // Kf is always a gf so clear frames till next gf counter
+    cpi->frames_till_gf_update_due = 0;
+
+    cpi->frames_to_key = 1;
+
+    // Take a copy of the initial frame details
+    vpx_memcpy(&first_frame, this_frame, sizeof(*this_frame));
+
+    cpi->kf_group_bits = 0;       // Estimate of total bits available to kf group
+    cpi->kf_group_error_left = 0;  // Group modified error score.
+
+    kf_mod_err = calculate_modified_err(cpi, this_frame);
+
+    // find the next keyframe
+    while (cpi->stats_in < cpi->stats_in_end)
+    {
+        // Accumulate kf group error
+        kf_group_err += calculate_modified_err(cpi, this_frame);
+
+        // These figures keep intra and coded error counts for all frames including key frames in the group.
+        // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+        kf_group_intra_err += this_frame->intra_error;
+        kf_group_coded_err += this_frame->coded_error;
+
+        vpx_memcpy(&last_frame, this_frame, sizeof(*this_frame));
+
+        // Provided that we are not at the end of the file...
+        if (EOF != vp8_input_stats(cpi, this_frame))
+        {
+            if (lookup_next_frame_stats(cpi, &next_frame) != EOF)
+            {
+                if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame))
+                    break;
+            }
+        }
+
+        // Step on to the next frame
+        cpi->frames_to_key ++;
+
+        // If we don't have a real key frame within the next two
+        // key_frame_frequency intervals then break out of the loop.
+        if (cpi->frames_to_key >= 2 *(int)cpi->key_frame_frequency)
+            break;
+
+    }
+
+    // If there is a max kf interval set by the user we must obey it.
+    // We already breakout of the loop above at 2x max.
+    // This code centers the extra kf if the actual natural
+    // interval is between 1x and 2x
+    if (cpi->frames_to_key > (int)cpi->key_frame_frequency)
+    {
+        cpi->frames_to_key /= 2;
+
+        // Estimate corrected kf group error
+        kf_group_err /= 2.0;
+        kf_group_intra_err /= 2.0;
+        kf_group_coded_err /= 2.0;
+    }
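+    // e.g. with key_frame_frequency = 100 and a natural interval of 150, the
+    // interval is halved to 75 so the forced key frame lands mid-way rather
+    // than producing one short and one full-length group.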
+
+    // Special case for the last frame of the file
+    if (cpi->stats_in >= cpi->stats_in_end)
+    {
+        // Accumulate kf group error
+        kf_group_err += calculate_modified_err(cpi, this_frame);
+
+        // These figures keep intra and coded error counts for all frames including key frames in the group.
+        // The effect of the key frame itself can be subtracted out using the first_frame data collected above
+        kf_group_intra_err += this_frame->intra_error;
+        kf_group_coded_err += this_frame->coded_error;
+    }
+
+    // Calculate the number of bits that should be assigned to the kf group.
+    if ((cpi->bits_left > 0) && ((int)cpi->modified_total_error_left > 0))
+    {
+        int max_bits = frame_max_bits(cpi);    // Max for a single normal frame (not key frame)
+
+        // Default allocation based on bits left and relative complexity of the section
+        cpi->kf_group_bits = (int)(cpi->bits_left * (kf_group_err / cpi->modified_total_error_left));
+
+        // Clip based on maximum per frame rate defined by the user.
+        if (cpi->kf_group_bits > max_bits * cpi->frames_to_key)
+            cpi->kf_group_bits = max_bits * cpi->frames_to_key;
+
+        // Additional special case for CBR if buffer is getting full.
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            // If the buffer is near or above the optimal and this kf group is not being allocated much
+            // then increase the allocation a bit.
+            if (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level)
+            {
+                int high_water_mark = (cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1;
+                int min_group_bits;
+
+                // We are at or above the maximum.
+                if (cpi->buffer_level >= high_water_mark)
+                {
+                    min_group_bits = (cpi->av_per_frame_bandwidth * cpi->frames_to_key) + (cpi->buffer_level - high_water_mark);
+
+                    if (cpi->kf_group_bits < min_group_bits)
+                        cpi->kf_group_bits = min_group_bits;
+                }
+                // We are above optimal but below the maximum
+                else if (cpi->kf_group_bits < (cpi->av_per_frame_bandwidth * cpi->frames_to_key))
+                {
+                    int bits_below_av = (cpi->av_per_frame_bandwidth * cpi->frames_to_key) - cpi->kf_group_bits;
+                    cpi->kf_group_bits += (int)((double)bits_below_av * (double)(cpi->buffer_level - cpi->oxcf.optimal_buffer_level) /
+                                                (double)(high_water_mark - cpi->oxcf.optimal_buffer_level));
+                }
+            }
+        }
+    }
+    else
+        cpi->kf_group_bits = 0;
+
+    // Reset the first pass file position
+    reset_fpf_position(cpi, start_position);
+
+    // determine how big to make this keyframe based on how well the subsequent frames use inter blocks
+    decay_accumulator = 1.0;
+    boost_score = 0.0;
+    loop_decay_rate = 1.00;       // Starting decay rate
+
+    for (i = 0 ; i < cpi->frames_to_key ; i++)
+    {
+        double r;
+
+        if (EOF == vp8_input_stats(cpi, &next_frame))
+            break;
+
+        r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)) ;
+
+        if (r > RMAX)
+            r = RMAX;
+
+        // Adjust loop decay rate
+        //if ( next_frame.pcnt_inter < loop_decay_rate )
+        loop_decay_rate = next_frame.pcnt_inter;
+
+        if ((1.0 - (next_frame.pcnt_motion / 10.0)) < loop_decay_rate)
+            loop_decay_rate = (1.0 - (next_frame.pcnt_motion / 10.0));
+
+        decay_accumulator = decay_accumulator * loop_decay_rate;
+
+        boost_score += (decay_accumulator * r);
+
+        if ((i > MIN_GF_INTERVAL) &&
+            ((boost_score - old_boost_score) < 1.0))
+        {
+            break;
+        }
+
+        old_boost_score = boost_score;
+    }
+
+    if (1)
+    {
+        FIRSTPASS_STATS sectionstats;
+        double Ratio;
+
+        vp8_zero_stats(&sectionstats);
+        reset_fpf_position(cpi, start_position);
+
+        for (i = 0 ; i < cpi->frames_to_key ; i++)
+        {
+            vp8_input_stats(cpi, &next_frame);
+            vp8_accumulate_stats(&sectionstats, &next_frame);
+        }
+
+        vp8_avg_stats(&sectionstats);
+
+        if (sectionstats.pcnt_motion < .17)
+            cpi->section_is_low_motion = 1;
+        else
+            cpi->section_is_low_motion = 0;
+
+        if (sectionstats.mvc_abs + sectionstats.mvr_abs > 45)
+            cpi->section_is_fast_motion = 1;
+        else
+            cpi->section_is_fast_motion = 0;
+
+        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+
+        Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        // if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
+        //{
+        cpi->section_max_qfactor = 1.0 - ((Ratio - 10.0) * 0.025);
+
+        if (cpi->section_max_qfactor < 0.80)
+            cpi->section_max_qfactor = 0.80;
+
+        //}
+        //else
+        //    cpi->section_max_qfactor = 1.0;
+    }
+
+    // When using CBR apply additional buffer fullness related upper limits
+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+    {
+        double max_boost;
+
+        if (cpi->drop_frames_allowed)
+        {
+            int df_buffer_level = cpi->oxcf.drop_frames_water_mark * (cpi->oxcf.optimal_buffer_level / 100);
+
+            if (cpi->buffer_level > df_buffer_level)
+                max_boost = ((double)((cpi->buffer_level - df_buffer_level) * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+            else
+                max_boost = 0.0;
+        }
+        else if (cpi->buffer_level > 0)
+        {
+            max_boost = ((double)(cpi->buffer_level * 2 / 3) * 16.0) / DOUBLE_DIVIDE_CHECK((double)cpi->av_per_frame_bandwidth);
+        }
+        else
+        {
+            max_boost = 0.0;
+        }
+
+        if (boost_score > max_boost)
+            boost_score = max_boost;
+    }
+
+    // Reset the first pass file position
+    reset_fpf_position(cpi, start_position);
+
+    // Work out how many bits to allocate for the key frame itself
+    if (1)
+    {
+        int kf_boost = (int)boost_score;
+        int allocation_chunks;
+        int Counter = cpi->frames_to_key;
+        int alt_kf_bits;
+
+        // Min boost based on kf interval
+#if 0
+
+        while ((kf_boost < 48) && (Counter > 0))
+        {
+            Counter -= 2;
+            kf_boost ++;
+        }
+
+#endif
+
+        if (kf_boost < 48)
+        {
+            kf_boost += ((Counter + 1) >> 1);
+
+            if (kf_boost > 48) kf_boost = 48;
+        }
+
+        // bigger frame sizes need larger kf boosts, smaller frames smaller boosts...
+        if ((cpi->common.last_frame.y_width  * cpi->common.last_frame.y_height) > (320 * 240))
+            kf_boost += 2 * (cpi->common.last_frame.y_width  * cpi->common.last_frame.y_height) / (320 * 240);
+        else if ((cpi->common.last_frame.y_width  * cpi->common.last_frame.y_height) < (320 * 240))
+            kf_boost -= 4 * (320 * 240) / (cpi->common.last_frame.y_width  * cpi->common.last_frame.y_height);
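+        // e.g. 640x480 is 4x the 320x240 reference area, adding 2 * 4 = 8 to
+        // kf_boost, while smaller images are penalised more steeply (factor 4).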
+
+        kf_boost = (int)((double)kf_boost * 100.0) >> 4;                          // Scale 16 to 100
+
+        // Adjustment to boost based on recent average q
+        kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+
+        if (kf_boost < 250)                                                      // Min KF boost
+            kf_boost = 250;
+
+        // We do three calculations for kf size.
+        // The first is based on the error score for the whole kf group.
+        // The second (optionally) on the key frame's own error if this is smaller than the average for the group.
+        // The final one ensures that the frame receives at least the allocation it would have received based on its own error score vs the error score remaining.
+
+        allocation_chunks = ((cpi->frames_to_key - 1) * 100) + kf_boost;           // cpi->frames_to_key-1 because key frame itself is taken care of by kf_boost
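+        // e.g. frames_to_key = 50 and kf_boost = 500 give 4900 + 500 = 5400
+        // chunks, so the key frame gets 500/5400, about 9% of the group bits.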
+
+        // Normalize kf_boost and allocation chunks down to prevent overflow
+        while (kf_boost > 1000)
+        {
+            kf_boost /= 2;
+            allocation_chunks /= 2;
+        }
+
+        cpi->kf_group_bits = (cpi->kf_group_bits < 0) ? 0 : cpi->kf_group_bits;
+
+        // Calculate the number of bits to be spent on the key frame
+        cpi->kf_bits  = (int)((double)kf_boost * ((double)cpi->kf_group_bits / (double)allocation_chunks));
+
+        // Apply an additional limit for CBR
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            if (cpi->kf_bits > ((3 * cpi->buffer_level) >> 2))
+                cpi->kf_bits = (3 * cpi->buffer_level) >> 2;
+        }
+
+        // If the key frame is actually easier than the average for the kf group (which does sometimes happen, e.g. a blank intro frame)
+        // then use an alternate calculation based on the kf error score, which should give a smaller key frame.
+        if (kf_mod_err < kf_group_err / cpi->frames_to_key)
+        {
+            double  alt_kf_grp_bits = ((double)cpi->bits_left * (kf_mod_err * (double)cpi->frames_to_key) / cpi->modified_total_error_left) ;
+
+            alt_kf_bits = (int)((double)kf_boost * (alt_kf_grp_bits / (double)allocation_chunks));
+
+            if (cpi->kf_bits > alt_kf_bits)
+            {
+                cpi->kf_bits = alt_kf_bits;
+            }
+        }
+        // Else if it is much harder than other frames in the group make sure it at least receives an allocation in keeping with its relative error score
+        else
+        {
+            alt_kf_bits = (int)((double)cpi->bits_left * (kf_mod_err / cpi->modified_total_error_left));
+
+            if (alt_kf_bits > cpi->kf_bits)
+            {
+                cpi->kf_bits = alt_kf_bits;
+            }
+        }
+
+        cpi->kf_group_bits -= cpi->kf_bits;
+        cpi->kf_bits += cpi->min_frame_bandwidth;                                          // Add in the minimum frame allowance
+
+        cpi->per_frame_bandwidth = cpi->kf_bits;                                           // Per frame bit target for this frame
+        cpi->target_bandwidth = cpi->kf_bits * cpi->output_frame_rate;                      // Convert to a per second bitrate
+    }
+
+    // Note the total error score of the kf group minus the key frame itself
+    cpi->kf_group_error_left = (int)(kf_group_err - kf_mod_err);
+
+    // Adjust the count of total modified error left.
+    // The count of bits left is adjusted elsewhere based on real coded frame sizes
+    cpi->modified_total_error_left -= kf_group_err;
+
+    if (cpi->oxcf.allow_spatial_resampling)
+    {
+        int resample_trigger = FALSE;
+        int last_kf_resampled = FALSE;
+        int kf_q;
+        int scale_val = 0;
+        int hr, hs, vr, vs;
+        int new_width = cpi->oxcf.Width;
+        int new_height = cpi->oxcf.Height;
+
+        int projected_buffer_level = cpi->buffer_level;
+        int tmp_q;
+
+        double projected_bits_perframe;
+        double group_iiratio = (kf_group_intra_err - first_frame.intra_error) / (kf_group_coded_err - first_frame.coded_error);
+        double err_per_frame = kf_group_err / cpi->frames_to_key;
+        double bits_per_frame;
+        double av_bits_per_frame;
+        double effective_size_ratio;
+
+        if ((cpi->common.Width != cpi->oxcf.Width) || (cpi->common.Height != cpi->oxcf.Height))
+            last_kf_resampled = TRUE;
+
+        // Set back to unscaled by default
+        cpi->common.horiz_scale = NORMAL;
+        cpi->common.vert_scale = NORMAL;
+
+        // Calculate Average bits per frame.
+        //av_bits_per_frame = cpi->bits_left/(double)(cpi->total_stats.count - cpi->common.current_video_frame);
+        av_bits_per_frame = cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate);
+        //if ( av_bits_per_frame < 0.0 )
+        //  av_bits_per_frame = 0.0
+
+        // CBR... Use the clip average as the target for deciding resample
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            bits_per_frame = av_bits_per_frame;
+        }
+
+        // In VBR we want to avoid downsampling in easy sections unless we are under extreme pressure,
+        // so use the larger of the target bitrate for this section or the average bitrate for the sequence
+        else
+        {
+            bits_per_frame = cpi->kf_group_bits / cpi->frames_to_key;     // This accounts for how hard the section is...
+
+            if (bits_per_frame < av_bits_per_frame)                      // Don't resort to resampling in easy sections just because they have been assigned a small number of bits
+                bits_per_frame = av_bits_per_frame;
+        }
+
+        // bits_per_frame should comply with our minimum
+        if (bits_per_frame < (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100))
+            bits_per_frame = (cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+        // Work out if spatial resampling is necessary
+        kf_q = estimate_kf_group_q(cpi, err_per_frame, bits_per_frame, new_height, new_width, group_iiratio);
+
+        // If we project a required Q higher than the maximum allowed Q then make a guess at the actual size of frames in this section
+        projected_bits_perframe = bits_per_frame;
+        tmp_q = kf_q;
+
+        while (tmp_q > cpi->worst_quality)
+        {
+            projected_bits_perframe *= 1.04;
+            tmp_q--;
+        }
+
+        // Guess at buffer level at the end of the section
+        projected_buffer_level = cpi->buffer_level - (int)((projected_bits_perframe - av_bits_per_frame) * cpi->frames_to_key);
+
+        if (0)
+        {
+            FILE *f = fopen("Subsamle.stt", "a");
+            fprintf(f, " %8d %8d %8d %8d %12.0f %8d %8d %8d\n",  cpi->common.current_video_frame, kf_q, cpi->common.horiz_scale, cpi->common.vert_scale,  kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width);
+            fclose(f);
+        }
+
+        // The trigger for spatial resampling depends on various parameters, such as whether we are streaming (CBR) or VBR.
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            // Trigger a resample if we are projected to fall below the down-sample level, or
+            // we resampled last time and are projected to remain below the up-sample level
+            if ((projected_buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100)) ||
+                (last_kf_resampled && (projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))))
+                //( ((cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))) &&
+                //  ((projected_buffer_level < (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))) ))
+                resample_trigger = TRUE;
+            else
+                resample_trigger = FALSE;
+        }
+        else
+        {
+            long long clip_bits = (long long)(cpi->total_stats.count * cpi->oxcf.target_bandwidth / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.frame_rate));
+            long long over_spend = cpi->oxcf.starting_buffer_level - cpi->buffer_level;
+            long long over_spend2 = cpi->oxcf.starting_buffer_level - projected_buffer_level;
+
+            if ((last_kf_resampled && (kf_q > cpi->worst_quality)) ||                                               // If triggered last time the threshold for triggering again is reduced
+                ((kf_q > cpi->worst_quality) &&                                                                  // Projected Q higher than allowed and ...
+                 (over_spend > clip_bits / 20)))                                                               // ... Overspend > 5% of total bits
+                resample_trigger = TRUE;
+            else
+                resample_trigger = FALSE;
+
+        }
+
+        if (resample_trigger)
+        {
+            while ((kf_q >= cpi->worst_quality) && (scale_val < 6))
+            {
+                scale_val ++;
+
+                cpi->common.vert_scale   = vscale_lookup[scale_val];
+                cpi->common.horiz_scale  = hscale_lookup[scale_val];
+
+                Scale2Ratio(cpi->common.horiz_scale, &hr, &hs);
+                Scale2Ratio(cpi->common.vert_scale, &vr, &vs);
+
+                new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+                new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+                // Reducing the area to 1/4 does not reduce the complexity (err_per_frame) to 1/4...
+                // effective_size_ratio attempts to provide a crude correction for this
+                effective_size_ratio = (double)(new_width * new_height) / (double)(cpi->oxcf.Width * cpi->oxcf.Height);
+                effective_size_ratio = (1.0 + (3.0 * effective_size_ratio)) / 4.0;
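+                // For example, halving both dimensions gives a raw area ratio of
+                // 0.25, which this maps to an effective complexity ratio of
+                // (1.0 + 3.0 * 0.25) / 4.0 = 0.4375: complexity is assumed to
+                // fall more slowly than area.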
+
+                // Now try again and see what Q we get with the smaller image size
+                kf_q = estimate_kf_group_q(cpi, err_per_frame * effective_size_ratio, bits_per_frame, new_height, new_width, group_iiratio);
+
+                if (0)
+                {
+                    FILE *f = fopen("Subsamle.stt", "a");
+                    fprintf(f, "******** %8d %8d %8d %12.0f %8d %8d %8d\n",  kf_q, cpi->common.horiz_scale, cpi->common.vert_scale,  kf_group_err / cpi->frames_to_key, cpi->kf_group_bits / cpi->frames_to_key, new_height, new_width);
+                    fclose(f);
+                }
+            }
+        }
+
+        if ((cpi->common.Width != new_width) || (cpi->common.Height != new_height))
+        {
+            cpi->common.Width = new_width;
+            cpi->common.Height = new_height;
+            vp8_alloc_compressor_data(cpi);
+        }
+    }
+}
diff --git a/vp8/encoder/firstpass.h b/vp8/encoder/firstpass.h
new file mode 100644
index 0000000..d7b52f3
--- /dev/null
+++ b/vp8/encoder/firstpass.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#if !defined __INC_FIRSTPASS_H
+#define      __INC_FIRSTPASS_H
+
+extern void vp8_init_first_pass(VP8_COMP *cpi);
+extern void vp8_first_pass(VP8_COMP *cpi);
+extern void vp8_end_first_pass(VP8_COMP *cpi);
+
+extern void vp8_init_second_pass(VP8_COMP *cpi);
+extern void vp8_second_pass(VP8_COMP *cpi);
+extern void vp8_end_second_pass(VP8_COMP *cpi);
+
+#endif
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
new file mode 100644
index 0000000..52aab66
--- /dev/null
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+
+void vp8_arch_x86_encoder_init(VP8_COMP *cpi);
+
+
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction);
+
+void vp8_cmachine_specific_config(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    cpi->rtcd.common                    = &cpi->common.rtcd;
+    cpi->rtcd.variance.sad16x16              = vp8_sad16x16_c;
+    cpi->rtcd.variance.sad16x8               = vp8_sad16x8_c;
+    cpi->rtcd.variance.sad8x16               = vp8_sad8x16_c;
+    cpi->rtcd.variance.sad8x8                = vp8_sad8x8_c;
+    cpi->rtcd.variance.sad4x4                = vp8_sad4x4_c;
+
+    cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_c;
+    cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_c;
+    cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_c;
+    cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_c;
+    cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_c;
+
+    cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_c;
+    cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_c;
+    cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_c;
+    cpi->rtcd.variance.sad8x8x4d             = vp8_sad8x8x4d_c;
+    cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_c;
+
+    cpi->rtcd.variance.var4x4                = vp8_variance4x4_c;
+    cpi->rtcd.variance.var8x8                = vp8_variance8x8_c;
+    cpi->rtcd.variance.var8x16               = vp8_variance8x16_c;
+    cpi->rtcd.variance.var16x8               = vp8_variance16x8_c;
+    cpi->rtcd.variance.var16x16              = vp8_variance16x16_c;
+
+    cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_c;
+    cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_c;
+    cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_c;
+    cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_c;
+    cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_c;
+    cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_c;
+
+    cpi->rtcd.variance.mse16x16              = vp8_mse16x16_c;
+    cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
+
+    cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
+    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
+    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;
+    cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
+
+    cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
+    cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_c;
+    cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_c;
+    cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_c;
+    cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
+
+    cpi->rtcd.encodemb.berr                  = vp8_block_error_c;
+    cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_c;
+    cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_c;
+    cpi->rtcd.encodemb.subb                  = vp8_subtract_b_c;
+    cpi->rtcd.encodemb.submby                = vp8_subtract_mby_c;
+    cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_c;
+
+    cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
+    cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
+
+    cpi->rtcd.search.full_search             = vp8_full_search_sad;
+    cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
+#endif
+
+    // Pure C:
+    vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
+
+
+#if ARCH_X86 || ARCH_X86_64
+    vp8_arch_x86_encoder_init(cpi);
+#endif
+
+}
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
new file mode 100644
index 0000000..d80059d
--- /dev/null
+++ b/vp8/encoder/mcomp.c
@@ -0,0 +1,1467 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "mcomp.h"
+#include "vpx_mem/vpx_mem.h"
+
+#include <stdio.h>
+#include <limits.h>
+#include <math.h>
+
+#ifdef ENTROPY_STATS
+static int mv_ref_ct [31] [4] [2];
+static int mv_mode_cts [4] [2];
+#endif
+
+static int mv_bits_sadcost[256];
+
+void vp8cx_init_mv_bits_sadcost()
+{
+    int i;
+
+    for (i = 0; i < 256; i++)
+    {
+        mv_bits_sadcost[i] = (int)sqrt(i * 16);
+    }
+}
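+// The table above maps an estimated bit count i to a SAD-domain cost of
+// sqrt(16 * i), a sub-linear penalty: i = 4 -> 8, i = 16 -> 16, i = 64 -> 32.
+// Note that within this file the table is only referenced from commented-out
+// code in vp8_full_search_sad.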
+
+
+int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
+{
+    // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
+    // overstate the cost of vectors. In addition, coding a new vector can have a knock-on effect on the
+    // cost of subsequent vectors and on the quality of prediction from NEAR and NEAREST for subsequent blocks.
+    // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors.
+    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
+}
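+// Worked example with illustrative table values: component costs of 300 and
+// 200 and Weight = 128 give ((300 + 200) * 128) >> 7 = 500, i.e. Weight = 128
+// leaves the summed table cost unscaled, while Weight = 64 would halve it.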
+
+int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
+{
+    //int i;
+    //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
+    //return ( (vp8_mv_bit_cost(mv,  ref, mvcost, 100) + 128) * error_per_bit) >> 8;
+
+    //i = (vp8_mv_bit_cost(mv,  ref, mvcost, 100) * error_per_bit + 128) >> 8;
+    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8;
+    //return (vp8_mv_bit_cost(mv,  ref, mvcost, 128) * error_per_bit + 128) >> 8;
+}
+
+
+static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
+{
+    // Get the estimated number of bits for a motion vector, to be used for costing in SAD-based
+    // motion estimation
+    return ((mvcost[0][(mv->row - ref->row) >> 1]  +  mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8;
+}
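+// The mvcost tables appear to store costs in 1/256-bit fixed point, so the
+// "+ 128" rounds and the ">> 8" converts the summed cost to whole bits.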
+
+void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
+{
+    int Len;
+    int search_site_count = 0;
+
+
+    // Generate offsets for 4 search sites per step.
+    Len = MAX_FIRST_STEP;
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = 0;
+    search_site_count++;
+
+    while (Len > 0)
+    {
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = -Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = Len;
+        search_site_count++;
+
+        // Contract.
+        Len /= 2;
+    }
+
+    x->ss_count = search_site_count;
+    x->searches_per_step = 4;
+}
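+// The loop above lays out a small-diamond pattern: after the zero-MV entry,
+// each step size Len contributes four sites (up, down, left, right) and Len
+// halves each pass. With MAX_FIRST_STEP = 128 that is 8 passes, giving
+// 1 + 4 * 8 = 33 sites in total.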
+
+void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
+{
+    int Len;
+    int search_site_count = 0;
+
+    // Generate offsets for 8 search sites per step.
+    Len = MAX_FIRST_STEP;
+    x->ss[search_site_count].mv.col = 0;
+    x->ss[search_site_count].mv.row = 0;
+    x->ss[search_site_count].offset = 0;
+    search_site_count++;
+
+    while (Len > 0)
+    {
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = 0;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = -Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = 0;
+        x->ss[search_site_count].offset = Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride - Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = -Len;
+        x->ss[search_site_count].offset = -Len * stride + Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = -Len;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride - Len;
+        search_site_count++;
+
+        // Compute offsets for search sites.
+        x->ss[search_site_count].mv.col = Len;
+        x->ss[search_site_count].mv.row = Len;
+        x->ss[search_site_count].offset = Len * stride + Len;
+        search_site_count++;
+
+
+        // Contract.
+        Len /= 2;
+    }
+
+    x->ss_count = search_site_count;
+    x->searches_per_step = 8;
+}
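+// As above, but with four diagonal sites added per pass: with
+// MAX_FIRST_STEP = 128 this gives 1 + 8 * 8 = 65 sites in total.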
+
+
+#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to the full-pel predictor base for a motion vector
+#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
+#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns the sub-pixel variance error for the candidate
+#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
+#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
+#define MIN(x,y) (((x)<(y))?(x):(y))
+#define MAX(x,y) (((x)>(y))?(x):(y))
+
+//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
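+// In the macros above, (r,c) are in quarter-pel units: PRE() drops to the
+// containing full-pel predictor address via >>2, and SP() converts the
+// quarter-pel phase ((x)&3) to the eighth-pel phase (<<1) expected by the
+// sub-pixel variance function svf.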
+
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+    unsigned char *z = (*(b->base_src) + b->src);
+
+    int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1;
+    int br = bestmv->row << 2, bc = bestmv->col << 2;
+    int tr = br, tc = bc;
+    unsigned int besterr = INT_MAX;
+    unsigned int left, right, up, down, diag;
+    unsigned int sse;
+    unsigned int whichdir;
+    unsigned int halfiters = 4;
+    unsigned int quarteriters = 4;
+
+    int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
+    int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
+    int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1));
+    int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1));
+
+    // central mv
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+
+    // calculate central point error
+    besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // TODO: Each subsequent iteration checks at least one point in common with the last iteration; it could be two if the diagonal was selected.
+    while (--halfiters)
+    {
+        // 1/2 pel
+        CHECK_BETTER(left, tr, tc - 2);
+        CHECK_BETTER(right, tr, tc + 2);
+        CHECK_BETTER(up, tr - 2, tc);
+        CHECK_BETTER(down, tr + 2, tc);
+
+        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
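+        // whichdir encodes the quadrant to probe diagonally: bit 0 set means
+        // right beat left, bit 1 set means down beat up, so values 0..3 select
+        // the up-left, up-right, down-left and down-right diagonals below.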
+
+        switch (whichdir)
+        {
+        case 0:
+            CHECK_BETTER(diag, tr - 2, tc - 2);
+            break;
+        case 1:
+            CHECK_BETTER(diag, tr - 2, tc + 2);
+            break;
+        case 2:
+            CHECK_BETTER(diag, tr + 2, tc - 2);
+            break;
+        case 3:
+            CHECK_BETTER(diag, tr + 2, tc + 2);
+            break;
+        }
+
+        // no reason to check the same one again.
+        if (tr == br && tc == bc)
+            break;
+
+        tr = br;
+        tc = bc;
+    }
+
+    // TODO: Each subsequent iteration checks at least one point in common with the last iteration; it could be two if the diagonal was selected.
+    // 1/4 pel
+    while (--quarteriters)
+    {
+        CHECK_BETTER(left, tr, tc - 1);
+        CHECK_BETTER(right, tr, tc + 1);
+        CHECK_BETTER(up, tr - 1, tc);
+        CHECK_BETTER(down, tr + 1, tc);
+
+        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+        switch (whichdir)
+        {
+        case 0:
+            CHECK_BETTER(diag, tr - 1, tc - 1);
+            break;
+        case 1:
+            CHECK_BETTER(diag, tr - 1, tc + 1);
+            break;
+        case 2:
+            CHECK_BETTER(diag, tr + 1, tc - 1);
+            break;
+        case 3:
+            CHECK_BETTER(diag, tr + 1, tc + 1);
+            break;
+        }
+
+        // no reason to check the same one again.
+        if (tr == br && tc == bc)
+            break;
+
+        tr = br;
+        tc = bc;
+    }
+
+    bestmv->row = br << 1;
+    bestmv->col = bc << 1;
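+    // br/bc were refined in quarter-pel units; the << 1 above converts the
+    // result to the eighth-pel units used in bestmv.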
+
+    if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL))
+        return INT_MAX;
+
+    return besterr;
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+#undef MIN
+#undef MAX
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    int bestmse = INT_MAX;
+    MV startmv;
+    //MV this_mv;
+    MV this_mv;
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+    unsigned char *z = (*(b->base_src) + b->src);
+    int left, right, up, down, diag;
+    unsigned int sse;
+    int whichdir;
+
+
+    // Trap uncodable vectors
+    if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+    {
+        bestmv->row <<= 3;
+        bestmv->col <<= 3;
+        return INT_MAX;
+    }
+
+    // central mv
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+    startmv = *bestmv;
+
+    // calculate central point error
+    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // go left then right and check error
+    this_mv.row = startmv.row;
+    this_mv.col = ((startmv.col - 8) | 4);
+    left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+    }
+
+    this_mv.col += 8;
+    right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+    }
+
+    // go up then down and check error
+    this_mv.col = startmv.col;
+    this_mv.row = ((startmv.row - 8) | 4);
+    up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+    }
+
+    this_mv.row += 8;
+    down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+    }
+
+
+    // now check 1 more diagonal
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    // whichdir must be 0-3, so one of the cases below always runs.
+    // However, because there is no default case and diag is not set
+    // elsewhere, we initialise it to avoid a compile warning.
+    diag = 0;
+    //for(whichdir =0;whichdir<4;whichdir++)
+    //{
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 1:
+        this_mv.col += 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 2:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row += 4;
+        diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 3:
+        this_mv.col += 4;
+        this_mv.row += 4;
+        diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+//  }
+
+
+    // time to check quarter pels.
+    if (bestmv->row < startmv.row)
+        y -= d->pre_stride;
+
+    if (bestmv->col < startmv.col)
+        y--;
+
+    startmv = *bestmv;
+
+
+
+    // go left then right and check error
+    this_mv.row = startmv.row;
+
+    if (startmv.col & 7)
+    {
+        this_mv.col = startmv.col - 2;
+        left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    }
+    else
+    {
+        this_mv.col = (startmv.col - 8) | 6;
+        left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+    }
+
+    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+    }
+
+    this_mv.col += 4;
+    right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+    }
+
+    // go up then down and check error
+    this_mv.col = startmv.col;
+
+    if (startmv.row & 7)
+    {
+        this_mv.row = startmv.row - 2;
+        up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    }
+    else
+    {
+        this_mv.row = (startmv.row - 8) | 6;
+        up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+    }
+
+    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+    }
+
+    this_mv.row += 4;
+    down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+    }
+
+
+    // now check 1 more diagonal
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+//  for(whichdir=0;whichdir<4;whichdir++)
+//  {
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0:
+
+        if (startmv.row & 7)
+        {
+            this_mv.row -= 2;
+
+            if (startmv.col & 7)
+            {
+                this_mv.col -= 2;
+                diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            }
+            else
+            {
+                this_mv.col = (startmv.col - 8) | 6;
+                diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+            }
+        }
+        else
+        {
+            this_mv.row = (startmv.row - 8) | 6;
+
+            if (startmv.col & 7)
+            {
+                this_mv.col -= 2;
+                diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+            }
+            else
+            {
+                this_mv.col = (startmv.col - 8) | 6;
+                diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+            }
+        }
+
+        break;
+    case 1:
+        this_mv.col += 2;
+
+        if (startmv.row & 7)
+        {
+            this_mv.row -= 2;
+            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        }
+        else
+        {
+            this_mv.row = (startmv.row - 8) | 6;
+            diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+        }
+
+        break;
+    case 2:
+        this_mv.row += 2;
+
+        if (startmv.col & 7)
+        {
+            this_mv.col -= 2;
+            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        }
+        else
+        {
+            this_mv.col = (startmv.col - 8) | 6;
+            diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+        }
+
+        break;
+    case 3:
+        this_mv.col += 2;
+        this_mv.row += 2;
+        diag = svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+//  }
+
+    return bestmse;
+}
+
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    int bestmse = INT_MAX;
+    MV startmv;
+    //MV this_mv;
+    MV this_mv;
+    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
+    unsigned char *z = (*(b->base_src) + b->src);
+    int left, right, up, down, diag;
+    unsigned int sse;
+
+    // Trap uncodable vectors
+    if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
+    {
+        bestmv->row <<= 3;
+        bestmv->col <<= 3;
+        return INT_MAX;
+    }
+
+    // central mv
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+    startmv = *bestmv;
+
+    // calculate central point error
+    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
+
+    // go left then right and check error
+    this_mv.row = startmv.row;
+    this_mv.col = ((startmv.col - 8) | 4);
+    left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (left < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = left;
+    }
+
+    this_mv.col += 8;
+    right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (right < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = right;
+    }
+
+    // go up then down and check error
+    this_mv.col = startmv.col;
+    this_mv.row = ((startmv.row - 8) | 4);
+    up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (up < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = up;
+    }
+
+    this_mv.row += 8;
+    down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (down < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = down;
+    }
+
+    // Somewhat strangely, not checking all the diagonals for half pel is slower than checking them all.
+#if 0
+    // now check 1 more diagonal -
+    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+    this_mv = startmv;
+
+    switch (whichdir)
+    {
+    case 0:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 1:
+        this_mv.col += 4;
+        this_mv.row = (this_mv.row - 8) | 4;
+        diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 2:
+        this_mv.col = (this_mv.col - 8) | 4;
+        this_mv.row += 4;
+        diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    case 3:
+        this_mv.col += 4;
+        this_mv.row += 4;
+        diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        break;
+    }
+
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+#else
+    this_mv.col = (this_mv.col - 8) | 4;
+    this_mv.row = (this_mv.row - 8) | 4;
+    diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+    this_mv.col += 8;
+    diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+    this_mv.col = (this_mv.col - 8) | 4;
+    this_mv.row = startmv.row + 4;
+    diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+    this_mv.col += 8;
+    diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+
+    if (diag < bestmse)
+    {
+        *bestmv = this_mv;
+        bestmse = diag;
+    }
+
+#endif
+    return bestmse;
+}
+
+
+#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to the predictor base for a motion vector
+#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns the SAD error score.
+#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
+#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
+
+int vp8_hex_search
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_t vf,
+    vp8_sad_fn_t      sf,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} };
+    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} };
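+    // hex[] traces a (roughly) hexagonal ring of radius 2 around the current
+    // centre, in full-pel units; neighbors[] is the 3x3 ring used for the
+    // final one-pel refinement pass below.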
+    int i, j;
+    unsigned char *src = (*(b->base_src) + b->src);
+    int src_stride = b->src_stride;
+    int rr = ref_mv->row, rc = ref_mv->col, br = rr, bc = rc, tr, tc;
+    unsigned int besterr, thiserr = 0x7fffffff;
+
+    if (rc < x->mv_col_min) bc = x->mv_col_min;
+
+    if (rc > x->mv_col_max) bc = x->mv_col_max;
+
+    if (rr < x->mv_row_min) br = x->mv_row_min;
+
+    if (rr > x->mv_row_max) br = x->mv_row_max;
+
+    rr >>= 1;
+    rc >>= 1;
+    br >>= 3;
+    bc >>= 3;
+
+    besterr = ERR(br, bc, thiserr);
+
+    // Hex search. jbb: changed the iteration limit to 127 to avoid the max-256 problem when stepping by 2.
+    for (j = 0; j < 127; j++)
+    {
+        tr = br;
+        tc = bc;
+
+        for (i = 0; i < 6; i++)
+        {
+            int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+            if (nc < x->mv_col_min) continue;
+
+            if (nc > x->mv_col_max) continue;
+
+            if (nr < x->mv_row_min) continue;
+
+            if (nr > x->mv_row_max) continue;
+
+            CHECK_BETTER(thiserr, nr, nc);
+        }
+
+        if (tr == br && tc == bc)
+            break;
+    }
+
+    // Check the 8 neighbors that are 1 away
+    tr = br;
+    tc = bc;
+
+    for (i = 0; i < 8; i++)
+    {
+        int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
+
+        if (nc < x->mv_col_min) continue;
+
+        if (nc > x->mv_col_max) continue;
+
+        if (nr < x->mv_row_min) continue;
+
+        if (nr > x->mv_row_max) continue;
+
+        CHECK_BETTER(thiserr, nr, nc);
+    }
+
+    best_mv->row = br;
+    best_mv->col = bc;
+
+    return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc);
+}
+#undef MVC
+#undef PRE
+#undef SP
+#undef DIST
+#undef ERR
+#undef CHECK_BETTER
+int vp8_diamond_search_sad
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_ptr_t *fn_ptr,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    int i, j, step;
+
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    unsigned char *best_address;
+
+    int tot_steps;
+    MV this_mv;
+
+    int bestsad = INT_MAX;
+    int best_site = 0;
+    int last_site = 0;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+    int this_row_offset;
+    int this_col_offset;
+    search_site *ss;
+
+    unsigned char *check_here;
+    int thissad;
+
+    // Work out the start point for the search
+    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+    best_address = in_what;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Check the starting position
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // search_param determines the length of the initial step and hence the number of iterations
+    // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+    ss = &x->ss[search_param * x->searches_per_step];
+    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
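+    // With either site layout above, tot_steps works out to 8 - search_param:
+    // e.g. search_param = 0 starts at the 128-pel ring and runs all 8 step
+    // sizes, while search_param = 3 starts at the 16-pel ring and runs 5.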
+
+    i = 1;
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    *num00 = 0;
+
+    for (step = 0; step < tot_steps ; step++)
+    {
+        for (j = 0 ; j < x->searches_per_step ; j++)
+        {
+            // Trap illegal vectors
+            this_row_offset = best_mv->row + ss[i].mv.row;
+            this_col_offset = best_mv->col + ss[i].mv.col;
+
+            if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
+            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
+            {
+                check_here = ss[i].offset + best_address;
+                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+                if (thissad < bestsad)
+                {
+                    this_mv.row = this_row_offset << 3;
+                    this_mv.col = this_col_offset << 3;
+                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_site = i;
+                    }
+                }
+            }
+
+            i++;
+        }
+
+        if (best_site != last_site)
+        {
+            best_mv->row += ss[best_site].mv.row;
+            best_mv->col += ss[best_site].mv.col;
+            best_address += ss[best_site].offset;
+            last_site = best_site;
+        }
+        else if (best_address == in_what)
+            (*num00)++;
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad == INT_MAX)
+        return INT_MAX;
+
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+int vp8_diamond_search_sadx4
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_ptr_t *fn_ptr,
+    int *mvsadcost[2],
+    int *mvcost[2]
+)
+{
+    int i, j, step;
+
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    unsigned char *best_address;
+
+    int tot_steps;
+    MV this_mv;
+
+    unsigned int bestsad = UINT_MAX;
+    int best_site = 0;
+    int last_site = 0;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+    int this_row_offset;
+    int this_col_offset;
+    search_site *ss;
+
+    unsigned char *check_here;
+    unsigned int thissad;
+
+    // Work out the start point for the search
+    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
+    best_address = in_what;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Check the starting position
+        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // search_param determines the length of the initial step and hence the number of iterations
+    // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
+    ss = &x->ss[search_param * x->searches_per_step];
+    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
+
+    i = 1;
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    *num00 = 0;
+
+    for (step = 0; step < tot_steps ; step++)
+    {
+        int check_row_min, check_col_min, check_row_max, check_col_max;
+
+        check_row_min = x->mv_row_min - best_mv->row;
+        check_row_max = x->mv_row_max - best_mv->row;
+        check_col_min = x->mv_col_min - best_mv->col;
+        check_col_max = x->mv_col_max - best_mv->col;
+
+        for (j = 0 ; j < x->searches_per_step ; j += 4)
+        {
+            unsigned char *block_offset[4];
+            unsigned int valid_block[4];
+            int all_in = 1, t;
+
+            for (t = 0; t < 4; t++)
+            {
+                valid_block [t]  = (ss[t+i].mv.col > check_col_min);
+                valid_block [t] &= (ss[t+i].mv.col < check_col_max);
+                valid_block [t] &= (ss[t+i].mv.row > check_row_min);
+                valid_block [t] &= (ss[t+i].mv.row < check_row_max);
+
+                all_in &= valid_block[t];
+                block_offset[t] = ss[i+t].offset + best_address;
+            }
+
+            if (all_in)
+            {
+                unsigned int sad_array[4];
+
+                fn_ptr->sdx4df(what, what_stride, block_offset, in_what_stride, sad_array);
+
+                for (t = 0; t < 4; t++, i++)
+                {
+                    thissad = sad_array[t];
+
+                    if (thissad < bestsad)
+                    {
+                        this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
+                        this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
+                        thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                        if (thissad < bestsad)
+                        {
+                            bestsad = thissad;
+                            best_site = i;
+                        }
+                    }
+                }
+            }
+            else
+            {
+                int t;
+
+                for (t = 0; t < 4; i++, t++)
+                {
+                    // Trap illegal vectors
+                    if (valid_block[t])
+                    {
+                        check_here = block_offset[t];
+                        thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+                        if (thissad < bestsad)
+                        {
+                            this_row_offset = best_mv->row + ss[i].mv.row;
+                            this_col_offset = best_mv->col + ss[i].mv.col;
+
+                            this_mv.row = this_row_offset << 3;
+                            this_mv.col = this_col_offset << 3;
+                            thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                            if (thissad < bestsad)
+                            {
+                                bestsad = thissad;
+                                best_site = i;
+                            }
+                        }
+                    }
+                }
+            }
+        }
+
+        if (best_site != last_site)
+        {
+            best_mv->row += ss[best_site].mv.row;
+            best_mv->col += ss[best_site].mv.col;
+            best_address += ss[best_site].offset;
+            last_site = best_site;
+        }
+        else if (best_address == in_what)
+            (*num00)++;
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad == UINT_MAX)
+        return INT_MAX;
+
+    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
+    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+}
+
+
+#if !(CONFIG_REALTIME_ONLY)
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    int mv_stride = d->pre_stride;
+    unsigned char *bestaddress;
+    MV *best_mv = &d->bmi.mv.as_mv;
+    MV this_mv;
+    int bestsad = INT_MAX;
+    int r, c;
+
+    unsigned char *check_here;
+    int thissad;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+
+    // Work out the mid point for the search
+    in_what = *(d->base_pre) + d->pre;
+    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Baseline value at the centre
+
+        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // Apply further limits to prevent us from searching with vectors that stretch beyond the UMV border
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.row = r << 3;
+        check_here = r * mv_stride + in_what + col_min;
+
+        for (c = col_min; c < col_max; c++)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+            this_mv.col = c << 3;
+            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
+            //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
+            thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+
+            if (thissad < bestsad)
+            {
+                bestsad = thissad;
+                best_mv->row = r;
+                best_mv->col = c;
+                bestaddress = check_here;
+            }
+
+            check_here++;
+        }
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad < INT_MAX)
+        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    else
+        return INT_MAX;
+}
+
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
+{
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what;
+    int in_what_stride = d->pre_stride;
+    int mv_stride = d->pre_stride;
+    unsigned char *bestaddress;
+    MV *best_mv = &d->bmi.mv.as_mv;
+    MV this_mv;
+    unsigned int bestsad = UINT_MAX;
+    int r, c;
+
+    unsigned char *check_here;
+    unsigned int thissad;
+
+    int ref_row = ref_mv->row >> 3;
+    int ref_col = ref_mv->col >> 3;
+
+    int row_min = ref_row - distance;
+    int row_max = ref_row + distance;
+    int col_min = ref_col - distance;
+    int col_max = ref_col + distance;
+
+    unsigned int sad_array[3];
+
+    // Work out the mid point for the search
+    in_what = *(d->base_pre) + d->pre;
+    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
+
+    best_mv->row = ref_row;
+    best_mv->col = ref_col;
+
+    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
+    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
+    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
+    {
+        // Baseline value at the centre
+        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
+    }
+
+    // Apply further limits to prevent us from searching with vectors that stretch beyond the UMV border
+    if (col_min < x->mv_col_min)
+        col_min = x->mv_col_min;
+
+    if (col_max > x->mv_col_max)
+        col_max = x->mv_col_max;
+
+    if (row_min < x->mv_row_min)
+        row_min = x->mv_row_min;
+
+    if (row_max > x->mv_row_max)
+        row_max = x->mv_row_max;
+
+    for (r = row_min; r < row_max ; r++)
+    {
+        this_mv.row = r << 3;
+        check_here = r * mv_stride + in_what + col_min;
+        c = col_min;
+
+        while ((c + 3) < col_max)
+        {
+            int i;
+
+            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
+
+            for (i = 0; i < 3; i++)
+            {
+                thissad = sad_array[i];
+
+                if (thissad < bestsad)
+                {
+                    this_mv.col = c << 3;
+                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                    if (thissad < bestsad)
+                    {
+                        bestsad = thissad;
+                        best_mv->row = r;
+                        best_mv->col = c;
+                        bestaddress = check_here;
+                    }
+                }
+
+                check_here++;
+                c++;
+            }
+        }
+
+        while (c < col_max)
+        {
+            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
+
+            if (thissad < bestsad)
+            {
+                this_mv.col = c << 3;
+                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
+
+                if (thissad < bestsad)
+                {
+                    bestsad = thissad;
+                    best_mv->row = r;
+                    best_mv->col = c;
+                    bestaddress = check_here;
+                }
+            }
+
+            check_here ++;
+            c ++;
+        }
+
+    }
+
+    this_mv.row = best_mv->row << 3;
+    this_mv.col = best_mv->col << 3;
+
+    if (bestsad < UINT_MAX)
+        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
+        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
+    else
+        return INT_MAX;
+}
+#endif
+
+#ifdef ENTROPY_STATS
+void print_mode_context(void)
+{
+    FILE *f = fopen("modecont.c", "w");
+    int i, j;
+
+    fprintf(f, "#include \"entropy.h\"\n");
+    fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
+    fprintf(f, "{\n");
+
+    for (j = 0; j < 6; j++)
+    {
+        fprintf(f, "  { // %d \n", j);
+        fprintf(f, "    ");
+
+        for (i = 0; i < 4; i++)
+        {
+            int overall_prob;
+            int this_prob;
+            int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
+
+            // Overall probs
+            count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
+
+            if (count)
+                overall_prob = 256 * mv_mode_cts[i][0] / count;
+            else
+                overall_prob = 128;
+
+            if (overall_prob == 0)
+                overall_prob = 1;
+
+            // context probs
+            count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
+
+            if (count)
+                this_prob = 256 * mv_ref_ct[j][i][0] / count;
+            else
+                this_prob = 128;
+
+            if (this_prob == 0)
+                this_prob = 1;
+
+            fprintf(f, "%5d, ", this_prob);
+            //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
+            //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
+        }
+
+        fprintf(f, "  },\n");
+    }
+
+    fprintf(f, "};\n");
+    fclose(f);
+}
+
+/* MV ref count ENTROPY_STATS stats code */
+#ifdef ENTROPY_STATS
+void init_mv_ref_counts()
+{
+    vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
+    vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
+}
+
+void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
+{
+    if (m == ZEROMV)
+    {
+        ++mv_ref_ct [ct[0]] [0] [0];
+        ++mv_mode_cts[0][0];
+    }
+    else
+    {
+        ++mv_ref_ct [ct[0]] [0] [1];
+        ++mv_mode_cts[0][1];
+
+        if (m == NEARESTMV)
+        {
+            ++mv_ref_ct [ct[1]] [1] [0];
+            ++mv_mode_cts[1][0];
+        }
+        else
+        {
+            ++mv_ref_ct [ct[1]] [1] [1];
+            ++mv_mode_cts[1][1];
+
+            if (m == NEARMV)
+            {
+                ++mv_ref_ct [ct[2]] [2] [0];
+                ++mv_mode_cts[2][0];
+            }
+            else
+            {
+                ++mv_ref_ct [ct[2]] [2] [1];
+                ++mv_mode_cts[2][1];
+
+                if (m == NEWMV)
+                {
+                    ++mv_ref_ct [ct[3]] [3] [0];
+                    ++mv_mode_cts[3][0];
+                }
+                else
+                {
+                    ++mv_ref_ct [ct[3]] [3] [1];
+                    ++mv_mode_cts[3][1];
+                }
+            }
+        }
+    }
+}
+
+#endif /* END MV ref count ENTROPY_STATS stats code */
+
+#endif
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
new file mode 100644
index 0000000..921206f
--- /dev/null
+++ b/vp8/encoder/mcomp.h
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MCOMP_H
+#define __INC_MCOMP_H
+
+#include "block.h"
+#include "variance.h"
+
+#ifdef ENTROPY_STATS
+extern void init_mv_ref_counts();
+extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
+#endif
+
+
+#define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8)    // Max full pel mv specified in 1/8 pel units
+#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
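+// With MAX_MVSEARCH_STEPS = 8 these evaluate to MAX_FULL_PEL_VAL =
+// (1 << 11) - 8 = 2040 (255 full pels expressed in 1/8-pel units) and
+// MAX_FIRST_STEP = 1 << 7 = 128 full pels.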
+
+
+extern void print_mode_context(void);
+extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
+extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride);
+extern void vp8_init3smotion_compensation(MACROBLOCK *x,  int stride);
+
+
+extern int vp8_hex_search
+(
+    MACROBLOCK *x,
+    BLOCK *b,
+    BLOCKD *d,
+    MV *ref_mv,
+    MV *best_mv,
+    int search_param,
+    int error_per_bit,
+    int *num00,
+    vp8_variance_fn_t vf,
+    vp8_sad_fn_t sf,
+    int *mvsadcost[2],
+    int *mvcost[2]
+
+);
+
+typedef int (fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]);
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
+extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
+extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
+extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
+
+#define prototype_full_search_sad(sym)\
+    int (sym)\
+    (\
+     MACROBLOCK *x, \
+     BLOCK *b, \
+     BLOCKD *d, \
+     MV *ref_mv, \
+     int error_per_bit, \
+     int distance, \
+     vp8_variance_fn_ptr_t *fn_ptr, \
+     int *mvcost[2], \
+     int *mvsadcost[2] \
+    )
+
+#define prototype_diamond_search_sad(sym)\
+    int (sym)\
+    (\
+     MACROBLOCK *x, \
+     BLOCK *b, \
+     BLOCKD *d, \
+     MV *ref_mv, \
+     MV *best_mv, \
+     int search_param, \
+     int error_per_bit, \
+     int *num00, \
+     vp8_variance_fn_ptr_t *fn_ptr, \
+     int *mvsadcost[2], \
+     int *mvcost[2] \
+    )
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/mcomp_x86.h"
+#endif
+
+typedef prototype_full_search_sad(*vp8_full_search_fn_t);
+extern prototype_full_search_sad(vp8_full_search_sad);
+extern prototype_full_search_sad(vp8_full_search_sadx3);
+
+typedef prototype_diamond_search_sad(*vp8_diamond_search_fn_t);
+extern prototype_diamond_search_sad(vp8_diamond_search_sad);
+extern prototype_diamond_search_sad(vp8_diamond_search_sadx4);
+
+#ifndef vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sad
+#endif
+extern prototype_full_search_sad(vp8_search_full_search);
+
+#ifndef vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sad
+#endif
+extern prototype_diamond_search_sad(vp8_search_diamond_search);
+
+typedef struct
+{
+    prototype_full_search_sad(*full_search);
+    prototype_diamond_search_sad(*diamond_search);
+} vp8_search_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define SEARCH_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define SEARCH_INVOKE(ctx,fn) vp8_search_##fn
+#endif
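+// Illustrative call site (sketch only): with runtime CPU detection the vtable
+// is consulted, otherwise the macro collapses to the default implementation
+// at compile time:
+//   SEARCH_INVOKE(&cpi->rtcd.search, full_search)(x, b, d, ref_mv,
+//       error_per_bit, distance, fn_ptr, mvcost, mvsadcost);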
+
+#endif
diff --git a/vp8/encoder/modecosts.c b/vp8/encoder/modecosts.c
new file mode 100644
index 0000000..73170cf
--- /dev/null
+++ b/vp8/encoder/modecosts.c
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "blockd.h"
+#include "onyx_int.h"
+#include "treewriter.h"
+#include "entropymode.h"
+
+
+void vp8_init_mode_costs(VP8_COMP *c)
+{
+    VP8_COMMON *x = &c->common;
+    {
+        const vp8_tree_p T = vp8_bmode_tree;
+
+        int i = 0;
+
+        do
+        {
+            int j = 0;
+
+            do
+            {
+                vp8_cost_tokens((int *)c->mb.bmode_costs[i][j], x->kf_bmode_prob[i][j], T);
+            }
+            while (++j < VP8_BINTRAMODES);
+        }
+        while (++i < VP8_BINTRAMODES);
+
+        vp8_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.bmode_prob, T);
+    }
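+    // Note: the call below reuses the inter_bmode_costs array for sub_mv_ref
+    // costs, overwriting the leading entries filled in just above.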
+    vp8_cost_tokens((int *)c->mb.inter_bmode_costs, x->fc.sub_mv_ref_prob, vp8_sub_mv_ref_tree);
+
+    vp8_cost_tokens(c->mb.mbmode_cost[1], x->fc.ymode_prob, vp8_ymode_tree);
+    vp8_cost_tokens(c->mb.mbmode_cost[0], x->kf_ymode_prob, vp8_kf_ymode_tree);
+
+    vp8_cost_tokens(c->mb.intra_uv_mode_cost[1], x->fc.uv_mode_prob, vp8_uv_mode_tree);
+    vp8_cost_tokens(c->mb.intra_uv_mode_cost[0], x->kf_uv_mode_prob, vp8_uv_mode_tree);
+}
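+
+// Descriptive note on the convention assumed above: vp8_cost_tokens fills each
+// entry with the cost of coding that token in 1/256-bit units, i.e. for a leaf
+// roughly the sum over its tree path of -256 * log2(P(branch)), so these
+// tables can be added directly into rate estimates during mode decision.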
diff --git a/vp8/encoder/modecosts.h b/vp8/encoder/modecosts.h
new file mode 100644
index 0000000..5ade265
--- /dev/null
+++ b/vp8/encoder/modecosts.h
@@ -0,0 +1,16 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_MODECOSTS_H
+#define __INC_MODECOSTS_H
+
+void vp8_init_mode_costs(VP8_COMP *x);
+
+#endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
new file mode 100644
index 0000000..7662720
--- /dev/null
+++ b/vp8/encoder/onyx_if.c
@@ -0,0 +1,5428 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "systemdependent.h"
+#include "quantize.h"
+#include "alloccommon.h"
+#include "mcomp.h"
+#include "firstpass.h"
+#include "psnr.h"
+#include "vpx_scale/vpxscale.h"
+#include "extend.h"
+#include "ratectrl.h"
+#include "quant_common.h"
+#include "segmentation_common.h"
+#include "g_common.h"
+#include "vpx_scale/yv12extend.h"
+#include "postproc.h"
+#include "vpx_mem/vpx_mem.h"
+#include "swapyv12buffer.h"
+#include "threading.h"
+#include "vpx_ports/vpx_timer.h"
+#include <math.h>
+#include <stdio.h>
+#include <limits.h>
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#define RTCD(x) &cpi->common.rtcd.x
+#else
+#define IF_RTCD(x) NULL
+#define RTCD(x) NULL
+#endif
+
+extern void vp8cx_init_mv_bits_sadcost();
+extern void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+extern void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val);
+extern void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi);
+
+extern void vp8_init_loop_filter(VP8_COMMON *cm);
+extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
+extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val, int sharpness_lvl);
+extern void vp8_dmachine_specific_config(VP8_COMP *cpi);
+extern void vp8_cmachine_specific_config(VP8_COMP *cpi);
+extern void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi);
+extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag);
+extern void print_parms(VP8_CONFIG *ocf, char *filenam);
+extern unsigned int vp8_get_processor_freq();
+extern void print_tree_update_probs();
+extern void vp8cx_create_encoder_threads(VP8_COMP *cpi);
+extern void vp8cx_remove_encoder_threads(VP8_COMP *cpi);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+int vp8_estimate_entropy_savings(VP8_COMP *cpi);
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
+int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
+
+
+static void mode_ref_lf_test_function(VP8_COMP *cpi);
+
+extern const int vp8_gf_interval_table[101];
+
+#if CONFIG_PSNR
+#include "math.h"
+
+extern double vp8_calc_ssim
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    int lumamask,
+    double *weight
+);
+
+extern double vp8_calc_ssimg
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    double *ssim_y,
+    double *ssim_u,
+    double *ssim_v
+);
+
+
+#endif
+
+
+#ifdef OUTPUT_YUV_SRC
+FILE *yuv_file;
+#endif
+
+#if 0
+FILE *framepsnr;
+FILE *kf_list;
+FILE *keyfile;
+#endif
+
+#if 0
+extern int skip_true_count;
+extern int skip_false_count;
+#endif
+
+
+#ifdef ENTROPY_STATS
+extern int intra_mode_stats[10][10][10];
+#endif
+
+#ifdef SPEEDSTATS
+unsigned int frames_at_speed[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+unsigned int tot_pm = 0;
+unsigned int cnt_pm = 0;
+unsigned int tot_ef = 0;
+unsigned int cnt_ef = 0;
+#endif
+
+#ifdef MODE_STATS
+extern unsigned __int64 Sectionbits[50];
+extern int y_modes[5]  ;
+extern int uv_modes[4] ;
+extern int b_modes[10]  ;
+
+extern int inter_y_modes[10] ;
+extern int inter_uv_modes[4] ;
+extern unsigned int inter_b_modes[15];
+#endif
+
+extern void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+extern void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+extern void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+extern void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+
+extern const int vp8_bits_per_mb[2][QINDEX_RANGE];
+
+extern const int qrounding_factors[129];
+extern const int qzbin_factors[129];
+extern void vp8cx_init_quantizer(VP8_COMP *cpi);
+extern const int vp8cx_base_skip_false_prob[128];
+
+
+void vp8_initialize()
+{
+    static int init_done = 0;
+
+    if (!init_done)
+    {
+        vp8_scale_machine_specific_config();
+        vp8_initialize_common();
+        //vp8_dmachine_specific_config();
+        vp8_tokenize_initialize();
+
+        vp8cx_init_mv_bits_sadcost();
+        init_done = 1;
+    }
+}
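+
+// vp8_initialize() is guarded by init_done, so repeated calls are harmless;
+// it is expected to run once per process before any compressor is created.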
+#ifdef PACKET_TESTING
+extern FILE *vpxlogc;
+#endif
+
+static void setup_features(VP8_COMP *cpi)
+{
+    // Set up default state for MB feature flags
+    cpi->mb.e_mbd.segmentation_enabled = 0;
+    cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+    vpx_memset(cpi->mb.e_mbd.mb_segment_tree_probs, 255, sizeof(cpi->mb.e_mbd.mb_segment_tree_probs));
+    vpx_memset(cpi->mb.e_mbd.segment_feature_data, 0, sizeof(cpi->mb.e_mbd.segment_feature_data));
+
+    cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 0;
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+    vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+    vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+    // jbb trial !
+    mode_ref_lf_test_function(cpi);
+
+}
+
+
+void vp8_dealloc_compressor_data(VP8_COMP *cpi)
+{
+
+    // Delete segmentation map
+    if (cpi->segmentation_map != 0)
+        vpx_free(cpi->segmentation_map);
+
+    cpi->segmentation_map = 0;
+
+    if (cpi->active_map != 0)
+        vpx_free(cpi->active_map);
+
+    cpi->active_map = 0;
+
+    // Delete first pass motion map
+    if (cpi->fp_motion_map != 0)
+        vpx_free(cpi->fp_motion_map);
+
+    cpi->fp_motion_map = 0;
+
+    vp8_de_alloc_frame_buffers(&cpi->common);
+
+    vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf);
+    vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source);
+#if VP8_TEMPORAL_ALT_REF
+    vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer);
+#endif
+    {
+        int i;
+
+        for (i = 0; i < MAX_LAG_BUFFERS; i++)
+            vp8_yv12_de_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer);
+
+        cpi->source_buffer_count = 0;
+    }
+
+    vpx_free(cpi->tok);
+    cpi->tok = 0;
+
+}
+
+static void enable_segmentation(VP8_PTR ptr)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    // Set the appropriate feature bit
+    cpi->mb.e_mbd.segmentation_enabled = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+static void disable_segmentation(VP8_PTR ptr)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    // Clear the appropriate feature bit
+    cpi->mb.e_mbd.segmentation_enabled = 0;
+}
+
+// Valid values for a segment are 0 to 3
+// Segmentation map is arranged as [Rows][Columns]
+static void set_segmentation_map(VP8_PTR ptr, unsigned char *segmentation_map)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    // Copy in the new segmentation map
+    vpx_memcpy(cpi->segmentation_map, segmentation_map, (cpi->common.mb_rows * cpi->common.mb_cols));
+
+    // Signal that the map should be updated.
+    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+}
+
+// The values given for each segment can be either deltas (from the default value chosen for the frame) or absolute values.
+//
+// Valid range for absolute values is 0-127 for MB_LVL_ALT_Q and 0-63 for SEGMENT_ALT_LF.
+// Valid range for delta values is +/-127 for MB_LVL_ALT_Q and +/-63 for SEGMENT_ALT_LF.
+//
+// abs_delta = SEGMENT_DELTADATA means the values are deltas; abs_delta = SEGMENT_ABSDATA means the absolute values given are used.
+//
+static void set_segment_data(VP8_PTR ptr, signed char *feature_data, unsigned char abs_delta)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    cpi->mb.e_mbd.mb_segement_abs_delta = abs_delta;
+    vpx_memcpy(cpi->segment_feature_data, feature_data, sizeof(cpi->segment_feature_data));
+}
+
+
+static void segmentation_test_function(VP8_PTR ptr)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    unsigned char *seg_map;
+    signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int i, j;
+
+    // Create a temporary map for segmentation data.
+    CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
+    // MB loop to set local segmentation map
+    /*for ( i = 0; i < cpi->common.mb_rows; i++ )
+    {
+        for ( j = 0; j < cpi->common.mb_cols; j++ )
+        {
+            //seg_map[(i*cpi->common.mb_cols) + j] = (j % 2) + ((i%2)* 2);
+            //if ( j < cpi->common.mb_cols/2 )
+
+            // Segment 1 around the edge else 0
+            if ( (i == 0) || (j == 0) || (i == (cpi->common.mb_rows-1)) || (j == (cpi->common.mb_cols-1)) )
+                seg_map[(i*cpi->common.mb_cols) + j] = 1;
+            //else if ( (i < 2) || (j < 2) || (i > (cpi->common.mb_rows-3)) || (j > (cpi->common.mb_cols-3)) )
+            //  seg_map[(i*cpi->common.mb_cols) + j] = 2;
+            //else if ( (i < 5) || (j < 5) || (i > (cpi->common.mb_rows-6)) || (j > (cpi->common.mb_cols-6)) )
+            //  seg_map[(i*cpi->common.mb_cols) + j] = 3;
+            else
+                seg_map[(i*cpi->common.mb_cols) + j] = 0;
+        }
+    }*/
+
+    // Set the segmentation Map
+    set_segmentation_map(ptr, seg_map);
+
+    // Activate segmentation.
+    enable_segmentation(ptr);
+
+    // Set up the quant segment data
+    feature_data[MB_LVL_ALT_Q][0] = 0;
+    feature_data[MB_LVL_ALT_Q][1] = 4;
+    feature_data[MB_LVL_ALT_Q][2] = 0;
+    feature_data[MB_LVL_ALT_Q][3] = 0;
+    // Set up the loop segment data
+    feature_data[MB_LVL_ALT_LF][0] = 0;
+    feature_data[MB_LVL_ALT_LF][1] = 0;
+    feature_data[MB_LVL_ALT_LF][2] = 0;
+    feature_data[MB_LVL_ALT_LF][3] = 0;
+
+    // Initialise the feature data structure
+    // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
+    set_segment_data(ptr, &feature_data[0][0], SEGMENT_DELTADATA);
+
+    // Delete segmentation map
+    if (seg_map != 0)
+        vpx_free(seg_map);
+
+    seg_map = 0;
+
+}
+
+// A simple function to cyclically refresh the background at a lower Q
+static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
+{
+    unsigned char *seg_map;
+    signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+    int i;
+    int block_count = cpi->cyclic_refresh_mode_max_mbs_perframe;
+    int mbs_in_frame = cpi->common.mb_rows * cpi->common.mb_cols;
+
+    // Create a temporary map for segmentation data.
+    CHECK_MEM_ERROR(seg_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
+    cpi->cyclic_refresh_q = Q;
+
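+    // Search downward from the frame Q for the first (highest) index whose
+    // per-MB bit cost reaches roughly (Q + 128)/64 times the baseline cost at
+    // Q; this becomes the boosted Q used for refreshed (segment 1) blocks.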
+    for (i = Q; i > 0; i--)
+    {
+        if (vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*(Q + 128)) / 64))
+            //if ( vp8_bits_per_mb[cpi->common.frame_type][i] >= ((vp8_bits_per_mb[cpi->common.frame_type][Q]*((2*Q)+96))/64) )
+        {
+            break;
+        }
+    }
+
+    cpi->cyclic_refresh_q = i;
+
+    // Only update for inter frames
+    if (cpi->common.frame_type != KEY_FRAME)
+    {
+        // Cycle through the macro_block rows
+        // MB loop to set local segmentation map
+        for (i = cpi->cyclic_refresh_mode_index; i < mbs_in_frame; i++)
+        {
+            // If the MB is a candidate for clean-up then mark it for possible boost/refresh (segment 1).
+            // The segment id may get reset to 0 later if the MB gets coded as anything other than (last frame, 0,0),
+            // as only (last frame, 0,0) MBs are eligible for refresh: that is to say, MBs likely to be background blocks.
+            if (cpi->cyclic_refresh_map[i] == 0)
+            {
+                seg_map[i] = 1;
+            }
+            else
+            {
+                seg_map[i] = 0;
+
+                // Skip blocks that have been refreshed recently anyway.
+                if (cpi->cyclic_refresh_map[i] < 0)
+                    //cpi->cyclic_refresh_map[i] = cpi->cyclic_refresh_map[i] / 16;
+                    cpi->cyclic_refresh_map[i]++;
+            }
+
+
+            if (block_count > 0)
+                block_count--;
+            else
+                break;
+
+        }
+
+        // If we have gone through the whole frame, reset to the start
+        cpi->cyclic_refresh_mode_index = i;
+
+        if (cpi->cyclic_refresh_mode_index >= mbs_in_frame)
+            cpi->cyclic_refresh_mode_index = 0;
+    }
+
+    // Set the segmentation Map
+    set_segmentation_map((VP8_PTR)cpi, seg_map);
+
+    // Activate segmentation.
+    enable_segmentation((VP8_PTR)cpi);
+
+    // Set up the quant segment data
+    feature_data[MB_LVL_ALT_Q][0] = 0;
+    feature_data[MB_LVL_ALT_Q][1] = (cpi->cyclic_refresh_q - Q);
+    feature_data[MB_LVL_ALT_Q][2] = 0;
+    feature_data[MB_LVL_ALT_Q][3] = 0;
+
+    // Set up the loop segment data
+    feature_data[MB_LVL_ALT_LF][0] = 0;
+    feature_data[MB_LVL_ALT_LF][1] = lf_adjustment;
+    feature_data[MB_LVL_ALT_LF][2] = 0;
+    feature_data[MB_LVL_ALT_LF][3] = 0;
+
+    // Initialise the feature data structure
+    // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
+    set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+    // Delete segmentation map
+    if (seg_map != 0)
+        vpx_free(seg_map);
+
+    seg_map = 0;
+
+}
+
+static void mode_ref_lf_test_function(VP8_COMP *cpi)
+{
+    cpi->mb.e_mbd.mode_ref_lf_delta_enabled = 1;
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+
+    vpx_memset(cpi->mb.e_mbd.ref_lf_deltas, 0, sizeof(cpi->mb.e_mbd.ref_lf_deltas));
+    vpx_memset(cpi->mb.e_mbd.mode_lf_deltas, 0, sizeof(cpi->mb.e_mbd.mode_lf_deltas));
+
+    // Test of ref frame deltas
+    cpi->mb.e_mbd.ref_lf_deltas[INTRA_FRAME] = 2;
+    cpi->mb.e_mbd.ref_lf_deltas[LAST_FRAME] = 0;
+    cpi->mb.e_mbd.ref_lf_deltas[GOLDEN_FRAME] = -2;
+    cpi->mb.e_mbd.ref_lf_deltas[ALTREF_FRAME] = -2;
+
+    cpi->mb.e_mbd.mode_lf_deltas[0] = 4;               // BPRED
+    cpi->mb.e_mbd.mode_lf_deltas[1] = -2;              // Zero
+    cpi->mb.e_mbd.mode_lf_deltas[2] = 2;               // New mv
+    cpi->mb.e_mbd.mode_lf_deltas[3] = 4;               // Split mv
+}
+
+void vp8_set_speed_features(VP8_COMP *cpi)
+{
+    SPEED_FEATURES *sf = &cpi->sf;
+    int Mode = cpi->compressor_speed;
+    int Speed = cpi->Speed;
+    int i;
+    VP8_COMMON *cm = &cpi->common;
+
+    // Initialise default mode frequency sampling variables
+    for (i = 0; i < MAX_MODES; i++)
+    {
+        cpi->mode_check_freq[i] = 0;
+        cpi->mode_test_hit_counts[i] = 0;
+        cpi->mode_chosen_counts[i] = 0;
+    }
+
+    cpi->mbs_tested_so_far = 0;
+
+    // best quality
+    sf->RD = 1;
+    sf->search_method = NSTEP;
+    sf->improved_quant = 1;
+    sf->improved_dct = 1;
+    sf->auto_filter = 1;
+    sf->recode_loop = 1;
+    sf->quarter_pixel_search = 1;
+    sf->half_pixel_search = 1;
+    sf->full_freq[0] = 7;
+    sf->full_freq[1] = 7;
+    sf->min_fs_radius = 8;
+    sf->max_fs_radius = 32;
+    sf->iterative_sub_pixel = 1;
+    sf->optimize_coefficients = 1;
+
+    sf->first_step = 0;
+    sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+
+    cpi->do_full[0] = 0;
+    cpi->do_full[1] = 0;
+
+    // default thresholds to 0
+    for (i = 0; i < MAX_MODES; i++)
+        sf->thresh_mult[i] = 0;
+
+    switch (Mode)
+    {
+#if !(CONFIG_REALTIME_ONLY)
+    case 0: // best quality mode
+        sf->thresh_mult[THR_ZEROMV   ] = 0;
+        sf->thresh_mult[THR_ZEROG    ] = 0;
+        sf->thresh_mult[THR_ZEROA    ] = 0;
+        sf->thresh_mult[THR_NEARESTMV] = 0;
+        sf->thresh_mult[THR_NEARESTG ] = 0;
+        sf->thresh_mult[THR_NEARESTA ] = 0;
+        sf->thresh_mult[THR_NEARMV   ] = 0;
+        sf->thresh_mult[THR_NEARG    ] = 0;
+        sf->thresh_mult[THR_NEARA    ] = 0;
+
+        sf->thresh_mult[THR_DC       ] = 0;
+
+        sf->thresh_mult[THR_V_PRED   ] = 1000;
+        sf->thresh_mult[THR_H_PRED   ] = 1000;
+        sf->thresh_mult[THR_B_PRED   ] = 2000;
+        sf->thresh_mult[THR_TM       ] = 1000;
+
+        sf->thresh_mult[THR_NEWMV    ] = 1000;
+        sf->thresh_mult[THR_NEWG     ] = 1000;
+        sf->thresh_mult[THR_NEWA     ] = 1000;
+
+        sf->thresh_mult[THR_SPLITMV  ] = 2500;
+        sf->thresh_mult[THR_SPLITG   ] = 5000;
+        sf->thresh_mult[THR_SPLITA   ] = 5000;
+
+        sf->full_freq[0] = 7;
+        sf->full_freq[1] = 15;
+
+        sf->first_step = 0;
+        sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+
+        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+        {
+            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+        }
+
+        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+        {
+            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+        }
+        else if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+        {
+            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+        }
+
+        break;
+    case 1:
+    case 3:
+        sf->optimize_coefficients = 0;
+        sf->thresh_mult[THR_NEARESTMV] = 0;
+        sf->thresh_mult[THR_ZEROMV   ] = 0;
+        sf->thresh_mult[THR_DC       ] = 0;
+        sf->thresh_mult[THR_NEARMV   ] = 0;
+        sf->thresh_mult[THR_V_PRED   ] = 1000;
+        sf->thresh_mult[THR_H_PRED   ] = 1000;
+        sf->thresh_mult[THR_B_PRED   ] = 2500;
+        sf->thresh_mult[THR_TM       ] = 1000;
+
+        sf->thresh_mult[THR_NEARESTG ] = 1000;
+        sf->thresh_mult[THR_NEARESTA ] = 1000;
+
+        sf->thresh_mult[THR_ZEROG    ] = 1000;
+        sf->thresh_mult[THR_ZEROA    ] = 1000;
+        sf->thresh_mult[THR_NEARG    ] = 1000;
+        sf->thresh_mult[THR_NEARA    ] = 1000;
+
+        sf->thresh_mult[THR_NEWMV    ] = 1500;
+        sf->thresh_mult[THR_NEWG     ] = 1500;
+        sf->thresh_mult[THR_NEWA     ] = 1500;
+
+        sf->thresh_mult[THR_SPLITMV  ] = 5000;
+        sf->thresh_mult[THR_SPLITG   ] = 10000;
+        sf->thresh_mult[THR_SPLITA   ] = 10000;
+
+        sf->full_freq[0] = 15;
+        sf->full_freq[1] = 31;
+
+        sf->first_step = 0;
+        sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+
+        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+        {
+            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+        }
+        else if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+        {
+            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+        }
+        else if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+        {
+            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+        }
+
+        if (Speed > 0)
+        {
+            cpi->mode_check_freq[THR_SPLITG] = 4;
+            cpi->mode_check_freq[THR_SPLITA] = 4;
+            cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+            sf->thresh_mult[THR_TM       ] = 1500;
+            sf->thresh_mult[THR_V_PRED   ] = 1500;
+            sf->thresh_mult[THR_H_PRED   ] = 1500;
+            sf->thresh_mult[THR_B_PRED   ] = 5000;
+
+            if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+            {
+                sf->thresh_mult[THR_NEWMV    ] = 2000;
+                sf->thresh_mult[THR_SPLITMV  ] = 10000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTG ] = 1500;
+                sf->thresh_mult[THR_ZEROG    ] = 1500;
+                sf->thresh_mult[THR_NEARG    ] = 1500;
+                sf->thresh_mult[THR_NEWG     ] = 2000;
+                sf->thresh_mult[THR_SPLITG   ] = 20000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTA ] = 1500;
+                sf->thresh_mult[THR_ZEROA    ] = 1500;
+                sf->thresh_mult[THR_NEARA    ] = 1500;
+                sf->thresh_mult[THR_NEWA     ] = 2000;
+                sf->thresh_mult[THR_SPLITA   ] = 20000;
+            }
+
+            sf->improved_quant = 0;
+            sf->improved_dct = 0;
+
+            sf->first_step = 1;
+            sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+        }
+
+        if (Speed > 1)
+        {
+            cpi->mode_check_freq[THR_SPLITG] = 15;
+            cpi->mode_check_freq[THR_SPLITA] = 15;
+            cpi->mode_check_freq[THR_SPLITMV] = 7;
+
+            sf->thresh_mult[THR_TM       ] = 2000;
+            sf->thresh_mult[THR_V_PRED   ] = 2000;
+            sf->thresh_mult[THR_H_PRED   ] = 2000;
+            sf->thresh_mult[THR_B_PRED   ] = 7500;
+
+            if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+            {
+                sf->thresh_mult[THR_NEWMV    ] = 2000;
+                sf->thresh_mult[THR_SPLITMV  ] = 25000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTG ] = 2000;
+                sf->thresh_mult[THR_ZEROG    ] = 2000;
+                sf->thresh_mult[THR_NEARG    ] = 2000;
+                sf->thresh_mult[THR_NEWG     ] = 2500;
+                sf->thresh_mult[THR_SPLITG   ] = 50000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTA ] = 2000;
+                sf->thresh_mult[THR_ZEROA    ] = 2000;
+                sf->thresh_mult[THR_NEARA    ] = 2000;
+                sf->thresh_mult[THR_NEWA     ] = 2500;
+                sf->thresh_mult[THR_SPLITA   ] = 50000;
+            }
+
+            // Only do recode loop on key frames and golden frames
+            sf->recode_loop = 2;
+
+            sf->full_freq[0] = 31;
+            sf->full_freq[1] = 63;
+
+        }
+
+        if (Speed > 2)
+        {
+            sf->auto_filter = 0;                     // Faster selection of loop filter
+            cpi->mode_check_freq[THR_V_PRED] = 2;
+            cpi->mode_check_freq[THR_H_PRED] = 2;
+            cpi->mode_check_freq[THR_B_PRED] = 2;
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARG] = 2;
+                cpi->mode_check_freq[THR_NEWG] = 4;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARA] = 2;
+                cpi->mode_check_freq[THR_NEWA] = 4;
+            }
+
+            sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+
+            sf->full_freq[0] = 63;
+            sf->full_freq[1] = 127;
+        }
+
+        if (Speed > 3)
+        {
+            cpi->mode_check_freq[THR_V_PRED] = 0;
+            cpi->mode_check_freq[THR_H_PRED] = 0;
+            cpi->mode_check_freq[THR_B_PRED] = 0;
+            cpi->mode_check_freq[THR_NEARG] = 0;
+            cpi->mode_check_freq[THR_NEWG] = 0;
+            cpi->mode_check_freq[THR_NEARA] = 0;
+            cpi->mode_check_freq[THR_NEWA] = 0;
+
+            sf->auto_filter = 1;
+            sf->recode_loop = 0; // recode loop off
+            sf->RD = 0;         // Turn rd off
+            sf->full_freq[0] = INT_MAX;
+            sf->full_freq[1] = INT_MAX;
+        }
+
+        if (Speed > 4)
+        {
+            sf->auto_filter = 0;                     // Faster selection of loop filter
+
+            cpi->mode_check_freq[THR_V_PRED] = 2;
+            cpi->mode_check_freq[THR_H_PRED] = 2;
+            cpi->mode_check_freq[THR_B_PRED] = 2;
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARG] = 2;
+                cpi->mode_check_freq[THR_NEWG] = 4;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARA] = 2;
+                cpi->mode_check_freq[THR_NEWA] = 4;
+            }
+
+            if ((cpi->ref_frame_flags & VP8_LAST_FLAG) && (cpi->ref_frame_flags & VP8_GOLD_FLAG))
+            {
+                sf->thresh_mult[THR_NEARESTG ] = 2000;
+                sf->thresh_mult[THR_ZEROG    ] = 2000;
+                sf->thresh_mult[THR_NEARG    ] = 2000;
+                sf->thresh_mult[THR_NEWG     ] = 4000;
+            }
+
+            if ((cpi->ref_frame_flags & VP8_LAST_FLAG) && (cpi->ref_frame_flags & VP8_ALT_FLAG))
+            {
+                sf->thresh_mult[THR_NEARESTA ] = 2000;
+                sf->thresh_mult[THR_ZEROA    ] = 2000;
+                sf->thresh_mult[THR_NEARA    ] = 2000;
+                sf->thresh_mult[THR_NEWA     ] = 4000;
+            }
+        }
+
+        break;
+#endif
+    case 2:
+        sf->optimize_coefficients = 0;
+        sf->recode_loop = 0;
+        sf->auto_filter = 1;
+        sf->iterative_sub_pixel = 1;
+        sf->thresh_mult[THR_NEARESTMV] = 0;
+        sf->thresh_mult[THR_ZEROMV   ] = 0;
+        sf->thresh_mult[THR_DC       ] = 0;
+        sf->thresh_mult[THR_TM       ] = 0;
+        sf->thresh_mult[THR_NEARMV   ] = 0;
+        sf->thresh_mult[THR_V_PRED   ] = 1000;
+        sf->thresh_mult[THR_H_PRED   ] = 1000;
+        sf->thresh_mult[THR_B_PRED   ] = 2500;
+        sf->thresh_mult[THR_NEARESTG ] = 1000;
+        sf->thresh_mult[THR_ZEROG    ] = 1000;
+        sf->thresh_mult[THR_NEARG    ] = 1000;
+        sf->thresh_mult[THR_NEARESTA ] = 1000;
+        sf->thresh_mult[THR_ZEROA    ] = 1000;
+        sf->thresh_mult[THR_NEARA    ] = 1000;
+        sf->thresh_mult[THR_NEWMV    ] = 2000;
+        sf->thresh_mult[THR_NEWG     ] = 2000;
+        sf->thresh_mult[THR_NEWA     ] = 2000;
+        sf->thresh_mult[THR_SPLITMV  ] = 5000;
+        sf->thresh_mult[THR_SPLITG   ] = 10000;
+        sf->thresh_mult[THR_SPLITA   ] = 10000;
+        sf->full_freq[0] = 15;
+        sf->full_freq[1] = 31;
+        sf->search_method = NSTEP;
+
+        if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+        {
+            sf->thresh_mult[THR_NEWMV    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARESTMV] = INT_MAX;
+            sf->thresh_mult[THR_ZEROMV   ] = INT_MAX;
+            sf->thresh_mult[THR_NEARMV   ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+        }
+
+        if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+        {
+            sf->thresh_mult[THR_NEARESTG ] = INT_MAX;
+            sf->thresh_mult[THR_ZEROG    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARG    ] = INT_MAX;
+            sf->thresh_mult[THR_NEWG     ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG   ] = INT_MAX;
+        }
+
+        if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+        {
+            sf->thresh_mult[THR_NEARESTA ] = INT_MAX;
+            sf->thresh_mult[THR_ZEROA    ] = INT_MAX;
+            sf->thresh_mult[THR_NEARA    ] = INT_MAX;
+            sf->thresh_mult[THR_NEWA     ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITA   ] = INT_MAX;
+        }
+
+        if (Speed > 0)
+        {
+            cpi->mode_check_freq[THR_SPLITG] = 4;
+            cpi->mode_check_freq[THR_SPLITA] = 4;
+            cpi->mode_check_freq[THR_SPLITMV] = 2;
+
+            sf->thresh_mult[THR_DC       ] = 0;
+            sf->thresh_mult[THR_TM       ] = 1000;
+            sf->thresh_mult[THR_V_PRED   ] = 2000;
+            sf->thresh_mult[THR_H_PRED   ] = 2000;
+            sf->thresh_mult[THR_B_PRED   ] = 5000;
+
+            if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTMV] = 0;
+                sf->thresh_mult[THR_ZEROMV   ] = 0;
+                sf->thresh_mult[THR_NEARMV   ] = 0;
+                sf->thresh_mult[THR_NEWMV    ] = 2000;
+                sf->thresh_mult[THR_SPLITMV  ] = 10000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTG ] = 1000;
+                sf->thresh_mult[THR_ZEROG    ] = 1000;
+                sf->thresh_mult[THR_NEARG    ] = 1000;
+                sf->thresh_mult[THR_NEWG     ] = 2000;
+                sf->thresh_mult[THR_SPLITG   ] = 20000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTA ] = 1000;
+                sf->thresh_mult[THR_ZEROA    ] = 1000;
+                sf->thresh_mult[THR_NEARA    ] = 1000;
+                sf->thresh_mult[THR_NEWA     ] = 2000;
+                sf->thresh_mult[THR_SPLITA   ] = 20000;
+            }
+
+            sf->improved_quant = 0;
+            sf->improved_dct = 0;
+        }
+
+        if (Speed > 1)
+        {
+            cpi->mode_check_freq[THR_SPLITMV] = 7;
+            cpi->mode_check_freq[THR_SPLITG] = 15;
+            cpi->mode_check_freq[THR_SPLITA] = 15;
+
+            sf->thresh_mult[THR_TM       ] = 2000;
+            sf->thresh_mult[THR_V_PRED   ] = 2000;
+            sf->thresh_mult[THR_H_PRED   ] = 2000;
+            sf->thresh_mult[THR_B_PRED   ] = 5000;
+
+            if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+            {
+                sf->thresh_mult[THR_NEWMV    ] = 2000;
+                sf->thresh_mult[THR_SPLITMV  ] = 25000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTG ] = 2000;
+                sf->thresh_mult[THR_ZEROG    ] = 2000;
+                sf->thresh_mult[THR_NEARG    ] = 2000;
+                sf->thresh_mult[THR_NEWG     ] = 2500;
+                sf->thresh_mult[THR_SPLITG   ] = 50000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTA ] = 2000;
+                sf->thresh_mult[THR_ZEROA    ] = 2000;
+                sf->thresh_mult[THR_NEARA    ] = 2000;
+                sf->thresh_mult[THR_NEWA     ] = 2500;
+                sf->thresh_mult[THR_SPLITA   ] = 50000;
+            }
+
+            sf->full_freq[0] = 31;
+            sf->full_freq[1] = 63;
+        }
+
+        if (Speed > 2)
+        {
+            sf->auto_filter = 0;                     // Faster selection of loop filter
+
+            cpi->mode_check_freq[THR_V_PRED] = 2;
+            cpi->mode_check_freq[THR_H_PRED] = 2;
+            cpi->mode_check_freq[THR_B_PRED] = 2;
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARG] = 2;
+                cpi->mode_check_freq[THR_NEWG] = 4;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARA] = 2;
+                cpi->mode_check_freq[THR_NEWA] = 4;
+            }
+
+            sf->thresh_mult[THR_SPLITMV  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITG  ] = INT_MAX;
+            sf->thresh_mult[THR_SPLITA  ] = INT_MAX;
+
+            sf->full_freq[0] = 63;
+            sf->full_freq[1] = 127;
+        }
+
+        if (Speed > 3)
+        {
+            sf->RD = 0;
+            sf->full_freq[0] = INT_MAX;
+            sf->full_freq[1] = INT_MAX;
+
+            sf->auto_filter = 1;
+        }
+
+        if (Speed > 4)
+        {
+            sf->auto_filter = 0;                     // Faster selection of loop filter
+
+#if CONFIG_REALTIME_ONLY
+            sf->search_method = HEX;
+#else
+            sf->search_method = DIAMOND;
+#endif
+
+            cpi->mode_check_freq[THR_V_PRED] = 4;
+            cpi->mode_check_freq[THR_H_PRED] = 4;
+            cpi->mode_check_freq[THR_B_PRED] = 4;
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARG] = 2;
+                cpi->mode_check_freq[THR_NEWG] = 4;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                cpi->mode_check_freq[THR_NEARA] = 2;
+                cpi->mode_check_freq[THR_NEWA] = 4;
+            }
+
+            sf->thresh_mult[THR_TM       ] = 2000;
+            sf->thresh_mult[THR_B_PRED   ] = 5000;
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTG ] = 2000;
+                sf->thresh_mult[THR_ZEROG    ] = 2000;
+                sf->thresh_mult[THR_NEARG    ] = 2000;
+                sf->thresh_mult[THR_NEWG     ] = 4000;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                sf->thresh_mult[THR_NEARESTA ] = 2000;
+                sf->thresh_mult[THR_ZEROA    ] = 2000;
+                sf->thresh_mult[THR_NEARA    ] = 2000;
+                sf->thresh_mult[THR_NEWA     ] = 4000;
+            }
+        }
+
+        if (Speed > 5)
+        {
+            // Disable split MB intra prediction mode
+            sf->thresh_mult[THR_B_PRED] = INT_MAX;
+        }
+
+        if (Speed > 6)
+        {
+            unsigned int i, sum = 0;
+            unsigned int total_mbs = cm->MBs;
+            int thresh;
+            int total_skip;
+
+            int min = 2000;
+            sf->iterative_sub_pixel = 0;
+
+            if (cpi->oxcf.encode_breakout > 2000)
+                min = cpi->oxcf.encode_breakout;
+
+            min >>= 7;
+
+            for (i = 0; i < min; i++)
+            {
+                sum += cpi->error_bins[i];
+            }
+
+            total_skip = sum;
+            sum = 0;
+
+            // Continue scanning upward from the skip-threshold bin; thresh (i << 7) is clamped to at least 2000 below.
+            for (; i < 1024; i++)
+            {
+                sum += cpi->error_bins[i];
+
+                if (10 * sum >= (unsigned int)(cpi->Speed - 6)*(total_mbs - total_skip))
+                    break;
+            }
+
+            i--;
+            thresh = (i << 7);
+
+            if (thresh < 2000)
+                thresh = 2000;
+
+            if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+            {
+                sf->thresh_mult[THR_NEWMV] = thresh;
+                sf->thresh_mult[THR_NEARESTMV ] = thresh >> 1;
+                sf->thresh_mult[THR_NEARMV    ] = thresh >> 1;
+            }
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                sf->thresh_mult[THR_NEWG] = thresh << 1;
+                sf->thresh_mult[THR_NEARESTG ] = thresh;
+                sf->thresh_mult[THR_NEARG    ] = thresh;
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                sf->thresh_mult[THR_NEWA] = thresh << 1;
+                sf->thresh_mult[THR_NEARESTA ] = thresh;
+                sf->thresh_mult[THR_NEARA    ] = thresh;
+            }
+
+            // Disable other intra prediction modes
+            sf->thresh_mult[THR_TM] = INT_MAX;
+            sf->thresh_mult[THR_V_PRED] = INT_MAX;
+            sf->thresh_mult[THR_H_PRED] = INT_MAX;
+
+        }
+
+        if (Speed > 8)
+        {
+            sf->quarter_pixel_search = 0;
+        }
+
+        if (Speed > 9)
+        {
+            int Tmp = cpi->Speed - 8;
+
+            if (Tmp > 4)
+                Tmp = 4;
+
+            if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+            {
+                cpi->mode_check_freq[THR_ZEROG] = 1 << (Tmp - 1);
+                cpi->mode_check_freq[THR_NEARESTG] = 1 << (Tmp - 1);
+                cpi->mode_check_freq[THR_NEARG] = 1 << Tmp;
+                cpi->mode_check_freq[THR_NEWG] = 1 << (Tmp + 1);
+            }
+
+            if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+            {
+                cpi->mode_check_freq[THR_ZEROA] = 1 << (Tmp - 1);
+                cpi->mode_check_freq[THR_NEARESTA] = 1 << (Tmp - 1);
+                cpi->mode_check_freq[THR_NEARA] = 1 << Tmp;
+                cpi->mode_check_freq[THR_NEWA] = 1 << (Tmp + 1);
+            }
+
+            cpi->mode_check_freq[THR_NEWMV] = 1 << (Tmp - 1);
+        }
+
+        cm->filter_type = NORMAL_LOOPFILTER;
+
+        if (Speed >= 14)
+            cm->filter_type = SIMPLE_LOOPFILTER;
+
+        if (Speed >= 15)
+        {
+            sf->half_pixel_search = 0;        // This has a big hit on quality. Last resort
+        }
+
+        vpx_memset(cpi->error_bins, 0, sizeof(cpi->error_bins));
+
+    }
+
+    if (cpi->sf.search_method == NSTEP)
+    {
+        vp8_init3smotion_compensation(&cpi->mb, cm->last_frame.y_stride);
+    }
+    else if (cpi->sf.search_method == DIAMOND)
+    {
+        vp8_init_dsmotion_compensation(&cpi->mb, cm->last_frame.y_stride);
+    }
+
+    if (cpi->sf.improved_dct)
+    {
+        cpi->mb.vp8_short_fdct8x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
+        cpi->mb.vp8_short_fdct4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
+        cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short8x4);
+        cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
+    }
+    else
+    {
+        cpi->mb.vp8_short_fdct8x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
+        cpi->mb.vp8_short_fdct4x4   = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
+        cpi->mb.short_fdct8x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast8x4);
+        cpi->mb.short_fdct4x4rd = FDCT_INVOKE(&cpi->rtcd.fdct, fast4x4);
+    }
+
+    cpi->mb.vp8_short_fdct4x4_ptr = FDCT_INVOKE(&cpi->rtcd.fdct, short4x4);
+    cpi->mb.short_walsh4x4 = FDCT_INVOKE(&cpi->rtcd.fdct, walsh_short4x4);
+
+    if (cpi->sf.improved_quant)
+    {
+        cpi->mb.quantize_b    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+        cpi->mb.quantize_brd  = QUANTIZE_INVOKE(&cpi->rtcd.quantize, quantb);
+    }
+    else
+    {
+        cpi->mb.quantize_b      = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+        cpi->mb.quantize_brd    = QUANTIZE_INVOKE(&cpi->rtcd.quantize, fastquantb);
+    }
+
+#if CONFIG_RUNTIME_CPU_DETECT
+    cpi->mb.e_mbd.rtcd = &cpi->common.rtcd;
+#endif
+
+    if (cpi->sf.iterative_sub_pixel == 1)
+    {
+        cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step_iteratively;
+    }
+    else if (cpi->sf.quarter_pixel_search)
+    {
+        cpi->find_fractional_mv_step = vp8_find_best_sub_pixel_step;
+    }
+    else if (cpi->sf.half_pixel_search)
+    {
+        cpi->find_fractional_mv_step = vp8_find_best_half_pixel_step;
+    }
+    else
+    {
+        cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+    }
+
+    if (cpi->sf.optimize_coefficients == 1)
+        cpi->mb.optimize = 1;
+    else
+        cpi->mb.optimize = 0;
+
+    if (cpi->common.full_pixel)
+        cpi->find_fractional_mv_step = vp8_skip_fractional_mv_step;
+
+#ifdef SPEEDSTATS
+    frames_at_speed[cpi->Speed]++;
+#endif
+}
+static void alloc_raw_frame_buffers(VP8_COMP *cpi)
+{
+    int i, buffers;
+
+    buffers = cpi->oxcf.lag_in_frames;
+
+    if (buffers > MAX_LAG_BUFFERS)
+        buffers = MAX_LAG_BUFFERS;
+
+    if (buffers < 1)
+        buffers = 1;
+
+    for (i = 0; i < buffers; i++)
+        if (vp8_yv12_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer,
+                                        cpi->oxcf.Width, cpi->oxcf.Height,
+                                        16))
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate lag buffer");
+
+#if VP8_TEMPORAL_ALT_REF
+
+    if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer,
+                                    cpi->oxcf.Width, cpi->oxcf.Height, 16))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate altref buffer");
+
+#endif
+
+    cpi->source_buffer_count = 0;
+}
+void vp8_alloc_compressor_data(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    int width = cm->Width;
+    int height = cm->Height;
+
+    if (vp8_alloc_frame_buffers(cm, width, height))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate frame buffers");
+
+    if ((width & 0xf) != 0)
+        width += 16 - (width & 0xf);
+
+    if ((height & 0xf) != 0)
+        height += 16 - (height & 0xf);
+
+
+    if (vp8_yv12_alloc_frame_buffer(&cpi->last_frame_uf,
+                                    width, height, VP8BORDERINPIXELS))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate last frame buffer");
+
+    if (vp8_yv12_alloc_frame_buffer(&cpi->scaled_source, width, height, 16))
+        vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate scaled source buffer");
+
+
+    if (cpi->tok != 0)
+        vpx_free(cpi->tok);
+
+    {
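+        // Pool sized as 24 4x4 blocks per MB (16 Y + 4 U + 4 V) at 16 tokens
+        // per block.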
+        unsigned int tokens = cm->mb_rows * cm->mb_cols * 24 * 16;
+
+        CHECK_MEM_ERROR(cpi->tok, vpx_calloc(tokens, sizeof(*cpi->tok)));
+    }
+
+    // Data used for real time vc mode to see if gf needs refreshing
+    cpi->inter_zz_count = 0;
+    cpi->gf_bad_count = 0;
+    cpi->gf_update_recommended = 0;
+}
+
+
+// Quant MOD
+static const int q_trans[] =
+{
+    0,   1,  2,  3,  4,  5,  7,  8,
+    9,  10, 12, 13, 15, 17, 18, 19,
+    20,  21, 23, 24, 25, 26, 27, 28,
+    29,  30, 31, 33, 35, 37, 39, 41,
+    43,  45, 47, 49, 51, 53, 55, 57,
+    59,  61, 64, 67, 70, 73, 76, 79,
+    82,  85, 88, 91, 94, 97, 100, 103,
+    106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+int vp8_reverse_trans(int x)
+{
+    int i;
+
+    for (i = 0; i < 64; i++)
+        if (q_trans[i] >= x)
+            return i;
+
+    return 63;
+}
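+
+// q_trans maps the external 0..63 quantizer scale onto the internal 0..127
+// QINDEX range, and vp8_reverse_trans inverts it by returning the first index
+// whose entry reaches x: e.g. vp8_reverse_trans(64) == 42 since q_trans[42] == 64.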
+void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
+{
+    cpi->oxcf.frame_rate             = framerate;
+    cpi->output_frame_rate            = cpi->oxcf.frame_rate;
+    cpi->per_frame_bandwidth          = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+    cpi->av_per_frame_bandwidth        = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
+    cpi->min_frame_bandwidth          = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+    cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
+    cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
+
+    cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
+    cpi->long_rolling_actual_bits      = cpi->av_per_frame_bandwidth;
+    cpi->max_gf_interval = (int)(cpi->output_frame_rate / 2) + 2;
+
+    //cpi->max_gf_interval = (int)(cpi->output_frame_rate * 2 / 3) + 1;
+    //cpi->max_gf_interval = 24;
+
+    if (cpi->max_gf_interval < 12)
+        cpi->max_gf_interval = 12;
+
+
+    // Special conditions when alt ref frame enabled
+    if (cpi->oxcf.play_alternate)
+    {
+        if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+            cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+    }
+}
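+
+// Worked example of the above: at 30 fps with a 300 kbit/s target (already
+// converted to 300000 bits/s by the config code below), per_frame_bandwidth
+// = 300000 / 30 = 10000 bits and max_gf_interval = 30/2 + 2 = 17 frames,
+// which clears the minimum of 12.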
+
+void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+    VP8_COMMON *cm = &cpi->common;
+
+    if (!cpi)
+        return;
+
+    cpi->auto_gold = 1;
+    cpi->auto_adjust_gold_quantizer = 1;
+    cpi->goldquantizer = 1;
+    cpi->goldfreq = 7;
+    cpi->auto_adjust_key_quantizer = 1;
+    cpi->keyquantizer = 1;
+
+    // Guard the dereference: oxcf may be null (see the default-config branch below)
+    if (oxcf)
+    {
+        cm->version = oxcf->Version;
+        vp8_setup_version(cm);
+    }
+
+    if (oxcf == 0)
+    {
+        cpi->pass                     = 0;
+
+        cpi->auto_worst_q              = 0;
+        cpi->oxcf.best_allowed_q            = MINQ;
+        cpi->oxcf.worst_allowed_q           = MAXQ;
+
+        cpi->oxcf.end_usage                = USAGE_STREAM_FROM_SERVER;
+        cpi->oxcf.starting_buffer_level     =   4;
+        cpi->oxcf.optimal_buffer_level      =   5;
+        cpi->oxcf.maximum_buffer_size       =   6;
+        cpi->oxcf.under_shoot_pct           =  90;
+        cpi->oxcf.allow_df                 =   0;
+        cpi->oxcf.drop_frames_water_mark     =  20;
+
+        cpi->oxcf.allow_spatial_resampling  = 0;
+        cpi->oxcf.resample_down_water_mark   = 40;
+        cpi->oxcf.resample_up_water_mark     = 60;
+
+        cpi->oxcf.fixed_q = cpi->interquantizer;
+
+        cpi->filter_type = NORMAL_LOOPFILTER;
+
+        if (cm->simpler_lpf)
+            cpi->filter_type = SIMPLE_LOOPFILTER;
+
+        cpi->compressor_speed = 1;
+        cpi->horiz_scale = 0;
+        cpi->vert_scale = 0;
+        cpi->oxcf.two_pass_vbrbias = 50;
+        cpi->oxcf.two_pass_vbrmax_section = 400;
+        cpi->oxcf.two_pass_vbrmin_section = 0;
+
+        cpi->oxcf.Sharpness = 0;
+        cpi->oxcf.noise_sensitivity = 0;
+    }
+    else
+        cpi->oxcf = *oxcf;
+
+
+    switch (cpi->oxcf.Mode)
+    {
+
+    case MODE_REALTIME:
+        cpi->pass = 0;
+        cpi->compressor_speed = 2;
+
+        if (cpi->oxcf.cpu_used < -16)
+        {
+            cpi->oxcf.cpu_used = -16;
+        }
+
+        if (cpi->oxcf.cpu_used > 16)
+            cpi->oxcf.cpu_used = 16;
+
+        break;
+
+#if !(CONFIG_REALTIME_ONLY)
+    case MODE_GOODQUALITY:
+        cpi->pass = 0;
+        cpi->compressor_speed = 1;
+
+        if (cpi->oxcf.cpu_used < -5)
+        {
+            cpi->oxcf.cpu_used = -5;
+        }
+
+        if (cpi->oxcf.cpu_used > 5)
+            cpi->oxcf.cpu_used = 5;
+
+        break;
+
+    case MODE_BESTQUALITY:
+        cpi->pass = 0;
+        cpi->compressor_speed = 0;
+        break;
+
+    case MODE_FIRSTPASS:
+        cpi->pass = 1;
+        cpi->compressor_speed = 1;
+        break;
+    case MODE_SECONDPASS:
+        cpi->pass = 2;
+        cpi->compressor_speed = 1;
+
+        if (cpi->oxcf.cpu_used < -5)
+        {
+            cpi->oxcf.cpu_used = -5;
+        }
+
+        if (cpi->oxcf.cpu_used > 5)
+            cpi->oxcf.cpu_used = 5;
+
+        break;
+    case MODE_SECONDPASS_BEST:
+        cpi->pass = 2;
+        cpi->compressor_speed = 0;
+        break;
+#endif
+    }
+
+    if (cpi->pass == 0)
+        cpi->auto_worst_q = 1;
+
+    cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+    cpi->oxcf.best_allowed_q  = q_trans[oxcf->best_allowed_q];
+
+    if (oxcf->fixed_q >= 0)
+    {
+        if (oxcf->worst_allowed_q < 0)
+            cpi->oxcf.fixed_q = q_trans[0];
+        else
+            cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+        if (oxcf->alt_q < 0)
+            cpi->oxcf.alt_q = q_trans[0];
+        else
+            cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+        if (oxcf->key_q < 0)
+            cpi->oxcf.key_q = q_trans[0];
+        else
+            cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+        if (oxcf->gold_q < 0)
+            cpi->oxcf.gold_q = q_trans[0];
+        else
+            cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+    }
+
+    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+    //cpi->use_golden_frame_only = 0;
+    //cpi->use_last_frame_only = 0;
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+    cm->refresh_entropy_probs = 1;
+
+    if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+    setup_features(cpi);
+
+    {
+        int i;
+
+        for (i = 0; i < MAX_MB_SEGMENTS; i++)
+            cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+    }
+
+    // At the moment the first order values may not be > MAXQ
+    if (cpi->oxcf.fixed_q > MAXQ)
+        cpi->oxcf.fixed_q = MAXQ;
+
+    // local file playback mode == really big buffer
+    if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+    {
+        cpi->oxcf.starting_buffer_level   = 60;
+        cpi->oxcf.optimal_buffer_level    = 60;
+        cpi->oxcf.maximum_buffer_size     = 240;
+
+    }
+
+
+    // Convert target bandwidth from Kbit/s to Bit/s
+    cpi->oxcf.target_bandwidth       *= 1000;
+    cpi->oxcf.starting_buffer_level   *= cpi->oxcf.target_bandwidth;
+
+    if (cpi->oxcf.optimal_buffer_level == 0)
+        cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+    else
+        cpi->oxcf.optimal_buffer_level *= cpi->oxcf.target_bandwidth;
+
+    if (cpi->oxcf.maximum_buffer_size == 0)
+        cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+    else
+        cpi->oxcf.maximum_buffer_size     *= cpi->oxcf.target_bandwidth;
+
+    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
+    cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
+
+    vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
+    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
+    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
+    cpi->best_quality                = cpi->oxcf.best_allowed_q;
+    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+
+    cpi->total_actual_bits            = 0;
+    cpi->total_target_vs_actual        = 0;
+
+    // Only allow dropped frames in buffered mode
+    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+    cm->filter_type      = (LOOPFILTERTYPE) cpi->filter_type;
+
+    if (!cm->use_bilinear_mc_filter)
+        cm->mcomp_filter_type = SIXTAP;
+    else
+        cm->mcomp_filter_type = BILINEAR;
+
+    cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+    cm->Width       = cpi->oxcf.Width     ;
+    cm->Height      = cpi->oxcf.Height    ;
+
+    cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+
+    cm->horiz_scale  = cpi->horiz_scale;
+    cm->vert_scale   = cpi->vert_scale ;
+
+    // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+    if (cpi->oxcf.Sharpness > 7)
+        cpi->oxcf.Sharpness = 7;
+
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+    {
+        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+        Scale2Ratio(cm->horiz_scale, &hr, &hs);
+        Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+        // always go to the next whole number
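+        // e.g. assuming Scale2Ratio maps a four-fifths scale to hr/hs = 4/5,
+        // a 640-pixel width becomes (4 + 640*4) / 5 = 512.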
+        cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+        cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
+    }
+
+    if (((cm->Width + 15) & 0xfffffff0) != cm->last_frame.y_width ||
+        ((cm->Height + 15) & 0xfffffff0) != cm->last_frame.y_height ||
+        cm->last_frame.y_width == 0)
+    {
+        alloc_raw_frame_buffers(cpi);
+        vp8_alloc_compressor_data(cpi);
+    }
+
+    // Clamp KF frame size to quarter of data rate
+    if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
+        cpi->intra_frame_target = cpi->target_bandwidth >> 2;
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        cpi->last_q[0] = cpi->oxcf.fixed_q;
+        cpi->last_q[1] = cpi->oxcf.fixed_q;
+    }
+
+    cpi->Speed = cpi->oxcf.cpu_used;
+
+    // Force allow_lag to 0 if lag_in_frames is 0.
+    if (cpi->oxcf.lag_in_frames == 0)
+    {
+        cpi->oxcf.allow_lag = 0;
+    }
+    // Limit on lag buffers as these are not currently dynamically allocated
+    else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+        cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+    // Force play_alternate to 0 if allow_lag is 0, lag_in_frames is too small, or this is not a second-pass encode (real-time or one-pass modes).
+    if (cpi->oxcf.allow_lag == 0 || cpi->oxcf.lag_in_frames <= 5 || (cpi->oxcf.Mode < MODE_SECONDPASS))
+    {
+        cpi->oxcf.play_alternate = 0;
+        cpi->ref_frame_flags = cpi->ref_frame_flags & ~VP8_ALT_FLAG;
+    }
+
+    // YX Temp
+    cpi->last_alt_ref_sei    = -1;
+    cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+    // Experimental RD Code
+    cpi->frame_distortion = 0;
+    cpi->last_frame_distortion = 0;
+#endif
+
+#if VP8_TEMPORAL_ALT_REF
+    {
+        int i;
+
+        cpi->fixed_divide[0] = 0;
+
+        for (i = 1; i < 255; i++)
+            cpi->fixed_divide[i] = 0x10000 / i;
+    }
+#endif
+}
+
+/*
+ * This function needs more clean-up, i.e. to be tuned more towards
+ * change_config rather than init_config.
+ * YX - 5/28/2009
+ */
+
+void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+    VP8_COMMON *cm = &cpi->common;
+
+    if (!cpi)
+        return;
+
+    if (!oxcf)
+        return;
+
+    if (cm->version != oxcf->Version)
+    {
+        cm->version = oxcf->Version;
+        vp8_setup_version(cm);
+    }
+
+    cpi->oxcf = *oxcf;
+
+    switch (cpi->oxcf.Mode)
+    {
+
+    case MODE_REALTIME:
+        cpi->pass = 0;
+        cpi->compressor_speed = 2;
+
+        if (cpi->oxcf.cpu_used < -16)
+        {
+            cpi->oxcf.cpu_used = -16;
+        }
+
+        if (cpi->oxcf.cpu_used > 16)
+            cpi->oxcf.cpu_used = 16;
+
+        break;
+
+#if !(CONFIG_REALTIME_ONLY)
+    case MODE_GOODQUALITY:
+        cpi->pass = 0;
+        cpi->compressor_speed = 1;
+
+        if (cpi->oxcf.cpu_used < -5)
+        {
+            cpi->oxcf.cpu_used = -5;
+        }
+
+        if (cpi->oxcf.cpu_used > 5)
+            cpi->oxcf.cpu_used = 5;
+
+        break;
+
+    case MODE_BESTQUALITY:
+        cpi->pass = 0;
+        cpi->compressor_speed = 0;
+        break;
+
+    case MODE_FIRSTPASS:
+        cpi->pass = 1;
+        cpi->compressor_speed = 1;
+        break;
+    case MODE_SECONDPASS:
+        cpi->pass = 2;
+        cpi->compressor_speed = 1;
+
+        if (cpi->oxcf.cpu_used < -5)
+        {
+            cpi->oxcf.cpu_used = -5;
+        }
+
+        if (cpi->oxcf.cpu_used > 5)
+            cpi->oxcf.cpu_used = 5;
+
+        break;
+    case MODE_SECONDPASS_BEST:
+        cpi->pass = 2;
+        cpi->compressor_speed = 0;
+        break;
+#endif
+    }
+
+    if (cpi->pass == 0)
+        cpi->auto_worst_q = 1;
+
+    cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
+    cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+
+    if (oxcf->fixed_q >= 0)
+    {
+        if (oxcf->worst_allowed_q < 0)
+            cpi->oxcf.fixed_q = q_trans[0];
+        else
+            cpi->oxcf.fixed_q = q_trans[oxcf->worst_allowed_q];
+
+        if (oxcf->alt_q < 0)
+            cpi->oxcf.alt_q = q_trans[0];
+        else
+            cpi->oxcf.alt_q = q_trans[oxcf->alt_q];
+
+        if (oxcf->key_q < 0)
+            cpi->oxcf.key_q = q_trans[0];
+        else
+            cpi->oxcf.key_q = q_trans[oxcf->key_q];
+
+        if (oxcf->gold_q < 0)
+            cpi->oxcf.gold_q = q_trans[0];
+        else
+            cpi->oxcf.gold_q = q_trans[oxcf->gold_q];
+
+    }
+
+    cpi->baseline_gf_interval = cpi->oxcf.alt_freq ? cpi->oxcf.alt_freq : DEFAULT_GF_INTERVAL;
+
+    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+    //cpi->use_golden_frame_only = 0;
+    //cpi->use_last_frame_only = 0;
+    cm->refresh_golden_frame = 0;
+    cm->refresh_last_frame = 1;
+    cm->refresh_entropy_probs = 1;
+
+    if (cpi->oxcf.token_partitions >= 0 && cpi->oxcf.token_partitions <= 3)
+        cm->multi_token_partition = (TOKEN_PARTITION) cpi->oxcf.token_partitions;
+
+    setup_features(cpi);
+
+    {
+        int i;
+
+        for (i = 0; i < MAX_MB_SEGMENTS; i++)
+            cpi->segment_encode_breakout[i] = cpi->oxcf.encode_breakout;
+    }
+
+    // At the moment the first order values may not be > MAXQ
+    if (cpi->oxcf.fixed_q > MAXQ)
+        cpi->oxcf.fixed_q = MAXQ;
+
+    // local file playback mode == really big buffer
+    if (cpi->oxcf.end_usage == USAGE_LOCAL_FILE_PLAYBACK)
+    {
+        cpi->oxcf.starting_buffer_level   = 60;
+        cpi->oxcf.optimal_buffer_level    = 60;
+        cpi->oxcf.maximum_buffer_size     = 240;
+
+    }
+
+    // Convert target bandwidth from Kbit/s to Bit/s
+    cpi->oxcf.target_bandwidth       *= 1000;
+
+    cpi->oxcf.starting_buffer_level   *= cpi->oxcf.target_bandwidth;
+
+    if (cpi->oxcf.optimal_buffer_level == 0)
+        cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8;
+    else
+        cpi->oxcf.optimal_buffer_level *= cpi->oxcf.target_bandwidth;
+
+    if (cpi->oxcf.maximum_buffer_size == 0)
+        cpi->oxcf.maximum_buffer_size = cpi->oxcf.target_bandwidth / 8;
+    else
+        cpi->oxcf.maximum_buffer_size     *= cpi->oxcf.target_bandwidth;
+
+    cpi->buffer_level                = cpi->oxcf.starting_buffer_level;
+    cpi->bits_off_target              = cpi->oxcf.starting_buffer_level;
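+
+    // Illustrative example (hypothetical numbers): with a 500 kbit/s target,
+    // the conversion above gives target_bandwidth = 500000 bit/s, so a
+    // configured starting_buffer_level of 6 scales to 6 * 500000 = 3000000
+    // bits of initial buffer fullness.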
+
+    vp8_new_frame_rate(cpi, cpi->oxcf.frame_rate);
+    cpi->worst_quality               = cpi->oxcf.worst_allowed_q;
+    cpi->active_worst_quality         = cpi->oxcf.worst_allowed_q;
+    cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
+    cpi->best_quality                = cpi->oxcf.best_allowed_q;
+    cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+    cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
+
+
+    cpi->total_actual_bits            = 0;
+    cpi->total_target_vs_actual        = 0;
+
+    // Only allow dropped frames in buffered mode
+    cpi->drop_frames_allowed          = cpi->oxcf.allow_df && cpi->buffered_mode;
+
+    cm->filter_type                  = (LOOPFILTERTYPE) cpi->filter_type;
+
+    if (!cm->use_bilinear_mc_filter)
+        cm->mcomp_filter_type = SIXTAP;
+    else
+        cm->mcomp_filter_type = BILINEAR;
+
+    cpi->target_bandwidth = cpi->oxcf.target_bandwidth;
+
+    cm->Width       = cpi->oxcf.Width     ;
+    cm->Height      = cpi->oxcf.Height    ;
+
+    cm->horiz_scale  = cpi->horiz_scale;
+    cm->vert_scale   = cpi->vert_scale ;
+
+    cpi->intra_frame_target           = (4 * (cm->Width + cm->Height) / 15) * 1000; // As per VP8
+
+    // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs)
+    if (cpi->oxcf.Sharpness > 7)
+        cpi->oxcf.Sharpness = 7;
+
+    cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    if (cm->horiz_scale != NORMAL || cm->vert_scale != NORMAL)
+    {
+        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+
+        Scale2Ratio(cm->horiz_scale, &hr, &hs);
+        Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+        // always go to the next whole number
+        cm->Width = (hs - 1 + cpi->oxcf.Width * hr) / hs;
+        cm->Height = (vs - 1 + cpi->oxcf.Height * vr) / vs;
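+        // e.g. (illustrative) Width = 176, hr = 3, hs = 5:
+        // (4 + 176 * 3) / 5 = 532 / 5 = 106 = ceil(528 / 5)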
+    }
+
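+    // (Width + 15) & 0xfffffff0 rounds up to the next multiple of 16, the
+    // macroblock size (e.g. 177 -> 192), so buffers are reallocated whenever
+    // the padded dimensions change or nothing has been allocated yet.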
+    if (((cm->Width + 15) & 0xfffffff0) != cm->last_frame.y_width ||
+        ((cm->Height + 15) & 0xfffffff0) != cm->last_frame.y_height ||
+        cm->last_frame.y_width == 0)
+    {
+        alloc_raw_frame_buffers(cpi);
+        vp8_alloc_compressor_data(cpi);
+    }
+
+    // Clamp KF frame size to quarter of data rate
+    if (cpi->intra_frame_target > cpi->target_bandwidth >> 2)
+        cpi->intra_frame_target = cpi->target_bandwidth >> 2;
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        cpi->last_q[0] = cpi->oxcf.fixed_q;
+        cpi->last_q[1] = cpi->oxcf.fixed_q;
+    }
+
+    cpi->Speed = cpi->oxcf.cpu_used;
+
+    // Force allow_lag to 0 if lag_in_frames is 0.
+    if (cpi->oxcf.lag_in_frames == 0)
+    {
+        cpi->oxcf.allow_lag = 0;
+    }
+    // Limit on lag buffers as these are not currently dynamically allocated
+    else if (cpi->oxcf.lag_in_frames > MAX_LAG_BUFFERS)
+        cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS;
+
+    // Force play_alternate to 0 if allow_lag is 0, lag_in_frames is too small, or the mode is real time or one pass.
+    if (cpi->oxcf.allow_lag == 0 || cpi->oxcf.lag_in_frames <= 5 || (cpi->oxcf.Mode < MODE_SECONDPASS))
+    {
+        cpi->oxcf.play_alternate = 0;
+        cpi->ref_frame_flags = cpi->ref_frame_flags & ~VP8_ALT_FLAG;
+    }
+
+    // YX Temp
+    cpi->last_alt_ref_sei    = -1;
+    cpi->is_src_frame_alt_ref = 0;
+
+#if 0
+    // Experimental RD Code
+    cpi->frame_distortion = 0;
+    cpi->last_frame_distortion = 0;
+#endif
+
+}
+
+#define M_LOG2_E 0.693147180559945309417
+#define log2f(x) (log (x) / (float) M_LOG2_E)
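+// Note: the constant above is actually ln 2 (M_LN2); the standard M_LOG2_E is
+// log2(e) ~= 1.4427. Dividing the natural log by ln 2 does yield log2(x), so
+// the macro is numerically correct despite the misleading name.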
+static void cal_mvsadcosts(int *mvsadcost[2])
+{
+    int i = 1;
+
+    mvsadcost [0] [0] = 300;
+    mvsadcost [1] [0] = 300;
+
+    do
+    {
+        double z = 256 * (2 * (log2f(2 * i) + .6));
+        mvsadcost [0][i] = (int) z;
+        mvsadcost [1][i] = (int) z;
+        mvsadcost [0][-i] = (int) z;
+        mvsadcost [1][-i] = (int) z;
+    }
+    while (++i <= mv_max);
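+
+    // Sanity check (illustrative): i = 1 gives
+    // z = 256 * (2 * (log2(2) + .6)) = 256 * 3.2 = 819.2, stored as 819.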
+}
+
+VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
+{
+    int i;
+    volatile union
+    {
+        VP8_COMP *cpi;
+        VP8_PTR   ptr;
+    } ctx;
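+    // ctx is volatile (and aliased through a union) so the compressor pointer
+    // remains valid when a longjmp() returns control to the setjmp() error
+    // handler below; locals modified after setjmp() are otherwise indeterminate.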
+    
+    VP8_COMP *cpi;
+    VP8_COMMON *cm;
+
+    cpi = ctx.cpi = vpx_memalign(32, sizeof(VP8_COMP));
+    // Check that the CPI instance is valid
+    if (!cpi)
+        return 0;
+
+    cm = &cpi->common;
+
+    vpx_memset(cpi, 0, sizeof(VP8_COMP));
+
+    if (setjmp(cm->error.jmp))
+    {
+        VP8_PTR ptr = ctx.ptr;
+
+        ctx.cpi->common.error.setjmp = 0;
+        vp8_remove_compressor(&ptr);
+        return 0;
+    }
+
+    cpi->common.error.setjmp = 1;
+
+    CHECK_MEM_ERROR(cpi->rdtok, vpx_calloc(256 * 3 / 2, sizeof(TOKENEXTRA)));
+    CHECK_MEM_ERROR(cpi->mb.ss, vpx_calloc(sizeof(search_site), (MAX_MVSEARCH_STEPS * 8) + 1));
+
+    vp8_cmachine_specific_config(cpi);
+    vp8_create_common(&cpi->common);
+
+    vp8_init_config((VP8_PTR)cpi, oxcf);
+
+    memcpy(cpi->base_skip_false_prob, vp8cx_base_skip_false_prob, sizeof(vp8cx_base_skip_false_prob));
+    cpi->common.current_video_frame   = 0;
+    cpi->kf_overspend_bits            = 0;
+    cpi->kf_bitrate_adjustment        = 0;
+    cpi->frames_till_gf_update_due      = 0;
+    cpi->gf_overspend_bits            = 0;
+    cpi->non_gf_bitrate_adjustment     = 0;
+    cpi->prob_last_coded              = 128;
+    cpi->prob_gf_coded                = 128;
+    cpi->prob_intra_coded             = 63;
+
+    // Prime the recent reference frame usage counters.
+    // Hereafter they will be maintained as a sort of moving average
+    cpi->recent_ref_frame_usage[INTRA_FRAME]  = 1;
+    cpi->recent_ref_frame_usage[LAST_FRAME]   = 1;
+    cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+    cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+
+    // Set reference frame sign bias for ALTREF frame to 1 (for now)
+    cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+
+    cpi->gf_decay_rate = 0;
+    cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+
+    cpi->gold_is_last = 0 ;
+    cpi->alt_is_last  = 0 ;
+    cpi->gold_is_alt  = 0 ;
+
+
+
+    // Create the encoder segmentation map and set all entries to 0
+    CHECK_MEM_ERROR(cpi->segmentation_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+    CHECK_MEM_ERROR(cpi->active_map, vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+    vpx_memset(cpi->active_map , 1, (cpi->common.mb_rows * cpi->common.mb_cols));
+    cpi->active_map_enabled = 0;
+
+    // Create the first pass motion map structure and set to 0
+    CHECK_MEM_ERROR(cpi->fp_motion_map, vpx_calloc(cpi->common.MBs, 1));
+
+#if 0
+    // Experimental code for lagged and one pass
+    // Initialise one_pass GF frames stats
+    // Update stats used for GF selection
+    if (cpi->pass == 0)
+    {
+        cpi->one_pass_frame_index = 0;
+
+        for (i = 0; i < MAX_LAG_BUFFERS; i++)
+        {
+            cpi->one_pass_frame_stats[i].frames_so_far = 0;
+            cpi->one_pass_frame_stats[i].frame_intra_error = 0.0;
+            cpi->one_pass_frame_stats[i].frame_coded_error = 0.0;
+            cpi->one_pass_frame_stats[i].frame_pcnt_inter = 0.0;
+            cpi->one_pass_frame_stats[i].frame_pcnt_motion = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvr = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvr_abs = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvc = 0.0;
+            cpi->one_pass_frame_stats[i].frame_mvc_abs = 0.0;
+        }
+    }
+#endif
+
+    // Should we use the cyclic refresh method?
+    // Currently this is tied to error resilient mode
+    cpi->cyclic_refresh_mode_enabled = cpi->oxcf.error_resilient_mode;
+    cpi->cyclic_refresh_mode_max_mbs_perframe = (cpi->common.mb_rows * cpi->common.mb_cols) / 40;
+    cpi->cyclic_refresh_mode_index = 0;
+    cpi->cyclic_refresh_q = 32;
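+
+    // Illustrative: a 640x480 clip has 40 * 30 = 1200 MBs, so at most
+    // 1200 / 40 = 30 MBs would be refreshed per frame.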
+
+    if (cpi->cyclic_refresh_mode_enabled)
+    {
+        CHECK_MEM_ERROR(cpi->cyclic_refresh_map, vpx_calloc((cpi->common.mb_rows * cpi->common.mb_cols), 1));
+    }
+    else
+        cpi->cyclic_refresh_map = (signed char *) NULL;
+
+    // Test function for segmentation
+    //segmentation_test_function((VP8_PTR) cpi);
+
+    // Loop filter mode / ref deltas test function
+    //mode_ref_lf_test_function(cpi);
+
+#ifdef ENTROPY_STATS
+    init_context_counters();
+#endif
+
+
+#ifdef INTRARDOPT
+    cpi->intra_rd_opt = 1;
+
+#endif
+
+    cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
+    cpi->key_frame_frequency = cpi->oxcf.key_freq;
+
+    cpi->source_alt_ref_pending = FALSE;
+    cpi->source_alt_ref_active = FALSE;
+    cpi->common.refresh_alt_ref_frame = 0;
+
+    cpi->b_calculate_psnr = CONFIG_PSNR;
+#if CONFIG_PSNR
+    cpi->b_calculate_ssimg = 0;
+
+    cpi->count = 0;
+    cpi->bytes = 0;
+
+    if (cpi->b_calculate_psnr)
+    {
+        cpi->total_sq_error = 0.0;
+        cpi->total_sq_error2 = 0.0;
+        cpi->total_y = 0.0;
+        cpi->total_u = 0.0;
+        cpi->total_v = 0.0;
+        cpi->total = 0.0;
+        cpi->totalp_y = 0.0;
+        cpi->totalp_u = 0.0;
+        cpi->totalp_v = 0.0;
+        cpi->totalp = 0.0;
+        cpi->tot_recode_hits = 0;
+        cpi->summed_quality = 0;
+        cpi->summed_weights = 0;
+    }
+
+    if (cpi->b_calculate_ssimg)
+    {
+        cpi->total_ssimg_y = 0;
+        cpi->total_ssimg_u = 0;
+        cpi->total_ssimg_v = 0;
+        cpi->total_ssimg_all = 0;
+    }
+
+#ifndef LLONG_MAX
+#define LLONG_MAX  9223372036854775807LL
+#endif
+    cpi->first_time_stamp_ever = LLONG_MAX;
+
+#endif
+
+    cpi->frames_till_gf_update_due      = 0;
+    cpi->key_frame_count              = 1;
+    cpi->tot_key_frame_bits            = 0;
+
+    cpi->ni_av_qi                     = cpi->oxcf.worst_allowed_q;
+    cpi->ni_tot_qi                    = 0;
+    cpi->ni_frames                   = 0;
+    cpi->total_byte_count             = 0;
+
+    cpi->drop_frame                  = 0;
+    cpi->drop_count                  = 0;
+    cpi->max_drop_count               = 0;
+    cpi->max_consec_dropped_frames     = 4;
+
+    cpi->rate_correction_factor         = 1.0;
+    cpi->key_frame_rate_correction_factor = 1.0;
+    cpi->gf_rate_correction_factor  = 1.0;
+    cpi->est_max_qcorrection_factor  = 1.0;
+
+    cpi->mb.mvcost[0] = &cpi->mb.mvcosts[0][mv_max+1];
+    cpi->mb.mvcost[1] = &cpi->mb.mvcosts[1][mv_max+1];
+    cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mv_max+1];
+    cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mv_max+1];
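+
+    // The cost pointers are offset to the middle of each table so they can be
+    // indexed by signed motion vector components, as cal_mvsadcosts() requires.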
+
+    cal_mvsadcosts(cpi->mb.mvsadcost);
+
+    for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+    {
+        cpi->prior_key_frame_size[i]     = cpi->intra_frame_target;
+        cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate;
+    }
+
+    cpi->check_freq[0] = 15;
+    cpi->check_freq[1] = 15;
+
+#ifdef OUTPUT_YUV_SRC
+    yuv_file = fopen("bd.yuv", "ab");
+#endif
+
+#if 0
+    framepsnr = fopen("framepsnr.stt", "a");
+    kf_list = fopen("kf_list.stt", "w");
+#endif
+
+    cpi->output_pkt_list = oxcf->output_pkt_list;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+    if (cpi->pass == 1)
+    {
+        vp8_init_first_pass(cpi);
+    }
+    else if (cpi->pass == 2)
+    {
+        cpi->stats_in = oxcf->two_pass_stats_in.buf;
+        cpi->stats_in_end = cpi->stats_in
+                            + oxcf->two_pass_stats_in.sz / sizeof(FIRSTPASS_STATS)
+                            - 1;
+        vp8_init_second_pass(cpi);
+    }
+
+#endif
+
+    if (cpi->compressor_speed == 2)
+    {
+        cpi->cpu_freq            = 0; //vp8_get_processor_freq();
+        cpi->avg_encode_time      = 0;
+        cpi->avg_pick_mode_time    = 0;
+    }
+
+    vp8_set_speed_features(cpi);
+
+    // Set starting values of RD threshold multipliers (128 = *1)
+    for (i = 0; i < MAX_MODES; i++)
+    {
+        cpi->rd_thresh_mult[i] = 128;
+    }
+
+#ifdef ENTROPY_STATS
+    init_mv_ref_counts();
+#endif
+
+    vp8cx_create_encoder_threads(cpi);
+
+    cpi->fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
+    cpi->fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
+    cpi->fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
+    cpi->fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
+    cpi->fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
+
+#if !(CONFIG_REALTIME_ONLY)
+    cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
+#endif
+    cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search);
+
+    cpi->ready_for_new_frame = 1;
+
+    cpi->source_encode_index = 0;
+
+    // make sure frame 1 is okay
+    cpi->error_bins[0] = cpi->common.MBs;
+
+    // vp8cx_init_quantizer() is first called here. Add a check in vp8cx_frame_init_quantizer()
+    // so that vp8cx_init_quantizer() is only called later when needed. This will avoid
+    // unnecessary calls of vp8cx_init_quantizer() for every frame.
+    vp8cx_init_quantizer(cpi);
+    {
+        vp8_init_loop_filter(cm);
+        cm->last_frame_type = KEY_FRAME;
+        cm->last_filter_type = cm->filter_type;
+        cm->last_sharpness_level = cm->sharpness_level;
+    }
+    cpi->common.error.setjmp = 0;
+    return (VP8_PTR) cpi;
+
+}
+
+
+void vp8_remove_compressor(VP8_PTR *ptr)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(*ptr);
+
+    if (!cpi)
+        return;
+
+    if (cpi && (cpi->common.current_video_frame > 0))
+    {
+#if !(CONFIG_REALTIME_ONLY)
+
+        if (cpi->pass == 2)
+        {
+            vp8_end_second_pass(cpi);
+        }
+
+#endif
+
+#ifdef ENTROPY_STATS
+        print_context_counters();
+        print_tree_update_probs();
+        print_mode_context();
+#endif
+
+#if CONFIG_PSNR
+
+        if (cpi->pass != 1)
+        {
+            FILE *f = fopen("opsnr.stt", "a");
+            double time_encoded = (cpi->source_end_time_stamp - cpi->first_time_stamp_ever) / 10000000.000;
+            double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data)   / 1000.000;
+            double dr = (double)cpi->bytes * (double) 8 / (double)1000  / time_encoded;
+
+            if (cpi->b_calculate_psnr)
+            {
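+                // 3.0 / 2 below: 4:2:0 carries 1.5 samples per pixel
+                // (Y plus quarter-resolution U and V planes).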
+                double samples = 3.0 / 2 * cpi->count * cpi->common.last_frame.y_width * cpi->common.last_frame.y_height;
+                double total_psnr = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error);
+                double total_psnr2 = vp8_mse2psnr(samples, 255.0, cpi->total_sq_error2);
+                double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0);
+
+                fprintf(f, "Bitrate\AVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t  Time(us)\n");
+                fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f %8.0f\n",
+                        dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim,
+                        total_encode_time);
+            }
+
+            if (cpi->b_calculate_ssimg)
+            {
+                fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t  Time(us)\n");
+                fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
+                        cpi->total_ssimg_y / cpi->count, cpi->total_ssimg_u / cpi->count,
+                        cpi->total_ssimg_v / cpi->count, cpi->total_ssimg_all / cpi->count, total_encode_time);
+            }
+
+            fclose(f);
+#if 0
+            f = fopen("qskip.stt", "a");
+            fprintf(f, "minq:%d -maxq:%d skipture:skipfalse = %d:%d\n", cpi->oxcf.best_allowed_q, cpi->oxcf.worst_allowed_q, skiptruecount, skipfalsecount);
+            fclose(f);
+#endif
+
+        }
+
+#endif
+
+
+#ifdef SPEEDSTATS
+
+        if (cpi->compressor_speed == 2)
+        {
+            int i;
+            FILE *f = fopen("cxspeed.stt", "a");
+            cnt_pm /= cpi->common.MBs;
+
+            for (i = 0; i < 16; i++)
+                fprintf(f, "%5d", frames_at_speed[i]);
+
+            fprintf(f, "\n");
+            //fprintf(f, "%10d PM %10d %10d %10d EF %10d %10d %10d\n", cpi->Speed, cpi->avg_pick_mode_time, (tot_pm/cnt_pm), cnt_pm,  cpi->avg_encode_time, 0, 0);
+            fclose(f);
+        }
+
+#endif
+
+
+#ifdef MODE_STATS
+        {
+            extern int count_mb_seg[4];
+            FILE *f = fopen("modes.stt", "a");
+            double dr = (double)cpi->oxcf.frame_rate * (double)bytes * (double)8 / (double)count / (double)1000 ;
+            fprintf(f, "intra_mode in Intra Frames:\n");
+            fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d\n", y_modes[0], y_modes[1], y_modes[2], y_modes[3], y_modes[4]);
+            fprintf(f, "UV:%8d, %8d, %8d, %8d\n", uv_modes[0], uv_modes[1], uv_modes[2], uv_modes[3]);
+            fprintf(f, "B: ");
+            {
+                int i;
+
+                for (i = 0; i < 10; i++)
+                    fprintf(f, "%8d, ", b_modes[i]);
+
+                fprintf(f, "\n");
+
+            }
+
+            fprintf(f, "Modes in Inter Frames:\n");
+            fprintf(f, "Y: %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d, %8d\n",
+                    inter_y_modes[0], inter_y_modes[1], inter_y_modes[2], inter_y_modes[3], inter_y_modes[4],
+                    inter_y_modes[5], inter_y_modes[6], inter_y_modes[7], inter_y_modes[8], inter_y_modes[9]);
+            fprintf(f, "UV:%8d, %8d, %8d, %8d\n", inter_uv_modes[0], inter_uv_modes[1], inter_uv_modes[2], inter_uv_modes[3]);
+            fprintf(f, "B: ");
+            {
+                int i;
+
+                for (i = 0; i < 15; i++)
+                    fprintf(f, "%8d, ", inter_b_modes[i]);
+
+                fprintf(f, "\n");
+
+            }
+            fprintf(f, "P:%8d, %8d, %8d, %8d\n", count_mb_seg[0], count_mb_seg[1], count_mb_seg[2], count_mb_seg[3]);
+            fprintf(f, "PB:%8d, %8d, %8d, %8d\n", inter_b_modes[LEFT4X4], inter_b_modes[ABOVE4X4], inter_b_modes[ZERO4X4], inter_b_modes[NEW4X4]);
+
+
+
+            fclose(f);
+        }
+#endif
+
+#ifdef ENTROPY_STATS
+        {
+            int i, j, k;
+            FILE *fmode = fopen("modecontext.c", "w");
+
+            fprintf(fmode, "\n#include \"entropymode.h\"\n\n");
+            fprintf(fmode, "const unsigned int vp8_kf_default_bmode_counts ");
+            fprintf(fmode, "[VP8_BINTRAMODES] [VP8_BINTRAMODES] [VP8_BINTRAMODES] =\n{\n");
+
+            for (i = 0; i < 10; i++)
+            {
+
+                fprintf(fmode, "    { //Above Mode :  %d\n", i);
+
+                for (j = 0; j < 10; j++)
+                {
+
+                    fprintf(fmode, "        {");
+
+                    for (k = 0; k < 10; k++)
+                    {
+                        if (!intra_mode_stats[i][j][k])
+                            fprintf(fmode, " %5d, ", 1);
+                        else
+                            fprintf(fmode, " %5d, ", intra_mode_stats[i][j][k]);
+                    }
+
+                    fprintf(fmode, "}, // left_mode %d\n", j);
+
+                }
+
+                fprintf(fmode, "    },\n");
+
+            }
+
+            fprintf(fmode, "};\n");
+        }
+#endif
+
+
+#if defined(SECTIONBITS_OUTPUT)
+
+        if (0)
+        {
+            int i;
+            FILE *f = fopen("tokenbits.stt", "a");
+
+            for (i = 0; i < 28; i++)
+                fprintf(f, "%8d", (int)(Sectionbits[i] / 256));
+
+            fprintf(f, "\n");
+            fclose(f);
+        }
+
+#endif
+
+#if 0
+        {
+            printf("\n_pick_loop_filter_level:%d\n", cpi->time_pick_lpf / 1000);
+            printf("\n_frames recive_data encod_mb_row compress_frame  Total\n");
+            printf("%6d %10ld %10ld %10ld %10ld\n", cpi->common.current_video_frame, cpi->time_receive_data / 1000, cpi->time_encode_mb_row / 1000, cpi->time_compress_data / 1000, (cpi->time_receive_data + cpi->time_compress_data) / 1000);
+        }
+#endif
+
+    }
+
+    vp8cx_remove_encoder_threads(cpi);
+
+    vp8_dealloc_compressor_data(cpi);
+    vpx_free(cpi->mb.ss);
+    vpx_free(cpi->tok);
+    vpx_free(cpi->rdtok);
+    vpx_free(cpi->cyclic_refresh_map);
+
+    vp8_remove_common(&cpi->common);
+    vpx_free(cpi);
+    *ptr = 0;
+
+#ifdef OUTPUT_YUV_SRC
+    fclose(yuv_file);
+#endif
+
+#if 0
+
+    if (keyfile)
+        fclose(keyfile);
+
+    if (framepsnr)
+        fclose(framepsnr);
+
+    if (kf_list)
+        fclose(kf_list);
+
+#endif
+
+}
+
+
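+// Sums squared error over a plane: the optimized 16x16 MSE kernel covers whole
+// macroblocks and plain C handles any right/bottom border remainder.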
+static uint64_t calc_plane_error(unsigned char *orig, int orig_stride,
+                                 unsigned char *recon, int recon_stride,
+                                 unsigned int cols, unsigned int rows,
+                                 vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned int row, col;
+    uint64_t total_sse = 0;
+    int diff;
+
+    for (row = 0; row + 16 <= rows; row += 16)
+    {
+        for (col = 0; col + 16 <= cols; col += 16)
+        {
+            unsigned int sse;
+
+            VARIANCE_INVOKE(rtcd, mse16x16)(orig + col, orig_stride,
+                                            recon + col, recon_stride,
+                                            &sse);
+            total_sse += sse;
+        }
+
+        /* Handle odd-sized width */
+        if (col < cols)
+        {
+            unsigned int   border_row, border_col;
+            unsigned char *border_orig = orig;
+            unsigned char *border_recon = recon;
+
+            for (border_row = 0; border_row < 16; border_row++)
+            {
+                for (border_col = col; border_col < cols; border_col++)
+                {
+                    diff = border_orig[border_col] - border_recon[border_col];
+                    total_sse += diff * diff;
+                }
+
+                border_orig += orig_stride;
+                border_recon += recon_stride;
+            }
+        }
+
+        orig += orig_stride * 16;
+        recon += recon_stride * 16;
+    }
+
+    /* Handle odd-sized height */
+    for (; row < rows; row++)
+    {
+        for (col = 0; col < cols; col++)
+        {
+            diff = orig[col] - recon[col];
+            total_sse += diff * diff;
+        }
+
+        orig += orig_stride;
+        recon += recon_stride;
+    }
+
+    return total_sse;
+}
+
+
+static void generate_psnr_packet(VP8_COMP *cpi)
+{
+    YV12_BUFFER_CONFIG      *orig = cpi->Source;
+    YV12_BUFFER_CONFIG      *recon = cpi->common.frame_to_show;
+    struct vpx_codec_cx_pkt  pkt;
+    uint64_t                 sse;
+    int                      i;
+    unsigned int             width = cpi->common.Width;
+    unsigned int             height = cpi->common.Height;
+
+    pkt.kind = VPX_CODEC_PSNR_PKT;
+    sse = calc_plane_error(orig->y_buffer, orig->y_stride,
+                           recon->y_buffer, recon->y_stride,
+                           width, height,
+                           IF_RTCD(&cpi->rtcd.variance));
+    pkt.data.psnr.sse[0] = sse;
+    pkt.data.psnr.sse[1] = sse;
+    pkt.data.psnr.samples[0] = width * height;
+    pkt.data.psnr.samples[1] = width * height;
+
+    width = (width + 1) / 2;
+    height = (height + 1) / 2;
+
+    sse = calc_plane_error(orig->u_buffer, orig->uv_stride,
+                           recon->u_buffer, recon->uv_stride,
+                           width, height,
+                           IF_RTCD(&cpi->rtcd.variance));
+    pkt.data.psnr.sse[0] += sse;
+    pkt.data.psnr.sse[2] = sse;
+    pkt.data.psnr.samples[0] += width * height;
+    pkt.data.psnr.samples[2] = width * height;
+
+    sse = calc_plane_error(orig->v_buffer, orig->uv_stride,
+                           recon->v_buffer, recon->uv_stride,
+                           width, height,
+                           IF_RTCD(&cpi->rtcd.variance));
+    pkt.data.psnr.sse[0] += sse;
+    pkt.data.psnr.sse[3] = sse;
+    pkt.data.psnr.samples[0] += width * height;
+    pkt.data.psnr.samples[3] = width * height;
+
+    for (i = 0; i < 4; i++)
+        pkt.data.psnr.psnr[i] = vp8_mse2psnr(pkt.data.psnr.samples[i], 255.0,
+                                             pkt.data.psnr.sse[i]);
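+
+    // vp8_mse2psnr(samples, 255.0, sse) presumably computes the usual 8-bit
+    // PSNR, 10 * log10(255^2 * samples / sse).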
+
+    vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+}
+
+
+int vp8_use_as_reference(VP8_PTR ptr, int ref_frame_flags)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    if (ref_frame_flags > 7)
+        return -1 ;
+
+    cpi->ref_frame_flags = ref_frame_flags;
+    return 0;
+}
+int vp8_update_reference(VP8_PTR ptr, int ref_frame_flags)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+
+    if (ref_frame_flags > 7)
+        return -1 ;
+
+    cpi->common.refresh_golden_frame = 0;
+    cpi->common.refresh_alt_ref_frame = 0;
+    cpi->common.refresh_last_frame   = 0;
+
+    if (ref_frame_flags & VP8_LAST_FLAG)
+        cpi->common.refresh_last_frame = 1;
+
+    if (ref_frame_flags & VP8_GOLD_FLAG)
+        cpi->common.refresh_golden_frame = 1;
+
+    if (ref_frame_flags & VP8_ALT_FLAG)
+        cpi->common.refresh_alt_ref_frame = 1;
+
+    return 0;
+}
+
+int vp8_get_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+    VP8_COMMON *cm = &cpi->common;
+
+    if (ref_frame_flag == VP8_LAST_FLAG)
+        vp8_yv12_copy_frame_ptr(&cm->last_frame, sd);
+
+    else if (ref_frame_flag == VP8_GOLD_FLAG)
+        vp8_yv12_copy_frame_ptr(&cm->golden_frame, sd);
+
+    else if (ref_frame_flag == VP8_ALT_FLAG)
+        vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, sd);
+
+    else
+        return -1;
+
+    return 0;
+}
+int vp8_set_reference(VP8_PTR ptr, VP8_REFFRAME ref_frame_flag, YV12_BUFFER_CONFIG *sd)
+{
+    VP8_COMP *cpi = (VP8_COMP *)(ptr);
+    VP8_COMMON *cm = &cpi->common;
+
+    if (ref_frame_flag == VP8_LAST_FLAG)
+        vp8_yv12_copy_frame_ptr(sd, &cm->last_frame);
+
+    else if (ref_frame_flag == VP8_GOLD_FLAG)
+        vp8_yv12_copy_frame_ptr(sd, &cm->golden_frame);
+
+    else if (ref_frame_flag == VP8_ALT_FLAG)
+        vp8_yv12_copy_frame_ptr(sd, &cm->alt_ref_frame);
+
+    else
+        return -1;
+
+    return 0;
+}
+int vp8_update_entropy(VP8_PTR comp, int update)
+{
+    VP8_COMP *cpi = (VP8_COMP *) comp;
+    VP8_COMMON *cm = &cpi->common;
+    cm->refresh_entropy_probs = update;
+
+    return 0;
+}
+
+void vp8_write_yuv_frame(const char *name, YV12_BUFFER_CONFIG *s)
+{
+    FILE *yuv_file = fopen(name, "ab");
+    unsigned char *src = s->y_buffer;
+    int h = s->y_height;
+
+    do
+    {
+        fwrite(src, s->y_width, 1,  yuv_file);
+        src += s->y_stride;
+    }
+    while (--h);
+
+    src = s->u_buffer;
+    h = s->uv_height;
+
+    do
+    {
+        fwrite(src, s->uv_width, 1,  yuv_file);
+        src += s->uv_stride;
+    }
+    while (--h);
+
+    src = s->v_buffer;
+    h = s->uv_height;
+
+    do
+    {
+        fwrite(src, s->uv_width, 1, yuv_file);
+        src += s->uv_stride;
+    }
+    while (--h);
+
+    fclose(yuv_file);
+}
+
+static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    // Are we resizing the image?
+    if (cm->horiz_scale != 0 || cm->vert_scale != 0)
+    {
+#if CONFIG_SPATIAL_RESAMPLING
+        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+        int tmp_height;
+
+        if (cm->vert_scale == 3)
+            tmp_height = 9;
+        else
+            tmp_height = 11;
+
+        Scale2Ratio(cm->horiz_scale, &hr, &hs);
+        Scale2Ratio(cm->vert_scale, &vr, &vs);
+
+        vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer,
+                        tmp_height, hs, hr, vs, vr, 0);
+
+        cpi->Source = &cpi->scaled_source;
+#endif
+    }
+    // We may need to copy to a buffer so we can extend the image...
+    else if (cm->Width != cm->last_frame.y_width ||
+             cm->Height != cm->last_frame.y_height)
+    {
+#if HAVE_ARMV7
+        vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source);
+#else
+        vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source);
+#endif
+
+        cpi->Source = &cpi->scaled_source;
+    }
+
+    vp8_extend_to_multiple_of16(cpi->Source, cm->Width, cm->Height);
+
+}
+static void resize_key_frame(VP8_COMP *cpi)
+{
+#if CONFIG_SPATIAL_RESAMPLING
+    VP8_COMMON *cm = &cpi->common;
+
+    // Do we need to apply resampling for one pass CBR?
+    // In one pass this is more limited than in two pass CBR.
+    // The test, and any change, is made only once per key frame sequence.
+    if (cpi->oxcf.allow_spatial_resampling && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER))
+    {
+        int UNINITIALIZED_IS_SAFE(hr), UNINITIALIZED_IS_SAFE(hs);
+        int UNINITIALIZED_IS_SAFE(vr), UNINITIALIZED_IS_SAFE(vs);
+        int new_width, new_height;
+
+        // If we are below the resample DOWN watermark then scale down a notch.
+        if (cpi->buffer_level < (cpi->oxcf.resample_down_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+        {
+            cm->horiz_scale = (cm->horiz_scale < ONETWO) ? cm->horiz_scale + 1 : ONETWO;
+            cm->vert_scale = (cm->vert_scale < ONETWO) ? cm->vert_scale + 1 : ONETWO;
+        }
+        // Should we now start scaling back up
+        else if (cpi->buffer_level > (cpi->oxcf.resample_up_water_mark * cpi->oxcf.optimal_buffer_level / 100))
+        {
+            cm->horiz_scale = (cm->horiz_scale > NORMAL) ? cm->horiz_scale - 1 : NORMAL;
+            cm->vert_scale = (cm->vert_scale > NORMAL) ? cm->vert_scale - 1 : NORMAL;
+        }
+
+        // Get the new height and width
+        Scale2Ratio(cm->horiz_scale, &hr, &hs);
+        Scale2Ratio(cm->vert_scale, &vr, &vs);
+        new_width = ((hs - 1) + (cpi->oxcf.Width * hr)) / hs;
+        new_height = ((vs - 1) + (cpi->oxcf.Height * vr)) / vs;
+
+        // If the image size has changed we need to reallocate the buffers
+        // and resample the source image
+        if ((cm->Width != new_width) || (cm->Height != new_height))
+        {
+            cm->Width = new_width;
+            cm->Height = new_height;
+            vp8_alloc_compressor_data(cpi);
+            scale_and_extend_source(cpi->un_scaled_source, cpi);
+        }
+    }
+
+#endif
+}
+// return of 0 means drop frame
+static int pick_frame_size(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    // First Frame is a special case
+    if (cm->current_video_frame == 0)
+    {
+#if !(CONFIG_REALTIME_ONLY)
+
+        if (cpi->pass == 2)
+            vp8_calc_auto_iframe_target_size(cpi);
+
+        // In one pass there is no information on which to base the size, so use bandwidth per second * a fixed fraction
+        else
+#endif
+            cpi->this_frame_target = cpi->oxcf.target_bandwidth / 2;
+
+        // In error resilient mode the first frame is bigger, since it likely
+        // contains all of the static background
+        if (cpi->oxcf.error_resilient_mode == 1 || (cpi->compressor_speed == 2))
+        {
+            cpi->this_frame_target *= 3;      // 5;
+        }
+
+        // Key frame from VFW/auto-keyframe/first frame
+        cm->frame_type = KEY_FRAME;
+
+    }
+    // Auto key frames (Only two pass will enter here)
+    else if (cm->frame_type == KEY_FRAME)
+    {
+        vp8_calc_auto_iframe_target_size(cpi);
+    }
+    // Forced key frames (by interval or an external signal)
+    else if ((cm->frame_flags & FRAMEFLAGS_KEY) ||
+             (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+    {
+        // Key frame from VFW/auto-keyframe/first frame
+        cm->frame_type = KEY_FRAME;
+
+        resize_key_frame(cpi);
+
+        // Compute target frame size
+        if (cpi->pass != 2)
+            vp8_calc_iframe_target_size(cpi);
+    }
+    else
+    {
+        // INTER frame: compute target frame size
+        cm->frame_type = INTER_FRAME;
+        vp8_calc_pframe_target_size(cpi);
+
+        // Check if we're dropping the frame:
+        if (cpi->drop_frame)
+        {
+            cpi->drop_frame = FALSE;
+            cpi->drop_count++;
+            return 0;
+        }
+    }
+
+    // Note: target_bits_per_mb is in units of 1/256 bit per MB (frame target * 256 / MBs)
+    cpi->target_bits_per_mb = (cpi->this_frame_target * 256) / cpi->common.MBs;
+
+    return 1;
+}
+static void set_quantizer(VP8_COMP *cpi, int Q)
+{
+    VP8_COMMON *cm = &cpi->common;
+    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+
+    cm->base_qindex = Q;
+
+    cm->y1dc_delta_q = 0;
+    cm->y2dc_delta_q = 0;
+    cm->y2ac_delta_q = 0;
+    cm->uvdc_delta_q = 0;
+    cm->uvac_delta_q = 0;
+
+    // Set segment-specific quantizers
+    mbd->segment_feature_data[MB_LVL_ALT_Q][0] = cpi->segment_feature_data[MB_LVL_ALT_Q][0];
+    mbd->segment_feature_data[MB_LVL_ALT_Q][1] = cpi->segment_feature_data[MB_LVL_ALT_Q][1];
+    mbd->segment_feature_data[MB_LVL_ALT_Q][2] = cpi->segment_feature_data[MB_LVL_ALT_Q][2];
+    mbd->segment_feature_data[MB_LVL_ALT_Q][3] = cpi->segment_feature_data[MB_LVL_ALT_Q][3];
+}
+
+static void update_alt_ref_frame_and_stats(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    // Update the golden frame buffer
+    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+    // Select an interval before next GF or altref
+    if (!cpi->auto_gold)
+        cpi->frames_till_gf_update_due = cpi->goldfreq;
+
+    if ((cpi->pass != 2) && cpi->frames_till_gf_update_due)
+    {
+        cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+        // Set the bits per frame that we should try and recover in subsequent inter frames
+        // to account for the extra GF spend... note that this does not apply for GF updates
+        // that occur coincident with a key frame as the extra cost of key frames is dealt
+        // with elsewhere.
+
+        cpi->gf_overspend_bits += cpi->projected_frame_size;
+        cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+    }
+
+    // Update data structure that monitors level of reference to last GF
+    vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+    cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+    // This frame's refresh means subsequent frames don't refresh unless specified by the user
+
+    cpi->common.frames_since_golden = 0;
+
+    // Clear the alternate reference update pending flag.
+    cpi->source_alt_ref_pending = FALSE;
+
+    // Set the alternate reference frame active flag
+    cpi->source_alt_ref_active = TRUE;
+
+
+}
+static void update_golden_frame_and_stats(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    // Update the Golden frame reconstruction buffer if signalled and the GF usage counts.
+    if (cm->refresh_golden_frame)
+    {
+        // Update the golden frame buffer
+        vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+
+        // Select an interval before next GF
+        if (!cpi->auto_gold)
+            cpi->frames_till_gf_update_due = cpi->goldfreq;
+
+        if ((cpi->pass != 2) && (cpi->frames_till_gf_update_due > 0))
+        {
+            cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+            // Set the bits per frame that we should try and recover in subsequent inter frames
+            // to account for the extra GF spend... note that this does not apply for GF updates
+            // that occur coincident with a key frame as the extra cost of key frames is dealt
+            // with elsewhere.
+            if ((cm->frame_type != KEY_FRAME) && !cpi->source_alt_ref_active)
+            {
+                // Calculate GF bits to be recovered
+                // Projected size - av frame bits available for inter frames for clip as a whole
+                cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->inter_frame_target);
+            }
+
+            cpi->non_gf_bitrate_adjustment = cpi->gf_overspend_bits / cpi->frames_till_gf_update_due;
+
+        }
+
+        // Update data structure that monitors level of reference to last GF
+        vpx_memset(cm->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
+        cm->gf_active_count = cm->mb_rows * cm->mb_cols;
+
+        // This frame's refresh means subsequent frames don't refresh unless specified by the user
+        cm->refresh_golden_frame = 0;
+        cpi->common.frames_since_golden = 0;
+
+        //if ( cm->frame_type == KEY_FRAME )
+        //{
+        cpi->recent_ref_frame_usage[INTRA_FRAME] = 1;
+        cpi->recent_ref_frame_usage[LAST_FRAME] = 1;
+        cpi->recent_ref_frame_usage[GOLDEN_FRAME] = 1;
+        cpi->recent_ref_frame_usage[ALTREF_FRAME] = 1;
+        //}
+        //else
+        //{
+        //  // Carry a portion of the count over to the beginning of the next gf sequence
+        //  cpi->recent_ref_frame_usage[INTRA_FRAME] >>= 5;
+        //  cpi->recent_ref_frame_usage[LAST_FRAME] >>= 5;
+        //  cpi->recent_ref_frame_usage[GOLDEN_FRAME] >>= 5;
+        //  cpi->recent_ref_frame_usage[ALTREF_FRAME] >>= 5;
+        //}
+
+        // ******** Fixed Q test code only ************
+        // If we are going to use the ALT reference for the next group of frames set a flag to say so.
+        if (cpi->oxcf.fixed_q >= 0 &&
+            cpi->oxcf.play_alternate && !cpi->common.refresh_alt_ref_frame)
+        {
+            cpi->source_alt_ref_pending = TRUE;
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+        }
+
+        if (!cpi->source_alt_ref_pending)
+            cpi->source_alt_ref_active = FALSE;
+
+        // Decrement count down till next gf
+        if (cpi->frames_till_gf_update_due > 0)
+            cpi->frames_till_gf_update_due--;
+
+    }
+    else if (!cpi->common.refresh_alt_ref_frame)
+    {
+        // Decrement count down till next gf
+        if (cpi->frames_till_gf_update_due > 0)
+            cpi->frames_till_gf_update_due--;
+
+        if (cpi->common.frames_till_alt_ref_frame)
+            cpi->common.frames_till_alt_ref_frame --;
+
+        cpi->common.frames_since_golden ++;
+
+        if (cpi->common.frames_since_golden > 1)
+        {
+            cpi->recent_ref_frame_usage[INTRA_FRAME] += cpi->count_mb_ref_frame_usage[INTRA_FRAME];
+            cpi->recent_ref_frame_usage[LAST_FRAME] += cpi->count_mb_ref_frame_usage[LAST_FRAME];
+            cpi->recent_ref_frame_usage[GOLDEN_FRAME] += cpi->count_mb_ref_frame_usage[GOLDEN_FRAME];
+            cpi->recent_ref_frame_usage[ALTREF_FRAME] += cpi->count_mb_ref_frame_usage[ALTREF_FRAME];
+        }
+    }
+}
+
+// This function updates the reference frame probability estimates that
+// will be used during mode selection
+static void update_rd_ref_frame_probs(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+#if 0
+    const int *const rfct = cpi->recent_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        cpi->prob_intra_coded = 255;
+        cpi->prob_last_coded  = 128;
+        cpi->prob_gf_coded  = 128;
+    }
+    else if (!(rf_intra + rf_inter))
+    {
+        // This is a trap in case this function is called with cpi->recent_ref_frame_usage[] blank.
+        cpi->prob_intra_coded = 63;
+        cpi->prob_last_coded  = 128;
+        cpi->prob_gf_coded    = 128;
+    }
+    else
+    {
+        cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
+
+        if (cpi->prob_intra_coded < 1)
+            cpi->prob_intra_coded = 1;
+
+        if ((cm->frames_since_golden > 0) || cpi->source_alt_ref_active)
+        {
+            cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+            if (cpi->prob_last_coded < 1)
+                cpi->prob_last_coded = 1;
+
+            cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                                 ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+            if (cpi->prob_gf_coded < 1)
+                cpi->prob_gf_coded = 1;
+        }
+    }
+
+#else
+    const int *const rfct = cpi->count_mb_ref_frame_usage;
+    const int rf_intra = rfct[INTRA_FRAME];
+    const int rf_inter = rfct[LAST_FRAME] + rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME];
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        cpi->prob_intra_coded = 255;
+        cpi->prob_last_coded  = 128;
+        cpi->prob_gf_coded  = 128;
+    }
+    else if (!(rf_intra + rf_inter))
+    {
+        // This is a trap in case this function is called with cpi->count_mb_ref_frame_usage[] blank.
+        cpi->prob_intra_coded = 63;
+        cpi->prob_last_coded  = 128;
+        cpi->prob_gf_coded    = 128;
+    }
+    else
+    {
+        cpi->prob_intra_coded = (rf_intra * 255) / (rf_intra + rf_inter);
+
+        if (cpi->prob_intra_coded < 1)
+            cpi->prob_intra_coded = 1;
+
+        cpi->prob_last_coded = rf_inter ? (rfct[LAST_FRAME] * 255) / rf_inter : 128;
+
+        if (cpi->prob_last_coded < 1)
+            cpi->prob_last_coded = 1;
+
+        cpi->prob_gf_coded = (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME])
+                             ? (rfct[GOLDEN_FRAME] * 255) / (rfct[GOLDEN_FRAME] + rfct[ALTREF_FRAME]) : 128;
+
+        if (cpi->prob_gf_coded < 1)
+            cpi->prob_gf_coded = 1;
+    }
+
+    // update reference frame costs since we can do better than what we got last frame.
+
+    if (cpi->common.refresh_alt_ref_frame)
+    {
+        cpi->prob_intra_coded += 40;
+        cpi->prob_last_coded = 200;
+        cpi->prob_gf_coded = 1;
+    }
+    else if (cpi->common.frames_since_golden == 0)
+    {
+        cpi->prob_last_coded = 214;
+        cpi->prob_gf_coded = 1;
+    }
+    else if (cpi->common.frames_since_golden == 1)
+    {
+        cpi->prob_last_coded = 192;
+        cpi->prob_gf_coded = 220;
+    }
+    else if (cpi->source_alt_ref_active)
+    {
+        //int dist = cpi->common.frames_till_alt_ref_frame + cpi->common.frames_since_golden;
+        cpi->prob_gf_coded -= 20;
+
+        if (cpi->prob_gf_coded < 10)
+            cpi->prob_gf_coded = 10;
+    }
+
+#endif
+}
+
+
+// 1 = key, 0 = inter
+static int decide_key_frame(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    int code_key_frame = FALSE;
+
+    cpi->kf_boost = 0;
+
+    if (cpi->Speed > 11)
+        return FALSE;
+
+    // Clear down mmx registers
+    vp8_clear_system_state();  //__asm emms;
+
+    if ((cpi->compressor_speed == 2) && (cpi->Speed >= 5) && (cpi->sf.RD == 0))
+    {
+        double change = 1.0 * abs((int)(cpi->intra_error - cpi->last_intra_error)) / (1 + cpi->last_intra_error);
+        double change2 = 1.0 * abs((int)(cpi->prediction_error - cpi->last_prediction_error)) / (1 + cpi->last_prediction_error);
+        double minerror = cm->MBs * 256;
+
+#if 0
+
+        if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
+            && cpi->prediction_error > minerror
+            && (change > .25 || change2 > .25))
+        {
+            FILE *f = fopen("intra_inter.stt", "a");
+
+            if (cpi->prediction_error <= 0)
+                cpi->prediction_error = 1;
+
+            fprintf(f, "%d %d %d %d %14.4f\n",
+                    cm->current_video_frame,
+                    (int) cpi->prediction_error,
+                    (int) cpi->intra_error,
+                    (int)((10 * cpi->intra_error) / cpi->prediction_error),
+                    change);
+
+            fclose(f);
+        }
+
+#endif
+
+        cpi->last_intra_error = cpi->intra_error;
+        cpi->last_prediction_error = cpi->prediction_error;
+
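+        // The test below fires when the intra error is under 1.5x the inter
+        // prediction error (10 * intra / (1 + inter) < 15), the inter error is
+        // non-trivial, and either error measure just changed by more than 25%.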
+        if (10 * cpi->intra_error / (1 + cpi->prediction_error) < 15
+            && cpi->prediction_error > minerror
+            && (change > .25 || change2 > .25))
+        {
+            /*(change > 1.4 || change < .75)&& cpi->this_frame_percent_intra > cpi->last_frame_percent_intra + 3*/
+            return TRUE;
+        }
+
+        return FALSE;
+
+    }
+
+    // If the following are true we might as well code a key frame
+    if (((cpi->this_frame_percent_intra == 100) &&
+         (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 2))) ||
+        ((cpi->this_frame_percent_intra > 95) &&
+         (cpi->this_frame_percent_intra >= (cpi->last_frame_percent_intra + 5))))
+    {
+        code_key_frame = TRUE;
+    }
+    // In addition, if the following are true and this is not a golden frame then code a key frame.
+    // Note that on golden frames there often seems to be a pop in intra usage anyway, hence this
+    // restriction is designed to prevent spurious key frames. The intra pop needs to be investigated.
+    else if (((cpi->this_frame_percent_intra > 60) &&
+              (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 2))) ||
+             ((cpi->this_frame_percent_intra > 75) &&
+              (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra * 3 / 2))) ||
+             ((cpi->this_frame_percent_intra > 90) &&
+              (cpi->this_frame_percent_intra > (cpi->last_frame_percent_intra + 10))))
+    {
+        if (!cm->refresh_golden_frame)
+            code_key_frame = TRUE;
+    }
+
+    return code_key_frame;
+
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass1Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+    (void) size;
+    (void) dest;
+    (void) frame_flags;
+    set_quantizer(cpi, 26);
+
+    scale_and_extend_source(cpi->un_scaled_source, cpi);
+    vp8_first_pass(cpi);
+}
+#endif
+
+#if 0
+void write_cx_frame_to_file(YV12_BUFFER_CONFIG *frame, int this_frame)
+{
+
+    // write the frame
+    FILE *yframe;
+    int i;
+    char filename[255];
+
+    sprintf(filename, "cx\\y%04d.raw", this_frame);
+    yframe = fopen(filename, "wb");
+
+    for (i = 0; i < frame->y_height; i++)
+        fwrite(frame->y_buffer + i * frame->y_stride, frame->y_width, 1, yframe);
+
+    fclose(yframe);
+    sprintf(filename, "cx\\u%04d.raw", this_frame);
+    yframe = fopen(filename, "wb");
+
+    for (i = 0; i < frame->uv_height; i++)
+        fwrite(frame->u_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+    fclose(yframe);
+    sprintf(filename, "cx\\v%04d.raw", this_frame);
+    yframe = fopen(filename, "wb");
+
+    for (i = 0; i < frame->uv_height; i++)
+        fwrite(frame->v_buffer + i * frame->uv_stride, frame->uv_width, 1, yframe);
+
+    fclose(yframe);
+}
+#endif
+
+#if VP8_TEMPORAL_ALT_REF
+static void vp8cx_temp_blur1_c
+(
+    unsigned char **frames,
+    int frame_count,
+    unsigned char *src,
+    unsigned char *dst,
+    int width,
+    int stride,
+    int height,
+    int strength,
+    int *fixed_divide,
+    unsigned char *motion_map_ptr,
+    unsigned char block_size
+)
+{
+    int byte = 0;           // Buffer offset for the current pixel value being filtered
+    int frame = 0;
+    int modifier = 0;
+    int i, j, k;
+    int block_ofset;
+    int Cols, Rows;
+    unsigned char Shift = (block_size == 16) ? 4 : 3;
+
+    Cols = width / block_size;
+    Rows = height / block_size;
+
+    for (i = 0; i < height; i++)
+    {
+        block_ofset = (i >> Shift) * Cols;
+
+        for (j = 0; j < Cols; j ++)
+        {
+            if (motion_map_ptr[block_ofset] > 2)
+            {
+                vpx_memcpy(&dst[byte], &src[byte], block_size);
+                byte += block_size;
+            }
+            else
+            {
+                for (k = 0; k < block_size; k++)
+                {
+                    int accumulator = 0;
+                    int count = 0;
+                    int src_byte = src[byte];
+
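+                    // Each frame contributes a weight in [0,16] that falls off
+                    // with the squared pixel difference: e.g. (illustrative)
+                    // |diff| = 8, strength = 6 gives
+                    // 16 - min(16, 3 * ((8 * 8) >> 6)) = 16 - 3 = 13.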
+                    for (frame = 0; frame < frame_count; frame++)
+                    {
+                        // get current frame pixel value
+                        int pixel_value = frames[frame][byte];
+
+                        modifier   = src_byte;
+                        modifier  -= pixel_value;
+                        modifier  *= modifier;
+                        modifier >>= strength;
+                        modifier  *= 3;
+
+                        if (modifier > 16)
+                            modifier = 16;
+
+                        modifier = 16 - modifier;
+
+                        accumulator += modifier * pixel_value;
+
+                        count += modifier;
+                    }
+
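+                    // Integer divide by count via the 16.16 reciprocal table:
+                    // adding (count >> 1) rounds to nearest, then
+                    // accumulator * (0x10000 / count) >> 16 ~= accumulator / count.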
+                    accumulator += (count >> 1);
+                    accumulator *= fixed_divide[count];
+                    accumulator >>= 16;
+
+                    dst[byte] = accumulator;
+
+                    // move to next pixel
+                    byte++;
+                }
+            }
+
+            block_ofset++;
+        }
+
+        // Step byte on over the UMV border to the start of the next line
+        byte += stride - width;
+    }
+}
+
+static void vp8cx_temp_filter_c
+(
+    VP8_COMP *cpi
+)
+{
+    YV12_BUFFER_CONFIG *temp_source_buffer;
+    int *fixed_divide = cpi->fixed_divide;
+
+    int frame = 0;
+    int max_frames = 11;
+
+    int num_frames_backward = 0;
+    int num_frames_forward = 0;
+    int frames_to_blur_backward = 0;
+    int frames_to_blur_forward = 0;
+    int frames_to_blur = 0;
+    int start_frame = 0;
+
+    int strength = cpi->oxcf.arnr_strength;
+
+    int blur_type = cpi->oxcf.arnr_type;
+
+    int new_max_frames = cpi->oxcf.arnr_max_frames;
+
+    if (new_max_frames > 0)
+        max_frames = new_max_frames;
+
+    num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index;
+
+    if (num_frames_backward < 0)
+        num_frames_backward += cpi->oxcf.lag_in_frames;
+
+    num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1);
+
+    switch (blur_type)
+    {
+    case 1:
+        /////////////////////////////////////////
+        // Backward Blur
+
+        frames_to_blur_backward = num_frames_backward;
+
+        if (frames_to_blur_backward >= max_frames)
+            frames_to_blur_backward = max_frames - 1;
+
+        frames_to_blur = frames_to_blur_backward + 1;
+        break;
+
+    case 2:
+        /////////////////////////////////////////
+        // Forward Blur
+
+        frames_to_blur_forward = num_frames_forward;
+
+        if (frames_to_blur_forward >= max_frames)
+            frames_to_blur_forward = max_frames - 1;
+
+        frames_to_blur = frames_to_blur_forward + 1;
+        break;
+
+    case 3:
+        /////////////////////////////////////////
+        // Center Blur
+        frames_to_blur_forward = num_frames_forward;
+        frames_to_blur_backward = num_frames_backward;
+
+        if (frames_to_blur_forward > frames_to_blur_backward)
+            frames_to_blur_forward = frames_to_blur_backward;
+
+        if (frames_to_blur_backward > frames_to_blur_forward)
+            frames_to_blur_backward = frames_to_blur_forward;
+
+        if (frames_to_blur_forward > (max_frames / 2))
+            frames_to_blur_forward = (max_frames / 2);
+
+        if (frames_to_blur_backward > (max_frames / 2))
+            frames_to_blur_backward = (max_frames / 2);
+
+        frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+        break;
+
+    default:
+        /////////////////////////////////////////
+        // At most 4 frames forward Blur
+        frames_to_blur_forward = 4;
+        frames_to_blur_backward = num_frames_backward;
+
+        if (max_frames > 5)
+        {
+            if ((frames_to_blur_backward + frames_to_blur_forward) >= max_frames)
+            {
+                frames_to_blur_backward = max_frames - frames_to_blur_forward - 1;
+            }
+        }
+        else
+        {
+            frames_to_blur_forward = max_frames - 1;
+            frames_to_blur_backward = 0;
+        }
+
+        frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1;
+        break;
+    }
+
+    start_frame = (cpi->last_alt_ref_sei + frames_to_blur_forward) % cpi->oxcf.lag_in_frames;
+
+#ifdef DEBUGFWG
+    // DEBUG FWG
+    printf("max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d start:%d"
+           , max_frames
+           , num_frames_backward
+           , num_frames_forward
+           , frames_to_blur
+           , frames_to_blur_backward
+           , frames_to_blur_forward
+           , cpi->source_encode_index
+           , cpi->last_alt_ref_sei
+           , start_frame);
+#endif
+
+    for (frame = 0; frame < frames_to_blur; frame++)
+    {
+        int which_buffer =  start_frame - frame;
+
+        if (which_buffer < 0)
+            which_buffer += cpi->oxcf.lag_in_frames;
+
+        cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.y_buffer;
+    }
+
+    temp_source_buffer = &cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer;
+
+    // Blur Y
+    vp8cx_temp_blur1_c(
+        cpi->frames,
+        frames_to_blur,
+        temp_source_buffer->y_buffer,  // cpi->Source->y_buffer,
+        cpi->alt_ref_buffer.source_buffer.y_buffer,  // cpi->Source->y_buffer,
+        temp_source_buffer->y_width,
+        temp_source_buffer->y_stride,
+        temp_source_buffer->y_height,
+        //temp_source_buffer->y_height * temp_source_buffer->y_stride,
+        strength,
+        fixed_divide,
+        cpi->fp_motion_map, 16);
+
+    for (frame = 0; frame < frames_to_blur; frame++)
+    {
+        int which_buffer =  cpi->last_alt_ref_sei - frame;
+
+        if (which_buffer < 0)
+            which_buffer += cpi->oxcf.lag_in_frames;
+
+        cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.u_buffer;
+    }
+
+    // Blur U
+    vp8cx_temp_blur1_c(
+        cpi->frames,
+        frames_to_blur,
+        temp_source_buffer->u_buffer,
+        cpi->alt_ref_buffer.source_buffer.u_buffer,  // cpi->Source->u_buffer,
+        temp_source_buffer->uv_width,
+        temp_source_buffer->uv_stride,
+        temp_source_buffer->uv_height,
+        //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
+        strength,
+        fixed_divide,
+        cpi->fp_motion_map, 8);
+
+    for (frame = 0; frame < frames_to_blur; frame++)
+    {
+        int which_buffer =  cpi->last_alt_ref_sei - frame;
+
+        if (which_buffer < 0)
+            which_buffer += cpi->oxcf.lag_in_frames;
+
+        cpi->frames[frame] = cpi->src_buffer[which_buffer].source_buffer.v_buffer;
+    }
+
+    // Blur V
+    vp8cx_temp_blur1_c(
+        cpi->frames,
+        frames_to_blur,
+        temp_source_buffer->v_buffer,
+        cpi->alt_ref_buffer.source_buffer.v_buffer,  // cpi->Source->v_buffer,
+        temp_source_buffer->uv_width,
+        temp_source_buffer->uv_stride,
+        temp_source_buffer->uv_height,
+        //temp_source_buffer->uv_height * temp_source_buffer->uv_stride,
+        strength,
+        fixed_divide,
+        cpi->fp_motion_map, 8);
+}
+#endif
+
+
+static void encode_frame_to_data_rate(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+    int Q;
+    int frame_over_shoot_limit;
+    int frame_under_shoot_limit;
+
+    int Loop = FALSE;
+    int loop_count;
+    int this_q;
+    int last_zbin_oq;
+
+    int q_low;
+    int q_high;
+    int zbin_oq_high;
+    int zbin_oq_low = 0;
+    int top_index;
+    int bottom_index;
+    VP8_COMMON *cm = &cpi->common;
+    int active_worst_qchanged = FALSE;
+
+    int overshoot_seen = FALSE;
+    int undershoot_seen = FALSE;
+    int drop_mark = cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100;
+    int drop_mark75 = drop_mark * 2 / 3;
+    int drop_mark50 = drop_mark / 4;
+    int drop_mark25 = drop_mark / 8;
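+    // Note: despite the names, drop_mark75/50/25 are fixed fractions (2/3,
+    // 1/4 and 1/8) of drop_mark, which is itself a percentage of the optimal
+    // buffer level. E.g. (hypothetical numbers): optimal_buffer_level =
+    // 6000000 and drop_frames_water_mark = 70 give drop_mark = 4200000,
+    // drop_mark75 = 2800000, drop_mark50 = 1050000 and drop_mark25 = 525000.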
+
+    // Clear down mmx registers to allow floating point in what follows
+    vp8_clear_system_state();
+
+    // Test code for segmentation of gf/arf (0,0)
+    //segmentation_test_function((VP8_PTR) cpi);
+
+    // For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth
+#if !(CONFIG_REALTIME_ONLY)
+
+    if (cpi->pass == 2)
+    {
+        if (cpi->common.refresh_alt_ref_frame)
+        {
+            cpi->per_frame_bandwidth = cpi->gf_bits;                           // Per frame bit target for the alt ref frame
+            cpi->target_bandwidth = cpi->gf_bits * cpi->output_frame_rate;      // per second target bitrate
+        }
+    }
+    else
+#endif
+        cpi->per_frame_bandwidth  = (int)(cpi->target_bandwidth / cpi->output_frame_rate);
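+    // E.g. (hypothetical numbers): target_bandwidth = 800000 bits/s at an
+    // output_frame_rate of 25 fps gives a per_frame_bandwidth of 32000 bits.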
+
+    // By default, turn off buffer-to-buffer copying
+    cm->copy_buffer_to_gf = 0;
+    cm->copy_buffer_to_arf = 0;
+
+    // Clear zbin over-quant value and mode boost values.
+    cpi->zbin_over_quant = 0;
+    cpi->zbin_mode_boost = 0;
+
+    // Enable mode based tweaking of the zbin
+    cpi->zbin_mode_boost_enabled = TRUE;
+
+    // Current default encoder behaviour for the altref sign bias
+    if (cpi->source_alt_ref_active)
+        cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 1;
+    else
+        cpi->common.ref_frame_sign_bias[ALTREF_FRAME] = 0;
+
+    // Check to see if a key frame is signalled
+    // For two pass with auto key frame enabled cm->frame_type may already be set, but not for one pass.
+    if ((cm->current_video_frame == 0) ||
+        (cm->frame_flags & FRAMEFLAGS_KEY) ||
+        (cpi->oxcf.auto_key && (cpi->frames_since_key % cpi->key_frame_frequency == 0)))
+    {
+        // Key frame from VFW/auto-keyframe/first frame
+        cm->frame_type = KEY_FRAME;
+    }
+
+    // Set default state for segment and mode based loop filter update flags
+    cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+    // Set various flags etc to special state if it is a key frame
+    if (cm->frame_type == KEY_FRAME)
+    {
+        int i;
+
+        // If segmentation is enabled force a map update for key frames
+        if (cpi->mb.e_mbd.segmentation_enabled)
+        {
+            cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+            cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+        }
+
+        // If mode or reference frame based loop filter deltas are enabled then force an update for key frames.
+        if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled)
+        {
+            cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+        }
+
+        // The alternate reference frame cannot be active for a key frame
+        cpi->source_alt_ref_active = FALSE;
+
+        // Reset the RD threshold multipliers to default of * 1 (128)
+        for (i = 0; i < MAX_MODES; i++)
+        {
+            cpi->rd_thresh_mult[i] = 128;
+        }
+    }
+
+    // Test code for segmentation
+    //if ( (cm->frame_type == KEY_FRAME) || ((cm->current_video_frame % 2) == 0))
+    //if ( (cm->current_video_frame % 2) == 0 )
+    //  enable_segmentation((VP8_PTR)cpi);
+    //else
+    //  disable_segmentation((VP8_PTR)cpi);
+
+#if 0
+    // Experimental code for lagged compress and one pass
+    // Initialise one_pass GF frames stats
+    // Update stats used for GF selection
+    //if ( cpi->pass == 0 )
+    {
+        cpi->one_pass_frame_index = cm->current_video_frame % MAX_LAG_BUFFERS;
+
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frames_so_far = 0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_intra_error = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_coded_error = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_inter = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_pcnt_motion = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvr_abs = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc = 0.0;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index ].frame_mvc_abs = 0.0;
+    }
+#endif
+
+    update_rd_ref_frame_probs(cpi);
+
+    if (cpi->drop_frames_allowed)
+    {
+        // The reset to decimation 0 is only done here for one pass.
+        // Once it is set, two pass leaves decimation on until the next KF.
+        if ((cpi->buffer_level > drop_mark) && (cpi->decimation_factor > 0))
+            cpi->decimation_factor --;
+
+        if (cpi->buffer_level > drop_mark75 && cpi->decimation_factor > 0)
+            cpi->decimation_factor = 1;
+
+        else if (cpi->buffer_level < drop_mark25 && (cpi->decimation_factor == 2 || cpi->decimation_factor == 3))
+        {
+            cpi->decimation_factor = 3;
+        }
+        else if (cpi->buffer_level < drop_mark50 && (cpi->decimation_factor == 1 || cpi->decimation_factor == 2))
+        {
+            cpi->decimation_factor = 2;
+        }
+        else if (cpi->buffer_level < drop_mark75 && (cpi->decimation_factor == 0 || cpi->decimation_factor == 1))
+        {
+            cpi->decimation_factor = 1;
+        }
+
+        //vpx_log("Encoder: Decimation Factor: %d \n",cpi->decimation_factor);
+    }
+
+    // The following decimates the frame rate according to a regular pattern (i.e. to 1/2 or 2/3 frame rate).
+    // This can be used to help prevent buffer under-run in CBR mode. Alternatively it might be desirable in
+    // some situations to drop frame rate but throw more bits at each frame.
+    //
+    // Note that dropping a key frame can be problematic if spatial resampling is also active
+    if (cpi->decimation_factor > 0)
+    {
+        switch (cpi->decimation_factor)
+        {
+        case 1:
+            cpi->per_frame_bandwidth  = cpi->per_frame_bandwidth * 3 / 2;
+            break;
+        case 2:
+            cpi->per_frame_bandwidth  = cpi->per_frame_bandwidth * 5 / 4;
+            break;
+        case 3:
+            cpi->per_frame_bandwidth  = cpi->per_frame_bandwidth * 5 / 4;
+            break;
+        }
+
+        // Note that we should not throw out a key frame (especially when spatial resampling is enabled).
+        if ((cm->frame_type == KEY_FRAME)) // && cpi->oxcf.allow_spatial_resampling )
+        {
+            cpi->decimation_count = cpi->decimation_factor;
+        }
+        else if (cpi->decimation_count > 0)
+        {
+            cpi->decimation_count --;
+            cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+            cm->current_video_frame++;
+            cpi->frames_since_key++;
+
+#if CONFIG_PSNR
+            cpi->count ++;
+#endif
+
+            cpi->buffer_level = cpi->bits_off_target;
+
+            return;
+        }
+        else
+            cpi->decimation_count = cpi->decimation_factor;
+    }
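+    // Net effect: decimation_factor 1 codes every 2nd frame (target scaled
+    // by 3/2), while factors 2 and 3 code every 3rd or 4th frame (target
+    // scaled by 5/4); each dropped frame still credits av_per_frame_bandwidth
+    // to bits_off_target, so the unspent bits accrue to the buffer.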
+
+    // Decide how big to make the frame
+    if (!pick_frame_size(cpi))
+    {
+        cm->current_video_frame++;
+        cpi->frames_since_key++;
+        return;
+    }
+
+    // Reduce active_worst_allowed_q for CBR if our buffer is getting too full.
+    // This has a knock-on effect on active best quality as well.
+    // For CBR if the buffer reaches its maximum level then we can no longer
+    // save up bits for later frames so we might as well use them up
+    // on the current frame.
+    if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+        (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
+    {
+        int Adjustment = cpi->active_worst_quality / 4;       // Max adjustment is 1/4
+
+        if (Adjustment)
+        {
+            int buff_lvl_step;
+            int tmp_lvl = cpi->buffer_level;
+
+            if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size)
+            {
+                buff_lvl_step = (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level) / Adjustment;
+
+                if (buff_lvl_step)
+                {
+                    Adjustment = (cpi->buffer_level - cpi->oxcf.optimal_buffer_level) / buff_lvl_step;
+                    cpi->active_worst_quality -= Adjustment;
+                }
+            }
+            else
+            {
+                cpi->active_worst_quality -= Adjustment;
+            }
+        }
+    }
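+    // Worked example (hypothetical numbers): active_worst_quality = 40 gives
+    // a maximum Adjustment of 10. With optimal_buffer_level = 6000000 and
+    // maximum_buffer_size = 12000000, buff_lvl_step = 600000, so a
+    // buffer_level of 9000000 lowers active_worst_quality by
+    // (9000000 - 6000000) / 600000 = 5 steps.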
+
+    // Set an active best quality and if necessary active worst quality
+    if (cpi->pass == 2 || (cm->current_video_frame > 150))
+    {
+        //if ( (cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame  )
+        int Q;
+        int i;
+        int bpm_target;
+
+        Q = cpi->active_worst_quality;
+
+        if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+        {
+            vp8_clear_system_state();
+
+            if (cm->frame_type != KEY_FRAME)
+            {
+                // Where a gf overlays an existing arf then allow active max Q to drift to highest allowed value.
+                //if ( cpi->common.refresh_golden_frame && cpi->source_alt_ref_active )
+                //cpi->active_worst_quality = cpi->worst_quality;
+
+                if (cpi->avg_frame_qindex < cpi->active_worst_quality)
+                    Q = cpi->avg_frame_qindex;
+
+                if (cpi->section_is_low_motion)
+                    bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 3 / 2) + 128)) / 64;
+                else if (cpi->section_is_fast_motion)
+                    bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 128)) / 64;
+                else
+                    bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * ((Q * 5 / 4) + 128)) / 64;
+            }
+            // KEY FRAMES
+            else
+            {
+                if (cpi->section_is_low_motion)
+                    bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 240)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127
+                else
+                    bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64;
+            }
+
+            for (i = Q; i > 0; i--)
+            {
+                if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
+                    break;
+            }
+
+            cpi->active_best_quality = i;
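+            // vp8_bits_per_mb[][] decreases as Q rises, so the scan above
+            // moves from Q towards higher quality and stops at the first
+            // (lowest quality) index whose expected bits per MB meets the
+            // boosted bpm_target; the same search is used for inter frames
+            // below.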
+
+            // this entire section could be replaced by a look up table
+#if 0
+            {
+                int Q, best_q[128];
+
+                for (Q = 0; Q < 128; Q++)
+                {
+                    bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 160)) / 64; // Approx 2.5 to 4.5 where Q has the range 0-127
+
+                    for (i = Q; i > 0; i--)
+                    {
+                        if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
+                            break;
+                    }
+
+                    best_q[Q] = i;
+                }
+
+                Q += 0;
+            }
+#endif
+
+        }
+        else
+        {
+            vp8_clear_system_state();
+
+            //bpm_target = (vp8_bits_per_mb[cm->frame_type][Q]*(Q+128))/64; // Approx 2 to 4 where Q has the range 0-127
+            bpm_target = (vp8_bits_per_mb[cm->frame_type][Q] * (Q + 192)) / 128; // Approx * 1.5 to 2.5 where Q has range 0-127
+
+            for (i = Q; i > 0; i--)
+            {
+                if (bpm_target <= vp8_bits_per_mb[cm->frame_type][i])
+                    break;
+            }
+
+            cpi->active_best_quality = i;
+        }
+
+    // If CBR and the buffer is full, it is reasonable to allow higher quality on these
+    // frames to prevent bits simply going to waste.
+        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+        {
+            // Note that the use of >= here eliminates the risk of a divide by 0 error in the else if clause
+            if (cpi->buffer_level >= cpi->oxcf.maximum_buffer_size)
+                cpi->active_best_quality = cpi->best_quality;
+
+            else if (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)
+            {
+                int Fraction = ((cpi->buffer_level - cpi->oxcf.optimal_buffer_level) * 128) / (cpi->oxcf.maximum_buffer_size - cpi->oxcf.optimal_buffer_level);
+                int min_qadjustment = ((cpi->active_best_quality - cpi->best_quality) * Fraction) / 128;
+
+                cpi->active_best_quality -= min_qadjustment;
+            }
+
+        }
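+        // E.g. (hypothetical numbers): optimal_buffer_level = 6000000,
+        // maximum_buffer_size = 12000000 and buffer_level = 9000000 give
+        // Fraction = 64/128, i.e. active_best_quality moves half the
+        // remaining distance towards best_quality.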
+    }
+
+    // Clip the active best and worst quality values to limits
+    if (cpi->active_worst_quality > cpi->worst_quality)
+        cpi->active_worst_quality = cpi->worst_quality;
+
+    if (cpi->active_best_quality < cpi->best_quality)
+        cpi->active_best_quality = cpi->best_quality;
+    else if (cpi->active_best_quality > cpi->active_worst_quality)
+        cpi->active_best_quality = cpi->active_worst_quality;
+
+    // Determine initial Q to try
+    Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+    last_zbin_oq = cpi->zbin_over_quant;
+
+    // Set highest allowed value for Zbin over quant
+    if (cm->frame_type == KEY_FRAME)
+        zbin_oq_high = 0; //ZBIN_OQ_MAX/16
+    else if (cm->refresh_alt_ref_frame || (cm->refresh_golden_frame && !cpi->source_alt_ref_active))
+        zbin_oq_high = 16;
+    else
+        zbin_oq_high = ZBIN_OQ_MAX;
+
+    // Setup background Q adjustment for error resilient mode
+    if (cpi->cyclic_refresh_mode_enabled)
+        cyclic_background_refresh(cpi, Q, 0);
+
+    vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+    // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
+    bottom_index = cpi->active_best_quality;
+    top_index    = cpi->active_worst_quality;
+
+    vp8_save_coding_context(cpi);
+
+    loop_count = 0;
+
+    q_low  = cpi->best_quality;
+    q_high = cpi->worst_quality;
+
+
+    scale_and_extend_source(cpi->un_scaled_source, cpi);
+#if !(CONFIG_REALTIME_ONLY) && CONFIG_POSTPROC
+
+    if (cpi->oxcf.noise_sensitivity > 0)
+    {
+        unsigned char *src;
+        int l = 0;
+
+        switch (cpi->oxcf.noise_sensitivity)
+        {
+        case 1:
+            l = 20;
+            break;
+        case 2:
+            l = 40;
+            break;
+        case 3:
+            l = 60;
+            break;
+        case 4:
+            l = 80;
+            break;
+        case 5:
+            l = 100;
+            break;
+        case 6:
+            l = 150;
+            break;
+        }
+
+
+        if (cm->frame_type == KEY_FRAME)
+        {
+            vp8_de_noise(cpi->Source, cpi->Source, l , 1,  0, RTCD(postproc));
+            cpi->ppi.frame = 0;
+        }
+        else
+        {
+            vp8_de_noise(cpi->Source, cpi->Source, l , 1,  0, RTCD(postproc));
+
+            src = cpi->Source->y_buffer;
+
+            if (cpi->Source->y_stride < 0)
+            {
+                src += cpi->Source->y_stride * (cpi->Source->y_height - 1);
+            }
+
+            //temp_filter(&cpi->ppi,src,src,
+            //  cm->last_frame.y_width * cm->last_frame.y_height,
+            //  cpi->oxcf.noise_sensitivity);
+        }
+    }
+
+#endif
+
+#ifdef OUTPUT_YUV_SRC
+    vp8_write_yuv_frame(cpi->Source);
+#endif
+
+    do
+    {
+        vp8_clear_system_state();  //__asm emms;
+
+        /*
+        if(cpi->is_src_frame_alt_ref)
+            Q = 127;
+            */
+
+        set_quantizer(cpi, Q);
+        this_q = Q;
+
+        // setup skip prob for costing in mode/mv decision
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            cpi->prob_skip_false = cpi->base_skip_false_prob[Q];
+
+            if (cm->frame_type != KEY_FRAME)
+            {
+                if (cpi->common.refresh_alt_ref_frame)
+                {
+                    if (cpi->last_skip_false_probs[2] != 0)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+
+                    /*
+                    if (cpi->last_skip_false_probs[2] != 0 && abs(Q - cpi->last_skip_probs_q[2]) <= 16)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[2];
+                    else if (cpi->last_skip_false_probs[2] != 0)
+                        cpi->prob_skip_false = (cpi->last_skip_false_probs[2] + cpi->prob_skip_false) / 2;
+                    */
+                }
+                else if (cpi->common.refresh_golden_frame)
+                {
+                    if (cpi->last_skip_false_probs[1] != 0)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+
+                    /*
+                    if (cpi->last_skip_false_probs[1] != 0 && abs(Q - cpi->last_skip_probs_q[1]) <= 16)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[1];
+                    else if (cpi->last_skip_false_probs[1] != 0)
+                        cpi->prob_skip_false = (cpi->last_skip_false_probs[1] + cpi->prob_skip_false) / 2;
+                    */
+                }
+                else
+                {
+                    if (cpi->last_skip_false_probs[0] != 0)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+
+                    /*
+                    if (cpi->last_skip_false_probs[0] != 0 && abs(Q - cpi->last_skip_probs_q[0]) <= 16)
+                        cpi->prob_skip_false = cpi->last_skip_false_probs[0];
+                    else if (cpi->last_skip_false_probs[0] != 0)
+                        cpi->prob_skip_false = (cpi->last_skip_false_probs[0] + cpi->prob_skip_false) / 2;
+                    */
+                }
+
+                // As this is for a cost estimate, make sure it does not go to extremes either way
+                if (cpi->prob_skip_false < 5)
+                    cpi->prob_skip_false = 5;
+
+                if (cpi->prob_skip_false > 250)
+                    cpi->prob_skip_false = 250;
+
+                if (cpi->is_src_frame_alt_ref)
+                    cpi->prob_skip_false = 1;
+
+
+            }
+
+#if 0
+
+            if (cpi->pass != 1)
+            {
+                FILE *f = fopen("skip.stt", "a");
+                fprintf(f, "%d, %d, %4d ", cpi->common.refresh_golden_frame, cpi->common.refresh_alt_ref_frame, cpi->prob_skip_false);
+                fclose(f);
+            }
+
+#endif
+
+        }
+
+        if (cm->frame_type == KEY_FRAME)
+            vp8_setup_key_frame(cpi);
+
+        // transform / motion compensation build reconstruction frame
+
+        vp8_encode_frame(cpi);
+        cpi->projected_frame_size -= vp8_estimate_entropy_savings(cpi);
+        cpi->projected_frame_size = (cpi->projected_frame_size > 0) ? cpi->projected_frame_size : 0;
+
+        vp8_clear_system_state();  //__asm emms;
+
+        // Test to see if the stats generated for this frame indicate that we should have coded a key frame
+        // (assuming that we didn't)!
+        if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
+        {
+            if (decide_key_frame(cpi))
+            {
+                vp8_calc_auto_iframe_target_size(cpi);
+
+                // Reset all our sizing numbers and recode
+                cm->frame_type = KEY_FRAME;
+
+                // Clear the Alt reference frame active flag when we have a key frame
+                cpi->source_alt_ref_active = FALSE;
+
+                // If segmentation is enabled force a map update for key frames
+                if (cpi->mb.e_mbd.segmentation_enabled)
+                {
+                    cpi->mb.e_mbd.update_mb_segmentation_map = 1;
+                    cpi->mb.e_mbd.update_mb_segmentation_data = 1;
+                }
+
+                // If mode or reference frame based loop filter deltas are enabled then force an update for key frames.
+                if (cpi->mb.e_mbd.mode_ref_lf_delta_enabled)
+                {
+                    cpi->mb.e_mbd.mode_ref_lf_delta_update = 1;
+                }
+
+                vp8_restore_coding_context(cpi);
+
+                Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+                q_low  = cpi->best_quality;
+                q_high = cpi->worst_quality;
+
+                vp8_compute_frame_size_bounds(cpi, &frame_under_shoot_limit, &frame_over_shoot_limit);
+
+                // Limit Q range for the adaptive loop (Values not clipped to range 20-60 as in VP8).
+                bottom_index = cpi->active_best_quality;
+                top_index    = cpi->active_worst_quality;
+
+
+                loop_count++;
+                Loop = TRUE;
+
+                resize_key_frame(cpi);
+                continue;
+            }
+        }
+
+        vp8_clear_system_state();
+
+        if (frame_over_shoot_limit == 0)
+            frame_over_shoot_limit = 1;
+
+        // Are we overshooting and up against the limit of active max Q?
+        if (((cpi->pass != 2) || (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
+            (Q == cpi->active_worst_quality)                     &&
+            (cpi->active_worst_quality < cpi->worst_quality)      &&
+            (cpi->projected_frame_size > frame_over_shoot_limit))
+        {
+            int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
+
+            // If so, is there any scope for relaxing it?
+            while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
+            {
+                cpi->active_worst_quality++;
+                top_index = cpi->active_worst_quality;
+                over_size_percent = (int)(over_size_percent * 0.96);        // Assume 1 qstep = about 4% on frame size.
+            }
+
+            // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop.
+            active_worst_qchanged = TRUE;
+        }
+        else
+            active_worst_qchanged = FALSE;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+        // Is the projected frame size out of range, and are we allowed to attempt a recode?
+        if (((cpi->sf.recode_loop == 1) ||
+             ((cpi->sf.recode_loop == 2) && (cm->refresh_golden_frame || (cm->frame_type == KEY_FRAME)))) &&
+            (((cpi->projected_frame_size > frame_over_shoot_limit) && (Q < top_index)) ||
+             //((cpi->projected_frame_size > frame_over_shoot_limit ) && (Q == top_index) && (cpi->zbin_over_quant < ZBIN_OQ_MAX)) ||
+             ((cpi->projected_frame_size < frame_under_shoot_limit) && (Q > bottom_index)))
+           )
+        {
+            int last_q = Q;
+            int Retries = 0;
+
+            // Frame size out of permitted range:
+            // Update correction factor & compute new Q to try...
+            if (cpi->projected_frame_size > frame_over_shoot_limit)
+            {
+                //if ( cpi->zbin_over_quant == 0 )
+                q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise q_low to at least the current value
+
+                if (cpi->zbin_over_quant > 0)           // If we are using over quant do the same for zbin_oq_low
+                    zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+
+                //if ( undershoot_seen || (Q == MAXQ) )
+                if (undershoot_seen)
+                {
+                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 1);
+
+                    Q = (q_high + q_low + 1) / 2;
+
+                    // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+                    if (Q < MAXQ)
+                        cpi->zbin_over_quant = 0;
+                    else
+                    {
+                        zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
+                        cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+                    }
+                }
+                else
+                {
+                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 0);
+
+                    Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+                    while (((Q < q_low) || (cpi->zbin_over_quant < zbin_oq_low)) && (Retries < 10))
+                    {
+                        vp8_update_rate_correction_factors(cpi, 0);
+                        Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+                        Retries ++;
+                    }
+                }
+
+                overshoot_seen = TRUE;
+            }
+            else
+            {
+                if (cpi->zbin_over_quant == 0)
+                    q_high = (Q > q_low) ? (Q - 1) : q_low; // Lower q_high if not using over quant
+                else                                    // else lower zbin_oq_high
+                    zbin_oq_high = (cpi->zbin_over_quant > zbin_oq_low) ? (cpi->zbin_over_quant - 1) : zbin_oq_low;
+
+                if (overshoot_seen)
+                {
+                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 1);
+
+                    Q = (q_high + q_low) / 2;
+
+                    // Adjust cpi->zbin_over_quant (only allowed when Q is max)
+                    if (Q < MAXQ)
+                        cpi->zbin_over_quant = 0;
+                    else
+                        cpi->zbin_over_quant = (zbin_oq_high + zbin_oq_low) / 2;
+                }
+                else
+                {
+                    // Update rate_correction_factor unless cpi->active_worst_quality has changed.
+                    if (!active_worst_qchanged)
+                        vp8_update_rate_correction_factors(cpi, 0);
+
+                    Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+
+                    while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
+                    {
+                        vp8_update_rate_correction_factors(cpi, 0);
+                        Q = vp8_regulate_q(cpi, cpi->this_frame_target);
+                        Retries ++;
+                    }
+                }
+
+                undershoot_seen = TRUE;
+            }
+
+            // Clamp Q to upper and lower limits:
+            if (Q > q_high)
+                Q = q_high;
+            else if (Q < q_low)
+                Q = q_low;
+
+            // Clamp cpi->zbin_over_quant
+            cpi->zbin_over_quant = (cpi->zbin_over_quant < zbin_oq_low) ? zbin_oq_low : (cpi->zbin_over_quant > zbin_oq_high) ? zbin_oq_high : cpi->zbin_over_quant;
+
+            //Loop = ((Q != last_q) || (last_zbin_oq != cpi->zbin_over_quant)) ? TRUE : FALSE;
+            Loop = (Q != last_q) ? TRUE : FALSE;
+            last_zbin_oq = cpi->zbin_over_quant;
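+            // Taken together this is a bisection search: q_low/q_high (and,
+            // once Q is pegged at MAXQ, zbin_oq_low/zbin_oq_high) close in
+            // from whichever side the projected size missed, and the frame
+            // is re-encoded until Q stops changing.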
+        }
+        else
+#endif
+            Loop = FALSE;
+
+        if (cpi->is_src_frame_alt_ref)
+            Loop = FALSE;
+
+        if (Loop == TRUE)
+        {
+            vp8_restore_coding_context(cpi);
+            loop_count++;
+#if CONFIG_PSNR
+            cpi->tot_recode_hits++;
+#endif
+        }
+    }
+    while (Loop == TRUE);
+
+#if 0
+    // Experimental code for lagged and one pass
+    // Update stats used for one pass GF selection
+    {
+        /*
+            int frames_so_far;
+            double frame_intra_error;
+            double frame_coded_error;
+            double frame_pcnt_inter;
+            double frame_pcnt_motion;
+            double frame_mvr;
+            double frame_mvr_abs;
+            double frame_mvc;
+            double frame_mvc_abs;
+        */
+
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_coded_error = (double)cpi->prediction_error;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_intra_error = (double)cpi->intra_error;
+        cpi->one_pass_frame_stats[cpi->one_pass_frame_index].frame_pcnt_inter = (double)(100 - cpi->this_frame_percent_intra) / 100.0;
+    }
+#endif
+
+    // Update the GF usage maps.
+    // This is done after completing the compression of a frame when all modes etc. are finalized but before the loop filter
+    vp8_update_gf_useage_maps(cm, &cpi->mb.e_mbd);
+
+    if (cm->frame_type == KEY_FRAME)
+        cm->refresh_last_frame = 1;
+
+    if (0)
+    {
+        FILE *f = fopen("gfactive.stt", "a");
+        fprintf(f, "%8d %8d %8d %8d %8d\n", cm->current_video_frame, (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols), cpi->this_iiratio, cpi->next_iiratio, cm->refresh_golden_frame);
+        fclose(f);
+    }
+
+    // For inter frames the current default behaviour is that when cm->refresh_golden_frame is set we copy the old GF over to the ARF buffer
+    // This is purely an encoder decision at present.
+    if (!cpi->oxcf.error_resilient_mode && cm->refresh_golden_frame)
+        cm->copy_buffer_to_arf  = 2;
+    else
+        cm->copy_buffer_to_arf  = 0;
+
+    if (cm->refresh_last_frame)
+    {
+        vp8_swap_yv12_buffer(&cm->last_frame, &cm->new_frame);
+        cm->frame_to_show = &cm->last_frame;
+    }
+    else
+        cm->frame_to_show = &cm->new_frame;
+
+
+
+    //#pragma omp parallel sections
+    {
+
+        //#pragma omp section
+        {
+
+            struct vpx_usec_timer timer;
+
+            vpx_usec_timer_start(&timer);
+
+            if (cpi->sf.auto_filter == 0)
+                vp8cx_pick_filter_level_fast(cpi->Source, cpi);
+            else
+                vp8cx_pick_filter_level(cpi->Source, cpi);
+
+            vpx_usec_timer_mark(&timer);
+
+            cpi->time_pick_lpf +=  vpx_usec_timer_elapsed(&timer);
+
+            if (cm->no_lpf)
+                cm->filter_level = 0;
+
+            if (cm->filter_level > 0)
+            {
+                vp8cx_set_alt_lf_level(cpi, cm->filter_level);
+                vp8_loop_filter_frame(cm, &cpi->mb.e_mbd, cm->filter_level);
+                cm->last_frame_type = cm->frame_type;
+                cm->last_filter_type = cm->filter_type;
+                cm->last_sharpness_level = cm->sharpness_level;
+            }
+
+            vp8_yv12_extend_frame_borders_ptr(cm->frame_to_show);
+
+            if (cpi->oxcf.error_resilient_mode == 1)
+            {
+                cm->refresh_entropy_probs = 0;
+            }
+
+        }
+//#pragma omp section
+        {
+            // build the bitstream
+            vp8_pack_bitstream(cpi, dest, size);
+        }
+    }
+
+
+    // At this point the new frame has been encoded.
+    // If any buffer copy / swapping is signalled it should be done here.
+    if (cm->frame_type == KEY_FRAME)
+    {
+        vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+        vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+    }
+    else    // For non key frames
+    {
+        // Code to copy between reference buffers
+        if (cm->copy_buffer_to_arf)
+        {
+            if (cm->copy_buffer_to_arf == 1)
+            {
+                if (cm->refresh_last_frame)
+                    // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+                    vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->alt_ref_frame);
+                else
+                    vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->alt_ref_frame);
+            }
+            else if (cm->copy_buffer_to_arf == 2)
+                vp8_yv12_copy_frame_ptr(&cm->golden_frame, &cm->alt_ref_frame);
+        }
+
+        if (cm->copy_buffer_to_gf)
+        {
+            if (cm->copy_buffer_to_gf == 1)
+            {
+                if (cm->refresh_last_frame)
+                    // We copy new_frame here because last and new buffers will already have been swapped if cm->refresh_last_frame is set.
+                    vp8_yv12_copy_frame_ptr(&cm->new_frame, &cm->golden_frame);
+                else
+                    vp8_yv12_copy_frame_ptr(&cm->last_frame, &cm->golden_frame);
+            }
+            else if (cm->copy_buffer_to_gf == 2)
+                vp8_yv12_copy_frame_ptr(&cm->alt_ref_frame, &cm->golden_frame);
+        }
+    }
+
+    // Update rate control heuristics
+    cpi->total_byte_count += (*size);
+    cpi->projected_frame_size = (*size) << 3;
+
+    if (!active_worst_qchanged)
+        vp8_update_rate_correction_factors(cpi, 2);
+
+    cpi->last_q[cm->frame_type] = cm->base_qindex;
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        vp8_adjust_key_frame_context(cpi);
+    }
+
+    // Keep a record of ambient average Q.
+    if (cm->frame_type == KEY_FRAME)
+        cpi->avg_frame_qindex = cm->base_qindex;
+    else
+        cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
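+    // i.e. a rounded exponential moving average weighting the previous
+    // average 3/4 and the current frame 1/4: e.g. an average of 60 and a
+    // base_qindex of 40 give (2 + 180 + 40) >> 2 = 55.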
+
+    // Keep a record from which we can calculate the average Q excluding GF updates and key frames
+    if ((cm->frame_type != KEY_FRAME) && !cm->refresh_golden_frame && !cm->refresh_alt_ref_frame)
+    {
+        cpi->ni_frames++;
+
+        // Calculate the average Q for normal inter frames (not key or GFU frames)
+        // This is used as a basis for setting active worst quality.
+        if (cpi->ni_frames > 150)
+        {
+            cpi->ni_tot_qi += Q;
+            cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+        }
+        // Early in the clip ... average the current frame Q value with the default
+        // entered by the user as a dampening measure
+        else
+        {
+            cpi->ni_tot_qi += Q;
+            cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
+        }
+
+        // If the average Q is higher than what was used in the last frame
+        // (after going through the recode loop to keep the frame size within range)
+        // then use the last frame value - 1.
+        // The -1 is designed to stop Q, and hence the data rate, from progressively
+        // falling away during difficult sections, while at the same time reducing
+        // the number of iterations around the recode loop.
+        if (Q > cpi->ni_av_qi)
+            cpi->ni_av_qi = Q - 1;
+
+    }
+
+#if 0
+
+    // If the frame was massively oversize and we are below optimal buffer level drop next frame
+    if ((cpi->drop_frames_allowed) &&
+        (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+        (cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
+        (cpi->projected_frame_size > (4 * cpi->this_frame_target)))
+    {
+        cpi->drop_frame = TRUE;
+    }
+
+#endif
+
+    // Set the count for maximum consecutive dropped frames based upon the ratio of
+    // this frame size to the target average per frame bandwidth.
+    // (cpi->av_per_frame_bandwidth > 0) is just a sanity check to prevent division by 0.
+    if (cpi->drop_frames_allowed && (cpi->av_per_frame_bandwidth > 0))
+    {
+        cpi->max_drop_count = cpi->projected_frame_size / cpi->av_per_frame_bandwidth;
+
+        if (cpi->max_drop_count > cpi->max_consec_dropped_frames)
+            cpi->max_drop_count = cpi->max_consec_dropped_frames;
+    }
+
+    // Update the buffer level variable.
+    if (cpi->common.refresh_alt_ref_frame)
+        cpi->bits_off_target -= cpi->projected_frame_size;
+    else
+        cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+
+    // Rolling monitors of whether we are over- or under-spending, used to help regulate min and max Q in two pass.
+    cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
+    cpi->rolling_actual_bits = ((cpi->rolling_actual_bits * 3) + cpi->projected_frame_size + 2) / 4;
+    cpi->long_rolling_target_bits = ((cpi->long_rolling_target_bits * 31) + cpi->this_frame_target + 16) / 32;
+    cpi->long_rolling_actual_bits = ((cpi->long_rolling_actual_bits * 31) + cpi->projected_frame_size + 16) / 32;
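+    // The short rolling averages weight history 3/4 against 1/4 for the new
+    // value, the long ones 31/32 against 1/32; the +2 and +16 terms make the
+    // integer division round to nearest.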
+
+    // Actual bits spent
+    cpi->total_actual_bits    += cpi->projected_frame_size;
+
+    // Debug stats
+    cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
+
+    cpi->buffer_level = cpi->bits_off_target;
+
+    // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
+    if (cm->frame_type == KEY_FRAME)
+    {
+        cpi->kf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+        if (cpi->kf_group_bits < 0)
+            cpi->kf_group_bits = 0 ;
+    }
+    else if (cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+    {
+        cpi->gf_group_bits += cpi->this_frame_target - cpi->projected_frame_size;
+
+        if (cpi->gf_group_bits < 0)
+            cpi->gf_group_bits = 0 ;
+    }
+
+    if (cm->frame_type != KEY_FRAME)
+    {
+        if (cpi->common.refresh_alt_ref_frame)
+        {
+            cpi->last_skip_false_probs[2] = cpi->prob_skip_false;
+            cpi->last_skip_probs_q[2] = cm->base_qindex;
+        }
+        else if (cpi->common.refresh_golden_frame)
+        {
+            cpi->last_skip_false_probs[1] = cpi->prob_skip_false;
+            cpi->last_skip_probs_q[1] = cm->base_qindex;
+        }
+        else
+        {
+            cpi->last_skip_false_probs[0] = cpi->prob_skip_false;
+            cpi->last_skip_probs_q[0] = cm->base_qindex;
+
+            //update the baseline
+            cpi->base_skip_false_prob[cm->base_qindex] = cpi->prob_skip_false;
+
+        }
+    }
+
+#if CONFIG_PSNR
+
+    if (0)
+    {
+        FILE *f = fopen("tmp.stt", "a");
+
+        vp8_clear_system_state();  //__asm emms;
+
+        if (cpi->total_coded_error_left != 0.0)
+            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality,  cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left, (double)cpi->bits_left / cpi->total_coded_error_left,  cpi->tot_recode_hits);
+        else
+            fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f %8ld\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), (int)cpi->total_target_vs_actual, (cpi->oxcf.starting_buffer_level - cpi->bits_off_target), (int)cpi->total_actual_bits, cm->base_qindex, cpi->active_best_quality, cpi->active_worst_quality,  cpi->avg_frame_qindex, cpi->zbin_over_quant, cm->refresh_golden_frame, cm->refresh_alt_ref_frame, cm->frame_type, cpi->gfu_boost, cpi->est_max_qcorrection_factor, (int)cpi->bits_left, cpi->total_coded_error_left,   cpi->tot_recode_hits);
+
+        fclose(f);
+
+        {
+            FILE *fmodes = fopen("Modes.stt", "a");
+            int i;
+
+            fprintf(fmodes, "%6d:%1d:%1d:%1d ", cpi->common.current_video_frame, cm->frame_type, cm->refresh_golden_frame, cm->refresh_alt_ref_frame);
+
+            for (i = 0; i < MAX_MODES; i++)
+                fprintf(fmodes, "%5d ", cpi->mode_chosen_counts[i]);
+
+            fprintf(fmodes, "\n");
+
+            fclose(fmodes);
+        }
+    }
+
+#endif
+
+    // If this was a KF or GF, note the Q
+    if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cm->refresh_alt_ref_frame)
+        cm->last_kf_gf_q = cm->base_qindex;
+
+    if (cm->refresh_golden_frame == 1)
+        cm->frame_flags = cm->frame_flags | FRAMEFLAGS_GOLDEN;
+    else
+        cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_GOLDEN;
+
+    if (cm->refresh_alt_ref_frame == 1)
+        cm->frame_flags = cm->frame_flags | FRAMEFLAGS_ALTREF;
+    else
+        cm->frame_flags = cm->frame_flags&~FRAMEFLAGS_ALTREF;
+
+
+    if (cm->refresh_last_frame & cm->refresh_golden_frame) // both refreshed
+        cpi->gold_is_last = 1;
+    else if (cm->refresh_last_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+        cpi->gold_is_last = 0;
+
+    if (cm->refresh_last_frame & cm->refresh_alt_ref_frame) // both refreshed
+        cpi->alt_is_last = 1;
+    else if (cm->refresh_last_frame ^ cm->refresh_alt_ref_frame) // 1 refreshed but not the other
+        cpi->alt_is_last = 0;
+
+    if (cm->refresh_alt_ref_frame & cm->refresh_golden_frame) // both refreshed
+        cpi->gold_is_alt = 1;
+    else if (cm->refresh_alt_ref_frame ^ cm->refresh_golden_frame) // 1 refreshed but not the other
+        cpi->gold_is_alt = 0;
+
+    cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+    if (cpi->gold_is_last)
+        cpi->ref_frame_flags &= ~VP8_GOLD_FLAG;
+
+    if (cpi->alt_is_last)
+        cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
+
+    if (cpi->gold_is_alt)
+        cpi->ref_frame_flags &= ~VP8_ALT_FLAG;
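+    // Where two reference slots hold the same buffer, the duplicate flag is
+    // cleared so the mode search need not evaluate the same frame twice.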
+
+
+    if (cpi->oxcf.error_resilient_mode)
+    {
+        // Is this an alternate reference update
+        if (cpi->common.refresh_alt_ref_frame)
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->alt_ref_frame);
+
+        if (cpi->common.refresh_golden_frame)
+            vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cm->golden_frame);
+    }
+    else
+    {
+        if (cpi->oxcf.play_alternate && cpi->common.refresh_alt_ref_frame)
+            // Update the alternate reference frame and stats as appropriate.
+            update_alt_ref_frame_and_stats(cpi);
+        else
+            // Update the golden frame and stats as appropriate.
+            update_golden_frame_and_stats(cpi);
+    }
+
+    if (cm->frame_type == KEY_FRAME)
+    {
+        // Tell the caller that the frame was coded as a key frame
+        *frame_flags = cm->frame_flags | FRAMEFLAGS_KEY;
+
+        // As this frame is a key frame, the next defaults to an inter frame.
+        cm->frame_type = INTER_FRAME;
+
+        cpi->last_frame_percent_intra = 100;
+    }
+    else
+    {
+        *frame_flags = cm->frame_flags&~FRAMEFLAGS_KEY;
+
+        cpi->last_frame_percent_intra = cpi->this_frame_percent_intra;
+    }
+
+    // Clear the one shot update flags for segmentation map and mode/ref loop filter deltas.
+    cpi->mb.e_mbd.update_mb_segmentation_map = 0;
+    cpi->mb.e_mbd.update_mb_segmentation_data = 0;
+    cpi->mb.e_mbd.mode_ref_lf_delta_update = 0;
+
+
+    // Don't increment frame counters if this was an altref buffer update, not a real frame
+    if (cm->show_frame)
+    {
+        cm->current_video_frame++;
+        cpi->frames_since_key++;
+    }
+
+    // reset to normal state now that we are done.
+
+
+
+    if (0)
+    {
+        char filename[512];
+        FILE *recon_file;
+        sprintf(filename, "enc%04d.yuv", (int) cm->current_video_frame);
+        recon_file = fopen(filename, "wb");
+        fwrite(cm->last_frame.buffer_alloc, cm->last_frame.frame_size, 1, recon_file);
+        fclose(recon_file);
+    }
+
+    // DEBUG
+    //vp8_write_yuv_frame("encoder_recon.yuv", cm->frame_to_show);
+
+
+}
+
+int vp8_is_gf_update_needed(VP8_PTR ptr)
+{
+    VP8_COMP *cpi = (VP8_COMP *) ptr;
+    int ret_val;
+
+    ret_val = cpi->gf_update_recommended;
+    cpi->gf_update_recommended = 0;
+
+    return ret_val;
+}
+
+void vp8_check_gf_quality(VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+    int gf_active_pct = (100 * cm->gf_active_count) / (cm->mb_rows * cm->mb_cols);
+    int gf_ref_usage_pct = (cpi->count_mb_ref_frame_usage[GOLDEN_FRAME] * 100) / (cm->mb_rows * cm->mb_cols);
+    int last_ref_zz_useage = (cpi->inter_zz_count * 100) / (cm->mb_rows * cm->mb_cols);
+
+    // Gf refresh is not currently being signalled
+    if (cpi->gf_update_recommended == 0)
+    {
+        if (cpi->common.frames_since_golden > 7)
+        {
+            // Low use of gf
+            if ((gf_active_pct < 10) || ((gf_active_pct + gf_ref_usage_pct) < 15))
+            {
+                // ...but last frame zero-zero usage is reasonable, so a new GF might be appropriate
+                if (last_ref_zz_useage >= 25)
+                {
+                    cpi->gf_bad_count ++;
+
+                    if (cpi->gf_bad_count >= 8)   // Check that the condition is stable
+                    {
+                        cpi->gf_update_recommended = 1;
+                        cpi->gf_bad_count = 0;
+                    }
+                }
+                else
+                    cpi->gf_bad_count = 0;        // Restart count as the background is not stable enough
+            }
+            else
+                cpi->gf_bad_count = 0;            // GF usage has picked up so reset the count
+        }
+    }
+    // If the signal is set but has not been read, should we cancel it?
+    else if (last_ref_zz_useage < 15)
+    {
+        cpi->gf_update_recommended = 0;
+        cpi->gf_bad_count = 0;
+    }
+
+#if 0
+
+    if (0)
+    {
+        FILE *f = fopen("gfneeded.stt", "a");
+        fprintf(f, "%10d %10d %10d %10d %10ld \n",
+                cm->current_video_frame,
+                cpi->common.frames_since_golden,
+                gf_active_pct, gf_ref_usage_pct,
+                cpi->gf_update_recommended);
+        fclose(f);
+    }
+
+#endif
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, unsigned int *frame_flags)
+{
+    double two_pass_min_rate = (double)(cpi->oxcf.target_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
+
+    if (!cpi->common.refresh_alt_ref_frame)
+        vp8_second_pass(cpi);
+
+    encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+    cpi->bits_left -= 8 * *size;
+
+    if (!cpi->common.refresh_alt_ref_frame)
+        cpi->bits_left += (long long)(two_pass_min_rate / cpi->oxcf.frame_rate);
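+    // bits_left is debited with the bits actually spent; frames other than
+    // alt ref updates are credited back the per-frame share of the two-pass
+    // minimum section rate (two_pass_vbrmin_section percent of the target
+    // bandwidth).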
+}
+#endif
+
+//For ARM NEON, d8-d15 are callee-saved registers, and need to be saved by us.
+#if HAVE_ARMV7
+extern void vp8_push_neon(INT64 *store);
+extern void vp8_pop_neon(INT64 *store);
+static INT64 store_reg[8];
+#endif
+int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time)
+{
+    VP8_COMP *cpi = (VP8_COMP *) ptr;
+    VP8_COMMON *cm = &cpi->common;
+    struct vpx_usec_timer  timer;
+
+    if (!cpi)
+        return -1;
+
+#if HAVE_ARMV7
+    vp8_push_neon(store_reg);
+#endif
+
+    vpx_usec_timer_start(&timer);
+
+    // No more room for frames.
+    if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames)
+    {
+#if HAVE_ARMV7
+        vp8_pop_neon(store_reg);
+#endif
+        return -1;
+    }
+
+    //printf("in-cpi->source_buffer_count: %d\n", cpi->source_buffer_count);
+
+    cm->clr_type = sd->clrtype;
+
+    // make a copy of the frame for use later...
+#if !(CONFIG_REALTIME_ONLY)
+
+    if (cpi->oxcf.allow_lag)
+    {
+        int which_buffer =  cpi->source_encode_index - 1;
+        SOURCE_SAMPLE *s;
+
+        if (which_buffer == -1)
+            which_buffer = cpi->oxcf.lag_in_frames - 1;
+
+        if (cpi->source_buffer_count < cpi->oxcf.lag_in_frames - 1)
+            which_buffer = cpi->source_buffer_count;
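+        // The lag buffer is circular: while it is still filling, new frames
+        // are appended at source_buffer_count; once full, they are written
+        // into the slot just behind source_encode_index (wrapping to
+        // lag_in_frames - 1 from slot 0).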
+
+        s = &cpi->src_buffer[which_buffer];
+
+        s->source_time_stamp = time_stamp;
+        s->source_end_time_stamp = end_time;
+        s->source_frame_flags = frame_flags;
+        vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+
+        cpi->source_buffer_count ++;
+    }
+    else
+#endif
+    {
+        SOURCE_SAMPLE *s;
+        s = &cpi->src_buffer[0];
+        s->source_end_time_stamp = end_time;
+        s->source_time_stamp = time_stamp;
+        s->source_frame_flags = frame_flags;
+#if HAVE_ARMV7
+        vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer);
+#else
+        vp8_yv12_copy_frame_ptr(sd, &s->source_buffer);
+#endif
+        cpi->source_buffer_count = 1;
+    }
+
+    vpx_usec_timer_mark(&timer);
+    cpi->time_receive_data += vpx_usec_timer_elapsed(&timer);
+
+#if HAVE_ARMV7
+    vp8_pop_neon(store_reg);
+#endif
+
+    return 0;
+}
+int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush)
+{
+
+    VP8_COMP *cpi = (VP8_COMP *) ptr;
+    VP8_COMMON *cm = &cpi->common;
+    struct vpx_usec_timer  tsctimer;
+    struct vpx_usec_timer  ticktimer;
+    struct vpx_usec_timer  cmptimer;
+
+    if (!cpi)
+        return -1;
+
+#if HAVE_ARMV7
+    vp8_push_neon(store_reg);
+#endif
+
+    vpx_usec_timer_start(&cmptimer);
+
+
+    // The flush variable tells us that even though we have fewer frames in our
+    // buffer than the configured lag we need to start producing compressed frames,
+    // most likely because we are at the end of the file.
+    if ((cpi->source_buffer_count == cpi->oxcf.lag_in_frames && cpi->oxcf.lag_in_frames > 0)
+        || (!cpi->oxcf.allow_lag && cpi->source_buffer_count > 0)
+        || (flush && cpi->source_buffer_count > 0))
+    {
+
+        SOURCE_SAMPLE *s;
+
+        s = &cpi->src_buffer[cpi->source_encode_index];
+        cpi->source_time_stamp = s->source_time_stamp;
+        cpi->source_end_time_stamp = s->source_end_time_stamp;
+
+#if !(CONFIG_REALTIME_ONLY)
+
+        // Should we code an alternate reference frame
+        if (cpi->oxcf.error_resilient_mode == 0 &&
+            cpi->oxcf.play_alternate &&
+            cpi->source_alt_ref_pending  &&
+            (cpi->frames_till_gf_update_due < cpi->source_buffer_count) &&
+            cpi->oxcf.lag_in_frames != 0)
+        {
+            cpi->last_alt_ref_sei = (cpi->source_encode_index + cpi->frames_till_gf_update_due) % cpi->oxcf.lag_in_frames;
+
+#if VP8_TEMPORAL_ALT_REF
+
+            if (cpi->oxcf.arnr_max_frames > 0)
+            {
+#if 0
+                // my attempt at a loop that tests the results of strength filter.
+                int start_frame = cpi->last_alt_ref_sei - 3;
+
+                int i, besti = -1, pastin = cpi->oxcf.arnr_strength;
+
+                int besterr;
+
+                if (start_frame < 0)
+                    start_frame += cpi->oxcf.lag_in_frames;
+
+                besterr = vp8_calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer,
+                                              &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
+
+                for (i = 0; i < 7; i++)
+                {
+                    int thiserr;
+                    cpi->oxcf.arnr_strength = i;
+                    vp8cx_temp_filter_c(cpi);
+
+                    thiserr = vp8_calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer,
+                                                  &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance));
+
+                    if (10 * thiserr < besterr * 8)
+                    {
+                        besterr = thiserr;
+                        besti = i;
+                    }
+                }
+
+                if (besti != -1)
+                {
+                    cpi->oxcf.arnr_strength = besti;
+                    vp8cx_temp_filter_c(cpi);
+                    s = &cpi->alt_ref_buffer;
+
+                    // FWG: not sure if this data needs to be copied for the alt ref frame
+                    s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp;
+                    s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp;
+                    s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags;
+                }
+                else
+                    s = &cpi->src_buffer[cpi->last_alt_ref_sei];
+
+#else
+                vp8cx_temp_filter_c(cpi);
+                s = &cpi->alt_ref_buffer;
+
+                // FWG: not sure if this data needs to be copied for the alt ref frame
+                s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp;
+                s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp;
+                s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags;
+
+#endif
+            }
+            else
+#endif
+                s = &cpi->src_buffer[cpi->last_alt_ref_sei];
+
+            cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due;
+            cm->refresh_alt_ref_frame = 1;
+            cm->refresh_golden_frame = 0;
+            cm->refresh_last_frame = 0;
+            cm->show_frame = 0;
+            cpi->source_alt_ref_pending = FALSE;   // Clear pending alt ref flag.
+            cpi->is_src_frame_alt_ref = 0;
+        }
+        else
+#endif
+        {
+            cm->show_frame = 1;
+#if !(CONFIG_REALTIME_ONLY)
+
+            if (cpi->oxcf.allow_lag)
+            {
+                if (cpi->source_encode_index ==  cpi->last_alt_ref_sei)
+                {
+#if VP8_TEMPORAL_ALT_REF
+
+                    if (cpi->oxcf.arnr_max_frames == 0)
+                    {
+                        cpi->is_src_frame_alt_ref = 1; // copy alt ref
+                    }
+                    else
+                    {
+                        cpi->is_src_frame_alt_ref = 0;
+                    }
+
+#else
+                    cpi->is_src_frame_alt_ref = 1;
+#endif
+                    cpi->last_alt_ref_sei    = -1;
+                }
+                else
+                    cpi->is_src_frame_alt_ref = 0;
+
+                cpi->source_encode_index = (cpi->source_encode_index + 1) % cpi->oxcf.lag_in_frames;
+            }
+
+#endif
+            cpi->source_buffer_count--;
+        }
+
+        cpi->un_scaled_source = &s->source_buffer;
+        cpi->Source = &s->source_buffer;
+        cpi->source_frame_flags = s->source_frame_flags;
+
+        *time_stamp = cpi->source_time_stamp;
+        *time_end = cpi->source_end_time_stamp;
+    }
+    else
+    {
+        *size = 0;
+#if !(CONFIG_REALTIME_ONLY)
+
+        if (flush && cpi->pass == 1 && !cpi->first_pass_done)
+        {
+            vp8_end_first_pass(cpi);    /* get last stats packet */
+            cpi->first_pass_done = 1;
+        }
+
+#endif
+
+#if HAVE_ARMV7
+        vp8_pop_neon(store_reg);
+#endif
+        return -1;
+    }
+
+    *frame_flags = cpi->source_frame_flags;
+
+#if CONFIG_PSNR
+
+    if (cpi->source_time_stamp < cpi->first_time_stamp_ever)
+        cpi->first_time_stamp_ever = cpi->source_time_stamp;
+
+#endif
+
+    // adjust frame rates based on timestamps given
+    if (!cm->refresh_alt_ref_frame)
+    {
+        if (cpi->last_time_stamp_seen == 0)
+        {
+            double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp);
+
+            vp8_new_frame_rate(cpi, this_fps);
+        }
+        else
+        {
+            // Gap between consecutive source timestamps, in the same 100ns units.
+            long long ticks = cpi->source_time_stamp - cpi->last_time_stamp_seen;
+            double this_fps = 10000000.000 / ticks;
+
+            vp8_new_frame_rate(cpi, (7 * cpi->oxcf.frame_rate + this_fps) / 8);
+
+        }
+
+        cpi->last_time_stamp_seen = cpi->source_time_stamp;
+    }
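+    // The 7/8 smoothing above adapts slowly to timestamp jitter: e.g. with a
+    // configured rate of 30 fps and one inter-frame gap measuring 24 fps, the
+    // estimate moves to (7 * 30 + 24) / 8 = 29.25 fps rather than jumping to 24.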
+
+    if (cpi->compressor_speed == 2)
+    {
+        vp8_check_gf_quality(cpi);
+    }
+
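+    // Note: cpi has already been dereferenced above (e.g. compressor_speed),
+    // so this NULL check can never trigger by the time execution reaches it.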
+    if (!cpi)
+    {
+#if HAVE_ARMV7
+        vp8_pop_neon(store_reg);
+#endif
+        return 0;
+    }
+
+    if (cpi->compressor_speed == 2)
+    {
+        vpx_usec_timer_start(&tsctimer);
+        vpx_usec_timer_start(&ticktimer);
+    }
+
+    // start with a 0 size frame
+    *size = 0;
+
+    // Clear down mmx registers
+    vp8_clear_system_state();  //__asm emms;
+
+    cm->frame_type = INTER_FRAME;
+    cm->frame_flags = *frame_flags;
+
+#if 0
+
+    if (cm->refresh_alt_ref_frame)
+    {
+        //cm->refresh_golden_frame = 1;
+        cm->refresh_golden_frame = 0;
+        cm->refresh_last_frame = 0;
+    }
+    else
+    {
+        cm->refresh_golden_frame = 0;
+        cm->refresh_last_frame = 1;
+    }
+
+#endif
+
+#if !(CONFIG_REALTIME_ONLY)
+
+    if (cpi->pass == 1)
+    {
+        Pass1Encode(cpi, size, dest, frame_flags);
+    }
+    else if (cpi->pass == 2)
+    {
+        Pass2Encode(cpi, size, dest, frame_flags);
+    }
+    else
+#endif
+        encode_frame_to_data_rate(cpi, size, dest, frame_flags);
+
+    if (cpi->compressor_speed == 2)
+    {
+        unsigned int duration, duration2;
+        vpx_usec_timer_mark(&tsctimer);
+        vpx_usec_timer_mark(&ticktimer);
+
+        duration = vpx_usec_timer_elapsed(&ticktimer);
+        duration2 = (unsigned int)((double)duration / 2);
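+        // duration2 (half the frame's tick time) is used below as a rough
+        // running estimate of the time spent in mode selection.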
+
+        if (cm->frame_type != KEY_FRAME)
+        {
+            if (cpi->avg_encode_time == 0)
+                cpi->avg_encode_time = duration;
+            else
+                cpi->avg_encode_time = (7 * cpi->avg_encode_time + duration) >> 3;
+        }
+
+        if (duration2)
+        {
+            //if(*frame_flags!=1)
+            {
+
+                if (cpi->avg_pick_mode_time == 0)
+                    cpi->avg_pick_mode_time = duration2;
+                else
+                    cpi->avg_pick_mode_time = (7 * cpi->avg_pick_mode_time + duration2) >> 3;
+            }
+        }
+
+    }
+
+    if (cm->refresh_entropy_probs == 0)
+    {
+        vpx_memcpy(&cm->fc, &cm->lfc, sizeof(cm->fc));
+    }
+
+    // If the frame was dropped (*size == 0) the refresh requests stay pending
+    // and are honored on a subsequent frame; otherwise return to normal state.
+    if (*size > 0)
+    {
+
+        // return to normal state
+        cpi->ref_frame_flags = VP8_ALT_FLAG | VP8_GOLD_FLAG | VP8_LAST_FLAG;
+
+        cm->refresh_entropy_probs = 1;
+        cm->refresh_alt_ref_frame = 0;
+        cm->refresh_golden_frame = 0;
+        cm->refresh_last_frame = 1;
+        cm->frame_type = INTER_FRAME;
+
+    }
+
+    cpi->ready_for_new_frame = 1;
+
+    vpx_usec_timer_mark(&cmptimer);
+    cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
+
+    if (cpi->b_calculate_psnr && cpi->pass != 1 && cm->show_frame)
+        generate_psnr_packet(cpi);
+
+#if CONFIG_PSNR
+
+    if (cpi->pass != 1)
+    {
+        cpi->bytes += *size;
+
+        if (cm->show_frame)
+        {
+
+            cpi->count ++;
+
+            if (cpi->b_calculate_psnr)
+            {
+                double y, u, v;
+                double sq_error;
+                double frame_psnr = vp8_calc_psnr(cpi->Source, cm->frame_to_show, &y, &u, &v, &sq_error);
+
+                cpi->total_y += y;
+                cpi->total_u += u;
+                cpi->total_v += v;
+                cpi->total_sq_error += sq_error;
+                cpi->total  += frame_psnr;
+                {
+                    double y2, u2, v2, frame_psnr2, frame_ssim2 = 0;
+                    double weight = 0;
+
+                    vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc));
+                    vp8_clear_system_state();
+                    frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error);
+                    frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight);
+
+                    cpi->summed_quality += frame_ssim2 * weight;
+                    cpi->summed_weights += weight;
+
+                    cpi->totalp_y += y2;
+                    cpi->totalp_u += u2;
+                    cpi->totalp_v += v2;
+                    cpi->totalp  += frame_psnr2;
+                    cpi->total_sq_error2 += sq_error;
+
+                }
+            }
+
+            if (cpi->b_calculate_ssimg)
+            {
+                double y, u, v, frame_all;
+                frame_all =  vp8_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
+                cpi->total_ssimg_y += y;
+                cpi->total_ssimg_u += u;
+                cpi->total_ssimg_v += v;
+                cpi->total_ssimg_all += frame_all;
+            }
+
+        }
+    }
+
+#if 0
+
+    if (cpi->common.frame_type != 0 && cpi->common.base_qindex == cpi->oxcf.worst_allowed_q)
+    {
+        skiptruecount += cpi->skip_true_count;
+        skipfalsecount += cpi->skip_false_count;
+    }
+
+#endif
+#if 0
+
+    if (cpi->pass != 1)
+    {
+        FILE *f = fopen("skip.stt", "a");
+        fprintf(f, "frame:%4d flags:%4x Q:%4d P:%4d Size:%5d\n", cpi->common.current_video_frame, *frame_flags, cpi->common.base_qindex, cpi->prob_skip_false, *size);
+
+        if (cpi->is_src_frame_alt_ref == 1)
+            fprintf(f, "skipcount: %4d framesize: %d\n", cpi->skip_true_count , *size);
+
+        fclose(f);
+    }
+
+#endif
+#endif
+
+#if HAVE_ARMV7
+    vp8_pop_neon(store_reg);
+#endif
+
+    return 0;
+}
+
+int vp8_get_preview_raw_frame(VP8_PTR comp, YV12_BUFFER_CONFIG *dest, int deblock_level, int noise_level, int flags)
+{
+    VP8_COMP *cpi = (VP8_COMP *) comp;
+
+    if (cpi->common.refresh_alt_ref_frame)
+        return -1;
+    else
+    {
+        int ret;
+#if CONFIG_POSTPROC
+        ret = vp8_post_proc_frame(&cpi->common, dest, deblock_level, noise_level, flags);
+#else
+
+        if (cpi->common.frame_to_show)
+        {
+            *dest = *cpi->common.frame_to_show;
+            dest->y_width = cpi->common.Width;
+            dest->y_height = cpi->common.Height;
+            dest->uv_height = cpi->common.Height / 2;
+            ret = 0;
+        }
+        else
+        {
+            ret = -1;
+        }
+
+#endif //!CONFIG_POSTPROC
+        vp8_clear_system_state();
+        return ret;
+    }
+}
+
+int vp8_set_roimap(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4])
+{
+    VP8_COMP *cpi = (VP8_COMP *) comp;
+    signed char feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];
+
+    if (cpi->common.mb_rows != rows || cpi->common.mb_cols != cols)
+        return -1;
+
+    if (!map)
+    {
+        disable_segmentation((VP8_PTR)cpi);
+        return 0;
+    }
+
+    // Set the segmentation Map
+    set_segmentation_map((VP8_PTR)cpi, map);
+
+    // Activate segmentation.
+    enable_segmentation((VP8_PTR)cpi);
+
+    // Set up the quant segment data
+    feature_data[MB_LVL_ALT_Q][0] = delta_q[0];
+    feature_data[MB_LVL_ALT_Q][1] = delta_q[1];
+    feature_data[MB_LVL_ALT_Q][2] = delta_q[2];
+    feature_data[MB_LVL_ALT_Q][3] = delta_q[3];
+
+    // Set up the loop filter segment data
+    feature_data[MB_LVL_ALT_LF][0] = delta_lf[0];
+    feature_data[MB_LVL_ALT_LF][1] = delta_lf[1];
+    feature_data[MB_LVL_ALT_LF][2] = delta_lf[2];
+    feature_data[MB_LVL_ALT_LF][3] = delta_lf[3];
+
+    cpi->segment_encode_breakout[0] = threshold[0];
+    cpi->segment_encode_breakout[1] = threshold[1];
+    cpi->segment_encode_breakout[2] = threshold[2];
+    cpi->segment_encode_breakout[3] = threshold[3];
+
+    // Initialise the feature data structure
+    // SEGMENT_DELTADATA    0, SEGMENT_ABSDATA      1
+    set_segment_data((VP8_PTR)cpi, &feature_data[0][0], SEGMENT_DELTADATA);
+
+    return 0;
+}
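+// Hypothetical usage sketch: give segment 1 a quality boost and leave the
+// other segments alone. 'map' holds one segment id (0..3) per macroblock and
+// must match the encoder's mb_rows x mb_cols dimensions.
+//
+//     int dq[4]  = { 0, -8, 0, 0 };            // segment 1 coded at lower Q
+//     int dlf[4] = { 0, 0, 0, 0 };             // no loop filter deltas
+//     unsigned int thr[4] = { 0, 0, 0, 0 };    // no encode breakout
+//     vp8_set_roimap(comp, map, mb_rows, mb_cols, dq, dlf, thr);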
+
+int vp8_set_active_map(VP8_PTR comp, unsigned char *map, unsigned int rows, unsigned int cols)
+{
+    VP8_COMP *cpi = (VP8_COMP *) comp;
+
+    if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols)
+    {
+        if (map)
+        {
+            vpx_memcpy(cpi->active_map, map, rows * cols);
+            cpi->active_map_enabled = 1;
+        }
+        else
+            cpi->active_map_enabled = 0;
+
+        return 0;
+    }
+    else
+    {
+        //cpi->active_map_enabled = 0;
+        return -1 ;
+    }
+}
+
+int vp8_set_internal_size(VP8_PTR comp, VPX_SCALING horiz_mode, VPX_SCALING vert_mode)
+{
+    VP8_COMP *cpi = (VP8_COMP *) comp;
+
+    if (horiz_mode >= NORMAL && horiz_mode <= ONETWO)
+        cpi->common.horiz_scale = horiz_mode;
+    else
+        return -1;
+
+    if (vert_mode >= NORMAL && vert_mode <= ONETWO)
+        cpi->common.vert_scale  = vert_mode;
+    else
+        return -1;
+
+    return 0;
+}
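+// Hypothetical example: vp8_set_internal_size(comp, ONETWO, ONETWO) requests
+// what appears to be one-half scaling in each direction; any mode outside the
+// [NORMAL, ONETWO] range is rejected with -1.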
+
+
+
+int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    int i, j;
+    int Total = 0;
+
+    unsigned char *src = source->y_buffer;
+    unsigned char *dst = dest->y_buffer;
+    (void)rtcd;
+
+    // Loop through the raw and reconstructed Y planes, summing squared differences
+    for (i = 0; i < source->y_height; i += 16)
+    {
+        for (j = 0; j < source->y_width; j += 16)
+        {
+            unsigned int sse;
+            Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+        }
+
+        src += 16 * source->y_stride;
+        dst += 16 * dest->y_stride;
+    }
+
+    return Total;
+}
+int vp8_calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    int i, j;
+    int Total = 0;
+
+    unsigned char *src = source->y_buffer;
+    unsigned char *dst = dest->y_buffer;
+    (void)rtcd;
+
+    // Loop through the raw and reconstructed Y planes, summing squared differences
+    for (i = 0; i < source->y_height; i += 16)
+    {
+        for (j = 0; j < source->y_width; j += 16)
+        {
+            unsigned int sse;
+            VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+
+            if (sse < 8096)
+                Total += sse;
+        }
+
+        src += 16 * source->y_stride;
+        dst += 16 * dest->y_stride;
+    }
+
+    return Total;
+}
+
+int vp8_get_speed(VP8_PTR c)
+{
+    VP8_COMP   *cpi = (VP8_COMP *) c;
+    return cpi->Speed;
+}
+int vp8_get_quantizer(VP8_PTR c)
+{
+    VP8_COMP   *cpi = (VP8_COMP *) c;
+    return cpi->common.base_qindex;
+}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
new file mode 100644
index 0000000..29b120e
--- /dev/null
+++ b/vp8/encoder/onyx_int.h
@@ -0,0 +1,670 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_INT_H
+#define __INC_VP8_INT_H
+
+#include <stdio.h>
+#include "vpx_ports/config.h"
+#include "onyx.h"
+#include "treewriter.h"
+#include "tokenize.h"
+#include "onyxc_int.h"
+#include "preproc.h"
+#include "variance.h"
+#include "dct.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "entropy.h"
+#include "threading.h"
+#include "vpx_ports/mem.h"
+#include "vpx_codec/internal/vpx_codec_internal.h"
+#include "mcomp.h"
+
+#define INTRARDOPT
+//#define SPEEDSTATS 1
+#define MIN_GF_INTERVAL             4
+#define DEFAULT_GF_INTERVAL         7
+
+#define KEY_FRAME_CONTEXT 5
+
+#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY ? 1 : 25)
+
+#define AF_THRESH   25
+#define AF_THRESH2  100
+#define ARF_DECAY_THRESH 12
+#define MAX_MODES 20
+
+#define MIN_THRESHMULT  32
+#define MAX_THRESHMULT  512
+
+#define GF_ZEROMV_ZBIN_BOOST 24
+#define ZBIN_OQ_MAX 192
+
+#define VP8_TEMPORAL_ALT_REF 1
+
+typedef struct
+{
+    int kf_indicated;
+    unsigned int frames_since_key;
+    unsigned int frames_since_golden;
+    int filter_level;
+    int frames_till_gf_update_due;
+    int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+    MV_CONTEXT mvc[2];
+    int mvcosts[2][MVvals+1];
+
+#ifdef MODE_STATS
+    // Stats
+    int y_modes[5];
+    int uv_modes[4];
+    int b_modes[10];
+    int inter_y_modes[10];
+    int inter_uv_modes[4];
+    int inter_b_modes[10];
+#endif
+
+    vp8_prob ymode_prob[4], uv_mode_prob[3];   /* interframe intra mode probs */
+    vp8_prob kf_ymode_prob[4], kf_uv_mode_prob[3];   /* keyframe "" */
+
+    int ymode_count[5], uv_mode_count[4];  /* intra MB type cts this frame */
+
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+
+    int this_frame_percent_intra;
+    int last_frame_percent_intra;
+
+
+} CODING_CONTEXT;
+
+typedef struct
+{
+    double frame;
+    double intra_error;
+    double coded_error;
+    double ssim_weighted_pred_err;
+    double pcnt_inter;
+    double pcnt_motion;
+    double pcnt_second_ref;
+    double MVr;
+    double mvr_abs;
+    double MVc;
+    double mvc_abs;
+    double MVrv;
+    double MVcv;
+    double mv_in_out_count;
+    double duration;
+    double count;
+}
+FIRSTPASS_STATS;
+
+typedef struct
+{
+    int frames_so_far;
+    double frame_intra_error;
+    double frame_coded_error;
+    double frame_pcnt_inter;
+    double frame_pcnt_motion;
+    double frame_mvr;
+    double frame_mvr_abs;
+    double frame_mvc;
+    double frame_mvc_abs;
+
+} ONEPASS_FRAMESTATS;
+
+
+typedef enum
+{
+    THR_ZEROMV         = 0,
+    THR_DC             = 1,
+
+    THR_NEARESTMV      = 2,
+    THR_NEARMV         = 3,
+
+    THR_ZEROG          = 4,
+    THR_NEARESTG       = 5,
+
+    THR_ZEROA          = 6,
+    THR_NEARESTA       = 7,
+
+    THR_NEARG          = 8,
+    THR_NEARA          = 9,
+
+    THR_V_PRED         = 10,
+    THR_H_PRED         = 11,
+    THR_TM             = 12,
+
+    THR_NEWMV          = 13,
+    THR_NEWG           = 14,
+    THR_NEWA           = 15,
+
+    THR_SPLITMV        = 16,
+    THR_SPLITG         = 17,
+    THR_SPLITA         = 18,
+
+    THR_B_PRED         = 19,
+}
+THR_MODES;
+
+typedef enum
+{
+    DIAMOND = 0,
+    NSTEP = 1,
+    HEX = 2
+} SEARCH_METHODS;
+
+typedef struct
+{
+    int RD;
+    SEARCH_METHODS search_method;
+    int improved_quant;
+    int improved_dct;
+    int auto_filter;
+    int recode_loop;
+    int iterative_sub_pixel;
+    int half_pixel_search;
+    int quarter_pixel_search;
+    int thresh_mult[MAX_MODES];
+    int full_freq[2];
+    int min_fs_radius;
+    int max_fs_radius;
+    int max_step_search_steps;
+    int first_step;
+    int optimize_coefficients;
+
+} SPEED_FEATURES;
+
+typedef struct
+{
+    MACROBLOCK  mb;
+    int mb_row;
+    TOKENEXTRA *tp;
+    int segment_counts[MAX_MB_SEGMENTS];
+    int totalrate;
+    int current_mb_col;
+} MB_ROW_COMP;
+
+typedef struct
+{
+    TOKENEXTRA *start;
+    TOKENEXTRA *stop;
+} TOKENLIST;
+
+typedef struct
+{
+    int ithread;
+    void *ptr1;
+    void *ptr2;
+} ENCODETHREAD_DATA;
+typedef struct
+{
+    int ithread;
+    void *ptr1;
+} LPFTHREAD_DATA;
+
+typedef struct
+{
+    INT64  source_time_stamp;
+    INT64  source_end_time_stamp;
+
+    DECLARE_ALIGNED(16, YV12_BUFFER_CONFIG, source_buffer);
+    unsigned int source_frame_flags;
+} SOURCE_SAMPLE;
+
+typedef struct VP8_ENCODER_RTCD
+{
+    VP8_COMMON_RTCD            *common;
+    vp8_variance_rtcd_vtable_t  variance;
+    vp8_fdct_rtcd_vtable_t      fdct;
+    vp8_encodemb_rtcd_vtable_t  encodemb;
+    vp8_quantize_rtcd_vtable_t  quantize;
+    vp8_search_rtcd_vtable_t    search;
+} VP8_ENCODER_RTCD;
+
+typedef struct
+{
+
+    DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][4][4]);
+
+    DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][4][4]);
+
+    DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][4][4]);
+    DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][4][4]);
+
+    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y1[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, zrun_zbin_boost_y2[QINDEX_RANGE][16]);
+    DECLARE_ALIGNED(16, short, zrun_zbin_boost_uv[QINDEX_RANGE][16]);
+
+
+    MACROBLOCK mb;
+    VP8_COMMON common;
+    vp8_writer bc, bc2;
+    // bool_writer *bc2;
+
+    VP8_CONFIG oxcf;
+
+    YV12_BUFFER_CONFIG *Source;
+    YV12_BUFFER_CONFIG *un_scaled_source;
+    INT64 source_time_stamp;
+    INT64 source_end_time_stamp;
+    unsigned int source_frame_flags;
+    YV12_BUFFER_CONFIG scaled_source;
+
+    int source_buffer_count;
+    int source_encode_index;
+    int source_alt_ref_pending;
+    int source_alt_ref_active;
+
+    int last_alt_ref_sei;
+    int is_src_frame_alt_ref;
+
+    int gold_is_last; // golden frame same as last frame (short-circuit gold searches)
+    int alt_is_last;  // alt-ref frame same as last (short-circuit alt-ref search)
+    int gold_is_alt;  // don't do both alt and gold search (just do gold)
+
+    //int refresh_alt_ref_frame;
+    SOURCE_SAMPLE src_buffer[MAX_LAG_BUFFERS];
+
+    YV12_BUFFER_CONFIG last_frame_uf;
+
+    char *Dest;
+
+    TOKENEXTRA *tok;
+    unsigned int tok_count;
+
+
+    unsigned int frames_since_key;
+    unsigned int key_frame_frequency;
+    unsigned int next_key;
+
+    unsigned int mode_check_freq[MAX_MODES];
+    unsigned int mode_test_hit_counts[MAX_MODES];
+    unsigned int mode_chosen_counts[MAX_MODES];
+    unsigned int mbs_tested_so_far;
+
+    unsigned int check_freq[2];
+    unsigned int do_full[2];
+
+    int rd_thresh_mult[MAX_MODES];
+    int rd_baseline_thresh[MAX_MODES];
+    int rd_threshes[MAX_MODES];
+    int mvcostbase;
+    int mvcostmultiplier;
+    int subseqblockweight;
+    int errthresh;
+
+#ifdef INTRARDOPT
+    int RDMULT;
+    int RDDIV;
+
+    TOKENEXTRA *rdtok;
+    int intra_rd_opt;
+    vp8_writer rdbc;
+    int intra_mode_costs[10];
+#endif
+
+
+    CODING_CONTEXT coding_context;
+
+    // Rate targetting variables
+    long long prediction_error;
+    long long last_prediction_error;
+    long long intra_error;
+    long long last_intra_error;
+    long long last_auto_filter_prediction_error;
+
+#if 0
+    // Experimental RD code
+    long long frame_distortion;
+    long long last_frame_distortion;
+#endif
+
+    int last_mb_distortion;
+
+    int frames_since_auto_filter;
+
+    int this_frame_target;
+    int projected_frame_size;
+    int last_q[2];                   // Separate values for Intra/Inter
+    int target_bits_per_mb;
+
+    double rate_correction_factor;
+    double key_frame_rate_correction_factor;
+    double gf_rate_correction_factor;
+    double est_max_qcorrection_factor;
+
+    int frames_till_gf_update_due;      // Count down till next GF
+    int current_gf_interval;          // GF interval chosen when we coded the last GF
+
+    int gf_overspend_bits;            // Total bits overspent because of GF boost (cumulative)
+
+    int gf_group_bits;                // Projected Bits available for a group of frames including 1 GF or ARF
+    int gf_bits;                     // Bits for the golden frame or ARF - 2 pass only
+    int mid_gf_extra_bits;             // A few extra bits for the frame halfway between two GFs.
+
+    int kf_group_bits;                // Projected total bits available for a key frame group of frames
+    int kf_group_error_left;           // Error score of frames still to be coded in kf group
+    int kf_bits;                     // Bits for the key frame in a key frame group - 2 pass only
+
+    int non_gf_bitrate_adjustment;     // Used in the few frames following a GF to recover the extra bits spent in that GF
+    int initial_gf_use;               // percentage use of gf 2 frames after gf
+
+    int gf_group_error_left;           // Remaining error from uncoded frames in a gf group. Two pass use only
+
+    int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
+    int kf_bitrate_adjustment;        // Current number of bits to try and recover on each inter frame.
+    int max_gf_interval;
+    int baseline_gf_interval;
+    int gf_decay_rate;
+
+    INT64 key_frame_count;
+    INT64 tot_key_frame_bits;
+    int prior_key_frame_size[KEY_FRAME_CONTEXT];
+    int prior_key_frame_distance[KEY_FRAME_CONTEXT];
+    int per_frame_bandwidth;          // Current section per frame bandwidth target
+    int av_per_frame_bandwidth;        // Average frame size target for clip
+    int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
+    int last_key_frame_size;
+    int intra_frame_target;
+    int inter_frame_target;
+    double output_frame_rate;
+    long long last_time_stamp_seen;
+    long long first_time_stamp_ever;
+
+    int ni_av_qi;
+    int ni_tot_qi;
+    int ni_frames;
+    int avg_frame_qindex;
+
+    int zbin_over_quant;
+    int zbin_mode_boost;
+    int zbin_mode_boost_enabled;
+
+    INT64 total_byte_count;
+
+    int buffered_mode;
+
+    int buffer_level;
+    int bits_off_target;
+
+    int rolling_target_bits;
+    int rolling_actual_bits;
+
+    int long_rolling_target_bits;
+    int long_rolling_actual_bits;
+
+    long long total_actual_bits;
+    int total_target_vs_actual;        // debug stats
+
+    int worst_quality;
+    int active_worst_quality;
+    int best_quality;
+    int active_best_quality;
+
+    int drop_frames_allowed;          // Are we permitted to drop frames?
+    int drop_frame;                  // Drop this frame?
+    int drop_count;                  // How many frames have we dropped?
+    int max_drop_count;               // How many frames should we drop?
+    int max_consec_dropped_frames;     // Limit number of consecutive frames that can be dropped.
+
+
+    int ymode_count [VP8_YMODES];        /* intra MB type cts this frame */
+    int uv_mode_count[VP8_UV_MODES];       /* intra MB type cts this frame */
+
+    unsigned int MVcount [2] [MVvals];  /* (row,col) MV cts this frame */
+
+    unsigned int coef_counts [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];  /* for this frame */
+    //DECLARE_ALIGNED(16, int, coef_counts_backup [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens]);   // not used any more
+    // Save the vp8_tree_probs_from_distribution result for each frame to avoid repeated calculation
+    vp8_prob frame_coef_probs [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1];
+    unsigned int frame_branch_ct [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1][2];
+
+    /* Second compressed data partition contains coefficient data. */
+
+    unsigned char *output_partition2;
+    size_t output_partition2size;
+
+    pre_proc_instance ppi;
+
+    int frames_to_key;
+    int gfu_boost;
+    int kf_boost;
+    int last_boost;
+    double total_error_left;
+    double total_intra_error_left;
+    double total_coded_error_left;
+    double start_tot_err_left;
+    double min_error;
+
+    double modified_total_error_left;
+    double avg_iiratio;
+
+    int target_bandwidth;
+    long long bits_left;
+    FIRSTPASS_STATS total_stats;
+    FIRSTPASS_STATS this_frame_stats;
+    FIRSTPASS_STATS *stats_in, *stats_in_end;
+    struct vpx_codec_pkt_list  *output_pkt_list;
+    int                          first_pass_done;
+    unsigned char *fp_motion_map;
+    FILE *fp_motion_mapfile;
+    int fpmm_pos;
+
+#if 0
+    // Experimental code for lagged and one pass
+    ONEPASS_FRAMESTATS one_pass_frame_stats[MAX_LAG_BUFFERS];
+    int one_pass_frame_index;
+#endif
+
+    int decimation_factor;
+    int decimation_count;
+
+    // for real time encoding
+    int avg_encode_time;              //microsecond
+    int avg_pick_mode_time;            //microsecond
+    int Speed;
+    unsigned int cpu_freq;           // MHz
+    int compressor_speed;
+
+    int interquantizer;
+    int auto_gold;
+    int auto_adjust_gold_quantizer;
+    int goldquantizer;
+    int goldfreq;
+    int auto_adjust_key_quantizer;
+    int keyquantizer;
+    int auto_worst_q;
+    int filter_type;
+    int cpu_used;
+    int chroma_boost;
+    int horiz_scale;
+    int vert_scale;
+    int pass;
+
+
+    int prob_intra_coded;
+    int prob_last_coded;
+    int prob_gf_coded;
+    int prob_skip_false;
+    int last_skip_false_probs[3];
+    int last_skip_probs_q[3];
+    int recent_ref_frame_usage[MAX_REF_FRAMES];
+
+    int count_mb_ref_frame_usage[MAX_REF_FRAMES];
+    int this_frame_percent_intra;
+    int last_frame_percent_intra;
+
+    int last_key_frame_q;
+    int last_kffilt_lvl;
+
+    int ref_frame_flags;
+
+    int exp[512];
+
+    SPEED_FEATURES sf;
+    int error_bins[1024];
+
+    int inter_lvl;
+    int intra_lvl;
+    int motion_lvl;
+    int motion_speed;
+    int motion_var;
+    int next_iiratio;
+    int this_iiratio;
+    int this_frame_modified_error;
+
+    double norm_intra_err_per_mb;
+    double norm_inter_err_per_mb;
+    double norm_iidiff_per_mb;
+
+    int last_best_mode_index;          // Record of mode index chosen for previous macro block.
+    int last_auto_filt_val;
+    int last_auto_filt_q;
+
+    // Data used for real time conferencing mode to help determine if it would be good to update the gf
+    int inter_zz_count;
+    int gf_bad_count;
+    int gf_update_recommended;
+    int skip_true_count;
+    int skip_false_count;
+
+    int alt_qcount;
+
+    int ready_for_new_frame;
+
+    unsigned char *segmentation_map;
+    signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS];            // Segment data (can be deltas or absolute values)
+    int  segment_encode_breakout[MAX_MB_SEGMENTS];                    // segment threshold for encode breakout
+
+    unsigned char *active_map;
+    unsigned int active_map_enabled;
+    // Video conferencing cyclic refresh mode flags etc
+    // This mode is designed to clean up the background over time in live encoding scenarios. It uses segmentation.
+    int cyclic_refresh_mode_enabled;
+    int cyclic_refresh_mode_max_mbs_perframe;
+    int cyclic_refresh_mode_index;
+    int cyclic_refresh_q;
+    signed char *cyclic_refresh_map;
+
+    // multithread data
+    int current_mb_col_main;
+    int processor_core_count;
+    int b_multi_threaded;
+    int encoding_thread_count;
+
+#if CONFIG_MULTITHREAD
+    pthread_t *h_encoding_thread;
+#endif
+    MB_ROW_COMP *mb_row_ei;
+    ENCODETHREAD_DATA *en_thread_data;
+
+#if CONFIG_MULTITHREAD
+    //events
+    sem_t *h_event_mbrencoding;
+    sem_t h_event_main;
+#endif
+
+    TOKENLIST *tplist;
+    // end of multithread data
+
+
+    fractional_mv_step_fp *find_fractional_mv_step;
+    vp8_full_search_fn_t full_search_sad;
+    vp8_diamond_search_fn_t diamond_search_sad;
+    vp8_variance_fn_ptr_t fn_ptr;
+    unsigned int time_receive_data;
+    unsigned int time_compress_data;
+    unsigned int time_pick_lpf;
+    unsigned int time_encode_mb_row;
+
+    unsigned int tempdata1;
+    unsigned int tempdata2;
+
+    int base_skip_false_prob[128];
+    unsigned int section_is_low_motion;
+    unsigned int section_benefits_from_aggresive_q;
+    unsigned int section_is_fast_motion;
+    unsigned int section_intra_rating;
+
+    double section_max_qfactor;
+
+
+#if CONFIG_RUNTIME_CPU_DETECT
+    VP8_ENCODER_RTCD            rtcd;
+#endif
+#if VP8_TEMPORAL_ALT_REF
+    SOURCE_SAMPLE alt_ref_buffer;
+    unsigned char *frames[MAX_LAG_BUFFERS];
+    int fixed_divide[255];
+#endif
+
+#if CONFIG_PSNR
+    int    count;
+    double total_y;
+    double total_u;
+    double total_v;
+    double total;
+    double total_sq_error;
+    double totalp_y;
+    double totalp_u;
+    double totalp_v;
+    double totalp;
+    double total_sq_error2;
+    int    bytes;
+    double summed_quality;
+    double summed_weights;
+    unsigned int tot_recode_hits;
+
+
+    double total_ssimg_y;
+    double total_ssimg_u;
+    double total_ssimg_v;
+    double total_ssimg_all;
+
+    int b_calculate_ssimg;
+#endif
+    int b_calculate_psnr;
+} VP8_COMP;
+
+void control_data_rate(VP8_COMP *cpi);
+
+void vp8_encode_frame(VP8_COMP *cpi);
+
+void vp8_pack_bitstream(VP8_COMP *cpi, unsigned char *dest, unsigned long *size);
+
+int rd_cost_intra_mb(MACROBLOCKD *x);
+
+void vp8_tokenize_mb(VP8_COMP *, MACROBLOCKD *, TOKENEXTRA **);
+
+void vp8_set_speed_features(VP8_COMP *cpi);
+
+#if CONFIG_DEBUG
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval" at %s:%d", \
+                               __FILE__,__LINE__);\
+    } while(0)
+#else
+#define CHECK_MEM_ERROR(lval,expr) do {\
+        lval = (expr); \
+        if(!lval) \
+            vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,\
+                               "Failed to allocate "#lval);\
+    } while(0)
+#endif
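+
+// Hypothetical usage: wrap an allocation so a NULL result is routed through
+// the common error handler (note the macro assumes a 'cpi' in scope):
+//     CHECK_MEM_ERROR(cpi->tok, vpx_calloc(max_tokens, sizeof(*cpi->tok)));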
+#endif
diff --git a/vp8/encoder/parms.cpp b/vp8/encoder/parms.cpp
new file mode 100644
index 0000000..66fdafb
--- /dev/null
+++ b/vp8/encoder/parms.cpp
@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#if 0
+
+#include <map>
+#include <string>
+#include <fstream>
+extern "C"
+{
+    #include "onyx.h"
+}
+
+
+using namespace std;
+
+typedef map<string,int> Parms;
+
+#define ALLPARMS(O,DOTHIS) \
+    DOTHIS(O,  interquantizer             )\
+    DOTHIS(O,  auto_gold                   )\
+    DOTHIS(O,  auto_adjust_gold_quantizer    )\
+    DOTHIS(O,  goldquantizer              )\
+    DOTHIS(O,  goldfreq                   )\
+    DOTHIS(O,  auto_key                    )\
+    DOTHIS(O,  auto_adjust_key_quantizer     )\
+    DOTHIS(O,  keyquantizer               )\
+    DOTHIS(O,  keyfreq                    )\
+    DOTHIS(O,  pass                       )\
+    DOTHIS(O,  fixed_q                     )\
+    DOTHIS(O,  target_bandwidth            )\
+    DOTHIS(O,  auto_worst_q                 )\
+    DOTHIS(O,  worst_quality               )\
+    DOTHIS(O,  best_allowed_q               )\
+    DOTHIS(O,  end_usage                   )\
+    DOTHIS(O,  starting_buffer_level        )\
+    DOTHIS(O,  optimal_buffer_level         )\
+    DOTHIS(O,  maximum_buffer_size          )\
+    DOTHIS(O,  under_shoot_pct              )\
+    DOTHIS(O,  allow_df                    )\
+    DOTHIS(O,  drop_frames_water_mark        )\
+    DOTHIS(O,  max_allowed_datarate         )\
+    DOTHIS(O,  two_pass_vbrbias             )\
+    DOTHIS(O,  two_pass_vbrmin_section       )\
+    DOTHIS(O,  two_pass_vbrmax_section       )\
+    DOTHIS(O,  filter_type                 )\
+    DOTHIS(O,  compressor_speed            )\
+    DOTHIS(O,  mbpitch_feature             )\
+    DOTHIS(O,  allow_spatial_resampling     )\
+    DOTHIS(O,  resample_down_water_mark      )\
+    DOTHIS(O,  resample_up_water_mark        )\
+    DOTHIS(O,  noise_sensitivity           )\
+    DOTHIS(O,  horiz_scale                 )\
+    DOTHIS(O,  vert_scale                  )
+
+
+#define GET(O,V) O->V = x[#V];
+#define PUT(O,V) x[#V] = O->V;
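+
+// ALLPARMS is an X-macro: one field list drives reading, writing and printing.
+// #V stringizes the field name, so PUT(ocf, pass) expands to
+// x["pass"] = ocf->pass; and GET(ocf, pass) to ocf->pass = x["pass"];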
+
+
+extern "C" void get_parms(VP8_CONFIG *ocf,char *filename)
+{
+
+    Parms x;
+    int value;
+    string variable;
+    string equal;
+
+    ifstream config_file(filename);
+
+    ALLPARMS(ocf, PUT);
+
+    // store all the parms in a map (really simple parsing)
+    while(!config_file.eof() && config_file.is_open())
+    {
+        config_file >> variable;
+        config_file >> equal;
+
+        if(equal != "=")
+            continue;
+
+        config_file >> value;
+
+        x[variable] = value;
+    }
+
+    ALLPARMS(ocf, GET);
+
+}
+
+#define PRINT(O,V) debug_file<<#V <<" = " << O->V <<"\n";
+extern "C" void print_parms(VP8_CONFIG *ocf,char *filename)
+{
+    ofstream debug_file(filename,ios_base::app);
+    ALLPARMS(ocf, PRINT);
+    debug_file << "=============================================="<<"\n";
+}
+
+#endif
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
new file mode 100644
index 0000000..d61e2ce
--- /dev/null
+++ b/vp8/encoder/pickinter.c
@@ -0,0 +1,923 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <limits.h>
+#include "vpx_ports/config.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "entropymode.h"
+#include "pickinter.h"
+#include "findnearmv.h"
+#include "encodemb.h"
+#include "reconinter.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "g_common.h"
+#include "variance.h"
+#include "mcomp.h"
+
+#include "vpx_mem/vpx_mem.h"
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x)  NULL
+#endif
+
+extern int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd);
+
+#ifdef SPEEDSTATS
+extern unsigned int cnt_pm;
+#endif
+
+extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+
+
+extern unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
+extern int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *, int *, int *, int, int *mvcost[2], int, int fullpixel);
+extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
+extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
+
+
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+{
+    (void) b;
+    (void) d;
+    (void) ref_mv;
+    (void) error_per_bit;
+    (void) svf;
+    (void) vf;
+    (void) mvcost;
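+    /* No subpixel refinement in this stub: just convert the best full-pixel
+       MV to the 1/8-pel units used for motion vectors elsewhere. */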
+    bestmv->row <<= 3;
+    bestmv->col <<= 3;
+    return 0;
+}
+
+
+static int get_inter_mbpred_error(MACROBLOCK *mb, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, unsigned int *sse)
+{
+
+    BLOCK *b = &mb->block[0];
+    BLOCKD *d = &mb->e_mbd.block[0];
+    unsigned char *what = (*(b->base_src) + b->src);
+    int what_stride = b->src_stride;
+    unsigned char *in_what = *(d->base_pre) + d->pre ;
+    int in_what_stride = d->pre_stride;
+    int xoffset = d->bmi.mv.as_mv.col & 7;
+    int yoffset = d->bmi.mv.as_mv.row & 7;
+
+    in_what += (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3);
+
+    if (xoffset | yoffset)
+    {
+        return svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
+    }
+    else
+    {
+        return vf(what, what_stride, in_what, in_what_stride, sse);
+    }
+
+}
+
+unsigned int vp8_get16x16pred_error_c
+(
+    unsigned char *src_ptr,
+    int src_stride,
+    unsigned char *ref_ptr,
+    int ref_stride,
+    int max_sad
+)
+{
+    unsigned pred_error = 0;
+    int i, j;
+    int sum = 0;
+
+    for (i = 0; i < 16; i++)
+    {
+        int diff;
+
+        for (j = 0; j < 16; j++)
+        {
+            diff = src_ptr[j] - ref_ptr[j];
+            sum += diff;
+            pred_error += diff * diff;
+        }
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+    }
+
+    pred_error -= sum * sum / 256;
+    return pred_error;
+}
+
+
+unsigned int vp8_get4x4sse_cs_c
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    int max_sad
+)
+{
+    int distortion = 0;
+    int r, c;
+
+    for (r = 0; r < 4; r++)
+    {
+        for (c = 0; c < 4; c++)
+        {
+            int diff = src_ptr[c] - ref_ptr[c];
+            distortion += diff * diff;
+        }
+
+        src_ptr += source_stride;
+        ref_ptr += recon_stride;
+    }
+
+    return distortion;
+}
+
+static int get_prediction_error(BLOCK *be, BLOCKD *b, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned char *sptr;
+    unsigned char *dptr;
+    sptr = (*(be->base_src) + be->src);
+    dptr = b->predictor;
+
+    return VARIANCE_INVOKE(rtcd, get4x4sse_cs)(sptr, be->src_stride, dptr, 16, 0x7fffffff);
+
+}
+
+static int pick_intra4x4block(
+    const VP8_ENCODER_RTCD *rtcd,
+    MACROBLOCK *x,
+    BLOCK *be,
+    BLOCKD *b,
+    B_PREDICTION_MODE *best_mode,
+    B_PREDICTION_MODE above,
+    B_PREDICTION_MODE left,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+
+    int *bestrate,
+    int *bestdistortion)
+{
+    B_PREDICTION_MODE mode;
+    int best_rd = INT_MAX;
+    int rate;
+    int distortion;
+    unsigned int *mode_costs;
+    (void) l;
+    (void) a;
+
+    if (x->e_mbd.frame_type == KEY_FRAME)
+    {
+        mode_costs = x->bmode_costs[above][left];
+    }
+    else
+    {
+        mode_costs = x->inter_bmode_costs;
+    }
+
+    for (mode = B_DC_PRED; mode <= B_HE_PRED /*B_HU_PRED*/; mode++)
+    {
+        int this_rd;
+
+        rate = mode_costs[mode];
+        vp8_predict_intra4x4(b, mode, b->predictor);
+        distortion = get_prediction_error(be, b, &rtcd->variance);
+        this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate, distortion);
+
+        if (this_rd < best_rd)
+        {
+            *bestrate = rate;
+            *bestdistortion = distortion;
+            best_rd = this_rd;
+            *best_mode = mode;
+        }
+    }
+
+    b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
+    vp8_encode_intra4x4block(rtcd, x, be, b, b->bmi.mode);
+    return best_rd;
+}
+
+
+int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int *Rate, int *best_dist)
+{
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    int i;
+    TEMP_CONTEXT t;
+    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+    int error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, 0); // Rd estimate for the cost of the block prediction mode
+    int distortion = 0;
+
+    vp8_intra_prediction_down_copy(xd);
+    vp8_setup_temp_context(&t, xd->above_context[Y1CONTEXT], xd->left_context[Y1CONTEXT], 4);
+
+    for (i = 0; i < 16; i++)
+    {
+        MODE_INFO *const mic = xd->mode_info_context;
+        const int mis = xd->mode_info_stride;
+        const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
+        const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
+        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(d);
+
+        error += pick_intra4x4block(rtcd,
+                                    mb, mb->block + i, xd->block + i, &best_mode, A, L,
+                                    t.a + vp8_block2above[i],
+                                    t.l + vp8_block2left[i], &r, &d);
+
+        cost += r;
+        distortion += d;
+
+        mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+
+        // Break out early if we have already exceeded the best-so-far value that was passed in
+        if (distortion > *best_dist)
+            break;
+    }
+
+    for (i = 0; i < 16; i++)
+        xd->block[i].bmi.mv.as_int = 0;
+
+    *Rate = cost;
+
+    if (i == 16)
+        *best_dist = distortion;
+    else
+        *best_dist = INT_MAX;
+
+
+    return error;
+}
+
+int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)
+{
+
+    MACROBLOCKD *x = &mb->e_mbd;
+    unsigned char *uabove_row = x->dst.u_buffer - x->dst.uv_stride;
+    unsigned char *vabove_row = x->dst.v_buffer - x->dst.uv_stride;
+    unsigned char *usrc_ptr = (mb->block[16].src + *mb->block[16].base_src);
+    unsigned char *vsrc_ptr = (mb->block[20].src + *mb->block[20].base_src);
+    int uvsrc_stride = mb->block[16].src_stride;
+    unsigned char uleft_col[8];
+    unsigned char vleft_col[8];
+    unsigned char utop_left = uabove_row[-1];
+    unsigned char vtop_left = vabove_row[-1];
+    int i, j;
+    int expected_udc;
+    int expected_vdc;
+    int shift;
+    int Uaverage = 0;
+    int Vaverage = 0;
+    int diff;
+    int pred_error[4] = {0, 0, 0, 0}, best_error = INT_MAX;
+    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+
+
+    for (i = 0; i < 8; i++)
+    {
+        uleft_col[i] = x->dst.u_buffer [i* x->dst.uv_stride -1];
+        vleft_col[i] = x->dst.v_buffer [i* x->dst.uv_stride -1];
+    }
+
+    if (!x->up_available && !x->left_available)
+    {
+        expected_udc = 128;
+        expected_vdc = 128;
+    }
+    else
+    {
+        shift = 2;
+
+        if (x->up_available)
+        {
+
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uabove_row[i];
+                Vaverage += vabove_row[i];
+            }
+
+            shift ++;
+
+        }
+
+        if (x->left_available)
+        {
+            for (i = 0; i < 8; i++)
+            {
+                Uaverage += uleft_col[i];
+                Vaverage += vleft_col[i];
+            }
+
+            shift ++;
+
+        }
+
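+        /* Rounded average of the available neighbours: shift is log2 of the
+           number of samples (8 or 16) and adding half before the shift gives
+           round-to-nearest. */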
+        expected_udc = (Uaverage + (1 << (shift - 1))) >> shift;
+        expected_vdc = (Vaverage + (1 << (shift - 1))) >> shift;
+    }
+
+
+    for (i = 0; i < 8; i++)
+    {
+        for (j = 0; j < 8; j++)
+        {
+
+            int predu = uleft_col[i] + uabove_row[j] - utop_left;
+            int predv = vleft_col[i] + vabove_row[j] - vtop_left;
+            int u_p, v_p;
+
+            u_p = usrc_ptr[j];
+            v_p = vsrc_ptr[j];
+
+            if (predu < 0)
+                predu = 0;
+
+            if (predu > 255)
+                predu = 255;
+
+            if (predv < 0)
+                predv = 0;
+
+            if (predv > 255)
+                predv = 255;
+
+
+            diff = u_p - expected_udc;
+            pred_error[DC_PRED] += diff * diff;
+            diff = v_p - expected_vdc;
+            pred_error[DC_PRED] += diff * diff;
+
+
+            diff = u_p - uabove_row[j];
+            pred_error[V_PRED] += diff * diff;
+            diff = v_p - vabove_row[j];
+            pred_error[V_PRED] += diff * diff;
+
+
+            diff = u_p - uleft_col[i];
+            pred_error[H_PRED] += diff * diff;
+            diff = v_p - vleft_col[i];
+            pred_error[H_PRED] += diff * diff;
+
+
+            diff = u_p - predu;
+            pred_error[TM_PRED] += diff * diff;
+            diff = v_p - predv;
+            pred_error[TM_PRED] += diff * diff;
+
+
+        }
+
+        usrc_ptr += uvsrc_stride;
+        vsrc_ptr += uvsrc_stride;
+
+        if (i == 3)
+        {
+            usrc_ptr = (mb->block[18].src + *mb->block[18].base_src);
+            vsrc_ptr = (mb->block[22].src + *mb->block[22].base_src);
+        }
+
+
+
+    }
+
+
+    for (i = DC_PRED; i <= TM_PRED; i++)
+    {
+        if (best_error > pred_error[i])
+        {
+            best_error = pred_error[i];
+            best_mode = (MB_PREDICTION_MODE)i;
+        }
+    }
+
+
+    mb->e_mbd.mbmi.uv_mode = best_mode;
+    return best_error;
+
+}
+
+
+int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
+{
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+    B_MODE_INFO best_bmodes[16];
+    MB_MODE_INFO best_mbmode;
+    MV best_ref_mv1;
+    MV mode_mv[MB_MODE_COUNT];
+    MB_PREDICTION_MODE this_mode;
+    int num00;
+    int i;
+    int mdcounts[4];
+    int best_rd = INT_MAX;
+    int best_intra_rd = INT_MAX;
+    int mode_index;
+    int ref_frame_cost[MAX_REF_FRAMES];
+    int rate;
+    int rate2;
+    int distortion2;
+    int bestsme;
+    //int all_rds[MAX_MODES];         // Experimental debug code.
+    int best_mode_index = 0;
+    int sse = INT_MAX;
+
+    MV nearest_mv[4];
+    MV near_mv[4];
+    MV best_ref_mv[4];
+    int MDCounts[4][4];
+    unsigned char *y_buffer[4];
+    unsigned char *u_buffer[4];
+    unsigned char *v_buffer[4];
+
+    int skip_mode[4] = {0, 0, 0, 0};
+
+    vpx_memset(mode_mv, 0, sizeof(mode_mv));
+    vpx_memset(nearest_mv, 0, sizeof(nearest_mv));
+    vpx_memset(near_mv, 0, sizeof(near_mv));
+
+
+    // set up all the refframe dependent pointers.
+    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+    {
+        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[LAST_FRAME], &near_mv[LAST_FRAME],
+                          &best_ref_mv[LAST_FRAME], MDCounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
+
+        y_buffer[LAST_FRAME] = cpi->common.last_frame.y_buffer + recon_yoffset;
+        u_buffer[LAST_FRAME] = cpi->common.last_frame.u_buffer + recon_uvoffset;
+        v_buffer[LAST_FRAME] = cpi->common.last_frame.v_buffer + recon_uvoffset;
+    }
+    else
+        skip_mode[LAST_FRAME] = 1;
+
+    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+    {
+        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[GOLDEN_FRAME], &near_mv[GOLDEN_FRAME],
+                          &best_ref_mv[GOLDEN_FRAME], MDCounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+
+        y_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.y_buffer + recon_yoffset;
+        u_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.u_buffer + recon_uvoffset;
+        v_buffer[GOLDEN_FRAME] = cpi->common.golden_frame.v_buffer + recon_uvoffset;
+    }
+    else
+        skip_mode[GOLDEN_FRAME] = 1;
+
+    if (cpi->ref_frame_flags & VP8_ALT_FLAG && cpi->source_alt_ref_active)
+    {
+        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &nearest_mv[ALTREF_FRAME], &near_mv[ALTREF_FRAME],
+                          &best_ref_mv[ALTREF_FRAME], MDCounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
+
+        y_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.y_buffer + recon_yoffset;
+        u_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset;
+        v_buffer[ALTREF_FRAME] = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset;
+    }
+    else
+        skip_mode[ALTREF_FRAME] = 1;
+
+    cpi->mbs_tested_so_far++;          // Count of the number of MBs tested so far this frame
+
+    *returnintra = best_intra_rd;
+    x->skip = 0;
+
+    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
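+
+    // vp8_cost_zero/vp8_cost_one give the bit cost of coding a 0 or 1 with the
+    // supplied probability, so each entry below is the cost of walking the
+    // reference frame signalling tree: intra is one decision, LAST two,
+    // GOLDEN/ALTREF three.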
+
+    // Special case treatment when GF and ARF are not sensible options for reference
+    if (cpi->ref_frame_flags == VP8_LAST_FLAG)
+    {
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_zero(255);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(255)
+                                        + vp8_cost_zero(128);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(255)
+                                        + vp8_cost_one(128);
+    }
+    else
+    {
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_zero(cpi->prob_last_coded);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(cpi->prob_last_coded)
+                                        + vp8_cost_zero(cpi->prob_gf_coded);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(cpi->prob_last_coded)
+                                        + vp8_cost_one(cpi->prob_gf_coded);
+    }
+
+
+
+    best_rd = INT_MAX;
+
+    x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+
+    // Mode search loop; for NEWMV this is also where the best new motion vector is found
+    for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+    {
+        int frame_cost;
+        int this_rd = INT_MAX;
+
+        if (best_rd <= cpi->rd_threshes[mode_index])
+            continue;
+
+        x->e_mbd.mbmi.ref_frame = vp8_ref_frame_order[mode_index];
+
+        if (skip_mode[x->e_mbd.mbmi.ref_frame])
+            continue;
+
+        // Check to see if the testing frequency for this mode is at its max
+        // If so then prevent it from being tested and increase the threshold for its testing
+        if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+        {
+            //if ( (cpi->mbs_tested_so_far / cpi->mode_test_hit_counts[mode_index]) <= cpi->mode_check_freq[mode_index] )
+            if (cpi->mbs_tested_so_far <= (cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index]))
+            {
+                // Increase the threshold for coding this mode to make it less likely to be chosen
+                cpi->rd_thresh_mult[mode_index] += 4;
+
+                if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+                cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+
+                continue;
+            }
+        }
+
+        // We are now going to test the current mode, so increment its test counter
+        cpi->mode_test_hit_counts[mode_index] ++;
+
+        rate2 = 0;
+        distortion2 = 0;
+
+        this_mode = vp8_mode_order[mode_index];
+        
+        // Experimental debug code.
+        //all_rds[mode_index] = -1;
+
+        x->e_mbd.mbmi.mode = this_mode;
+        x->e_mbd.mbmi.uv_mode = DC_PRED;
+
+        // Work out the cost associated with selecting the reference frame
+        frame_cost = ref_frame_cost[x->e_mbd.mbmi.ref_frame];
+        rate2 += frame_cost;
+
+        // everything but intra
+        if (x->e_mbd.mbmi.ref_frame)
+        {
+            x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mbmi.ref_frame];
+            x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mbmi.ref_frame];
+            x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mbmi.ref_frame];
+            mode_mv[NEARESTMV] = nearest_mv[x->e_mbd.mbmi.ref_frame];
+            mode_mv[NEARMV] = near_mv[x->e_mbd.mbmi.ref_frame];
+            best_ref_mv1 = best_ref_mv[x->e_mbd.mbmi.ref_frame];
+            memcpy(mdcounts, MDCounts[x->e_mbd.mbmi.ref_frame], sizeof(mdcounts));
+        }
+
+        // Only consider ZEROMV on ALTREF_FRAME when the source frame is the alt-ref frame.
+        if (cpi->is_src_frame_alt_ref)
+        {
+            if (this_mode != ZEROMV || x->e_mbd.mbmi.ref_frame != ALTREF_FRAME)
+                continue;
+        }
+
+        switch (this_mode)
+        {
+        case B_PRED:
+            distortion2 = *returndistortion;                    // Best so far passed in as breakout value to vp8_pick_intra4x4mby_modes
+            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
+            rate2 += rate;
+            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+
+            if (distortion2 == INT_MAX)
+            {
+                this_rd = INT_MAX;
+            }
+            else
+            {
+                this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+                if (this_rd < best_intra_rd)
+                {
+                    best_intra_rd = this_rd;
+                    *returnintra = best_intra_rd ;
+                }
+            }
+
+            break;
+
+        case SPLITMV:
+
+            // Split MV modes currently not supported when RD is not enabled.
+            break;
+
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
+            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff);
+            rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode];
+            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+            if (this_rd < best_intra_rd)
+            {
+                best_intra_rd = this_rd;
+                *returnintra = best_intra_rd ;
+            }
+
+            break;
+
+        case NEWMV:
+        {
+            int thissme;
+            int step_param;
+            int further_steps;
+            int n = 0;
+            int sadpb = x->sadperbit16;
+
+            // Further step/diamond searches as necessary
+            if (cpi->Speed < 8)
+            {
+                step_param = cpi->sf.first_step + ((cpi->Speed > 5) ? 1 : 0);
+                further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+            }
+            else
+            {
+                step_param = cpi->sf.first_step + 2;
+                further_steps = 0;
+            }
+
+#if 0
+
+            // Initial step Search
+            bestsme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, cpi->mb.mvcost);
+            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+            // Further step searches
+            while (n < further_steps)
+            {
+                n++;
+
+                if (num00)
+                    num00--;
+                else
+                {
+                    thissme = vp8_diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, x->errorperbit, &num00, &cpi->fn_ptr, cpi->mb.mvsadcost, x->mvcost);
+
+                    if (thissme < bestsme)
+                    {
+                        bestsme = thissme;
+                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+                    }
+                    else
+                    {
+                        d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+                        d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+                    }
+                }
+            }
+
+#else
+
+            if (cpi->sf.search_method == HEX)
+            {
+                bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+            }
+            else
+            {
+                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+                // Further step/diamond searches as necessary
+                n = 0;
+                //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+                n = num00;
+                num00 = 0;
+
+                while (n < further_steps)
+                {
+                    n++;
+
+                    if (num00)
+                        num00--;
+                    else
+                    {
+                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+
+                        if (thissme < bestsme)
+                        {
+                            bestsme = thissme;
+                            mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                            mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+                        }
+                        else
+                        {
+                            d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+                            d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+                        }
+                    }
+                }
+            }
+
+#endif
+        }
+
+        if (bestsme < INT_MAX)
+            cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, cpi->fn_ptr.svf, cpi->fn_ptr.vf, cpi->mb.mvcost);
+
+        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+        // Add the motion vector rate cost.
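+        // (the Weight argument of 128 appears to act as a unity scale factor for the mv rate term)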
+        rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv1, cpi->mb.mvcost, 128);
+
+
+        case NEARESTMV:
+        case NEARMV:
+
+            if (mode_mv[this_mode].row == 0 && mode_mv[this_mode].col == 0)
+                continue;
+
+        case ZEROMV:
+
+            // Trap vectors that reach beyond the UMV borders.
+            // Note that ALL NEWMV, NEARESTMV, NEARMV and ZEROMV code drops through to this point
+            // because of the lack of break statements in the previous two cases.
+            if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) ||
+                ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max))
+                continue;
+
+            rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+            x->e_mbd.mbmi.mode = this_mode;
+            x->e_mbd.mbmi.mv.as_mv = mode_mv[this_mode];
+            x->e_mbd.block[0].bmi.mode = this_mode;
+            x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mbmi.mv.as_int;
+
+            distortion2 = get_inter_mbpred_error(x, cpi->fn_ptr.svf, cpi->fn_ptr.vf, (unsigned int *)(&sse));
+
+            this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
+
+            if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+            {
+                x->skip = 1;
+            }
+            else if (sse < x->encode_breakout)
+            {
+                // Check u and v to make sure skip is ok
+                int sse2 = 0;
+
+                sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
+
+                if (sse2 * 2 < x->encode_breakout)
+                    x->skip = 1;
+                else
+                    x->skip = 0;
+            }
+
+            break;
+        default:
+            break;
+        }
+
+        // Experimental debug code.
+        //all_rds[mode_index] = this_rd;
+
+        if (this_rd < best_rd || x->skip)
+        {
+            // Note index of best mode
+            best_mode_index = mode_index;
+
+            *returnrate = rate2;
+            *returndistortion = distortion2;
+            best_rd = this_rd;
+            vpx_memcpy(&best_mbmode, &x->e_mbd.mbmi, sizeof(MB_MODE_INFO));
+
+            if (this_mode == B_PRED || this_mode == SPLITMV)
+                for (i = 0; i < 16; i++)
+                {
+                    vpx_memcpy(&best_bmodes[i], &x->e_mbd.block[i].bmi, sizeof(B_MODE_INFO));
+                }
+            else
+            {
+                best_bmodes[0].mv = x->e_mbd.block[0].bmi.mv;
+            }
+
+            // Testing this mode gave rise to an improvement in the best error score. Lower the threshold a bit for next time.
+            cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+        }
+
+        // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+        else
+        {
+            cpi->rd_thresh_mult[mode_index] += 4;
+
+            if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+        }
+
+        if (x->skip)
+            break;
+    }
+
+    // Reduce the activation RD thresholds for the best choice mode
+    if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+    {
+        int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 3);
+
+        cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+        cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+    }
+
+    // Keep a record of best mode index for use in next loop
+    cpi->last_best_mode_index = best_mode_index;
+
+    if (best_mbmode.mode <= B_PRED)
+    {
+        x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+        vp8_pick_intra_mbuv_mode(x);
+        best_mbmode.uv_mode = x->e_mbd.mbmi.uv_mode;
+    }
+
+
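+    // Bin the distortion of the chosen mode into a histogram (bin width 128, clamped to the last bin).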
+    {
+        int this_rdbin = (*returndistortion >> 7);
+
+        if (this_rdbin >= 1024)
+        {
+            this_rdbin = 1023;
+        }
+
+        cpi->error_bins[this_rdbin] ++;
+    }
+
+
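+    // When the source frame is the alt-ref source, only a direct copy (ZEROMV on ALTREF) is wanted; override the chosen mode accordingly.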
+    if (cpi->is_src_frame_alt_ref && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+    {
+        best_mbmode.mode = ZEROMV;
+        best_mbmode.ref_frame = ALTREF_FRAME;
+        best_mbmode.mv.as_int = 0;
+        best_mbmode.uv_mode = 0;
+        best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+        best_mbmode.partitioning = 0;
+        best_mbmode.dc_diff = 0;
+
+        vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+        for (i = 0; i < 16; i++)
+        {
+            vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
+        }
+
+        x->e_mbd.mbmi.mv.as_int = 0;
+
+        return best_rd;
+    }
+
+
+    // macroblock modes
+    vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+    if (x->e_mbd.mbmi.mode == B_PRED || x->e_mbd.mbmi.mode == SPLITMV)
+        for (i = 0; i < 16; i++)
+        {
+            vpx_memcpy(&x->e_mbd.block[i].bmi, &best_bmodes[i], sizeof(B_MODE_INFO));
+
+        }
+    else
+    {
+        vp8_set_mbmode_and_mvs(x, x->e_mbd.mbmi.mode, &best_bmodes[0].mv.as_mv);
+    }
+
+    x->e_mbd.mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv;
+
+    return best_rd;
+}
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
new file mode 100644
index 0000000..fb28837
--- /dev/null
+++ b/vp8/encoder/pickinter.h
@@ -0,0 +1,20 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PICKINTER_H
+#define __INC_PICKINTER_H
+#include "vpx_ports/config.h"
+#include "onyxc_int.h"
+
+#define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
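+/* RD_ESTIMATE forms a rate-distortion cost: the rate R is scaled by the rate
+ * multiplier RM in Q8 with rounding, then the distortion D scaled by DM is
+ * added.  Worked example with hypothetical values: RM = 96, DM = 1, R = 500,
+ * D = 2000 gives ((128 + 500*96) >> 8) + 2000 = 188 + 2000 = 2188.
+ */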
+extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion);
+extern int vp8_pick_intra_mbuv_mode(MACROBLOCK *mb);
+extern int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+#endif
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
new file mode 100644
index 0000000..bbd7840
--- /dev/null
+++ b/vp8/encoder/picklpf.c
@@ -0,0 +1,435 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "onyxc_int.h"
+#include "onyx_int.h"
+#include "quantize.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_scale/yv12extend.h"
+#include "vpx_scale/vpxscale.h"
+#include "alloccommon.h"
+
+extern void vp8_loop_filter_frame(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val);
+extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm,    MACROBLOCKD *mbd,  int filt_val, int sharpness_lvl);
+extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd);
+#if HAVE_ARMV7
+extern void vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc);
+#endif
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x) (x)
+#else
+#define IF_RTCD(x) NULL
+#endif
+
+extern void
+(*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
+                                   YV12_BUFFER_CONFIG *dst_ybc,
+                                   int Fraction);
+void
+vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction)
+{
+    unsigned char *src_y, *dst_y;
+    int yheight;
+    int ystride;
+    int border;
+    int yoffset;
+    int linestocopy;
+
+    border   = src_ybc->border;
+    yheight  = src_ybc->y_height;
+    ystride  = src_ybc->y_stride;
+
+    linestocopy = (yheight >> (Fraction + 4));
+
+    if (linestocopy < 1)
+        linestocopy = 1;
+
+    linestocopy <<= 4;
+
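+    // The copy starts 8 rows above the (macroblock-aligned) vertical mid point
+    // and covers linestocopy + 16 rows: a band around the centre of the frame.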
+    yoffset  = ystride * ((yheight >> 5) * 16 - 8);
+    src_y = src_ybc->y_buffer + yoffset;
+    dst_y = dst_ybc->y_buffer + yoffset;
+
+    vpx_memcpy(dst_y, src_y, ystride *(linestocopy + 16));
+}
+
+static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int Fraction, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    int i, j;
+    int Total = 0;
+    int srcoffset, dstoffset;
+    unsigned char *src = source->y_buffer;
+    unsigned char *dst = dest->y_buffer;
+
+    int linestocopy = (source->y_height >> (Fraction + 4));
+    (void)rtcd;
+
+    if (linestocopy < 1)
+        linestocopy = 1;
+
+    linestocopy <<= 4;
+
+
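+    // Offset to the central band of the frame; this lies within the region
+    // saved and restored by vp8_yv12_copy_partial_frame().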
+    srcoffset = source->y_stride   * (dest->y_height >> 5) * 16;
+    dstoffset = dest->y_stride     * (dest->y_height >> 5) * 16;
+
+    src += srcoffset;
+    dst += dstoffset;
+
+    // Loop through the Y-plane band, summing the squared differences between the raw and reconstructed data
+    for (i = 0; i < linestocopy; i += 16)
+    {
+        for (j = 0; j < source->y_width; j += 16)
+        {
+            unsigned int sse;
+            Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+        }
+
+        src += 16 * source->y_stride;
+        dst += 16 * dest->y_stride;
+    }
+
+    return Total;
+}
+
+extern void vp8_loop_filter_partial_frame
+(
+    VP8_COMMON *cm,
+    MACROBLOCKD *mbd,
+    int default_filt_lvl,
+    int sharpness_lvl,
+    int Fraction
+);
+
+// Enforce a minimum filter level based upon baseline Q
+static int get_min_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+    int min_filter_level;
+
+    if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame)
+        min_filter_level = 0;
+    else
+    {
+        if (base_qindex <= 6)
+            min_filter_level = 0;
+        else if (base_qindex <= 16)
+            min_filter_level = 1;
+        else
+            min_filter_level = (base_qindex / 8);
+    }
+
+    return min_filter_level;
+}
+
+// Enforce a maximum filter level based upon baseline Q
+static int get_max_filter_level(VP8_COMP *cpi, int base_qindex)
+{
+    // PGW August 2006: The highest filter values are almost always a bad idea.
+
+    // jbb chg: 20100118 - no longer true with the overquant changes; allow high
+    // values when a lot of intra is coming in.
+    int max_filter_level = MAX_LOOP_FILTER;//* 3 / 4;
+
+    if (cpi->section_intra_rating > 8)
+        max_filter_level = MAX_LOOP_FILTER * 3 / 4;
+
+    (void) cpi;
+    (void) base_qindex;
+
+    return max_filter_level;
+}
+
+void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    int best_err = 0;
+    int filt_err = 0;
+    int min_filter_level = 0;
+    int max_filter_level = MAX_LOOP_FILTER * 3 / 4;   // PGW August 2006: Highest filter values almost always a bad idea
+    int filt_val;
+    int best_filt_val = cm->filter_level;
+
+    //  Make a copy of the unfiltered / processed recon buffer
+    //vp8_yv12_copy_frame_ptr( cm->frame_to_show, &cpi->last_frame_uf  );
+    vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
+
+    if (cm->frame_type == KEY_FRAME)
+        cm->sharpness_level = 0;
+    else
+        cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    // Enforce a minimum filter level based upon Q
+    min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+    max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+    // Start the search at the previous frame filter level unless it is now out of range.
+    if (cm->filter_level < min_filter_level)
+        cm->filter_level = min_filter_level;
+    else if (cm->filter_level > max_filter_level)
+        cm->filter_level = max_filter_level;
+
+    filt_val = cm->filter_level;
+    best_filt_val = filt_val;
+
+    // Get the err using the previous frame's filter value.
+    vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0  , 3);
+    cm->last_frame_type = cm->frame_type;
+    cm->last_filter_type = cm->filter_type;
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    best_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance));
+
+    //  Re-instate the unfiltered frame
+    vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
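+    // Step down by 1, or by 2 once the filter value is above 10.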
+    filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+
+    // Search lower filter levels
+    while (filt_val >= min_filter_level)
+    {
+        // Apply the loop filter
+        vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0, 3);
+        cm->last_frame_type = cm->frame_type;
+        cm->last_filter_type = cm->filter_type;
+        cm->last_sharpness_level = cm->sharpness_level;
+
+        // Get the err for filtered frame
+        filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance));
+
+
+        //  Re-instate the unfiltered frame
+        vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+
+        // Update the best case record or exit loop.
+        if (filt_err < best_err)
+        {
+            best_err = filt_err;
+            best_filt_val = filt_val;
+        }
+        else
+            break;
+
+        // Adjust filter level
+        filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
+    }
+
+    // Search up (note that we have already done filt_val = cm->filter_level)
+    filt_val = cm->filter_level + (1 + ((filt_val > 10) ? 1 : 0));
+
+    if (best_filt_val == cm->filter_level)
+    {
+        // Resist raising filter level for very small gains
+        best_err -= (best_err >> 10);
+
+        while (filt_val < max_filter_level)
+        {
+            // Apply the loop filter
+            vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val, 0, 3);
+            cm->last_frame_type = cm->frame_type;
+            cm->last_filter_type = cm->filter_type;
+            cm->last_sharpness_level = cm->sharpness_level;
+
+            // Get the err for filtered frame
+            filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance));
+
+            //  Re-instate the unfiltered frame
+            vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+
+            // Update the best case record or exit loop.
+            if (filt_err < best_err)
+            {
+                // Do not raise filter level if improvement is < 1 part in 4096
+                best_err = filt_err - (filt_err >> 10);
+
+                best_filt_val = filt_val;
+            }
+            else
+                break;
+
+            // Adjust filter level
+            filt_val += (1 + ((filt_val > 10) ? 1 : 0));
+        }
+    }
+
+    cm->filter_level = best_filt_val;
+
+    if (cm->filter_level < min_filter_level)
+        cm->filter_level = min_filter_level;
+
+    if (cm->filter_level > max_filter_level)
+        cm->filter_level = max_filter_level;
+}
+
+// Stub function for now Alt LF not used
+void vp8cx_set_alt_lf_level(VP8_COMP *cpi, int filt_val)
+{
+    MACROBLOCKD *mbd = &cpi->mb.e_mbd;
+    (void) filt_val;
+
+    mbd->segment_feature_data[MB_LVL_ALT_LF][0] = cpi->segment_feature_data[MB_LVL_ALT_LF][0];
+    mbd->segment_feature_data[MB_LVL_ALT_LF][1] = cpi->segment_feature_data[MB_LVL_ALT_LF][1];
+    mbd->segment_feature_data[MB_LVL_ALT_LF][2] = cpi->segment_feature_data[MB_LVL_ALT_LF][2];
+    mbd->segment_feature_data[MB_LVL_ALT_LF][3] = cpi->segment_feature_data[MB_LVL_ALT_LF][3];
+}
+
+void vp8cx_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi)
+{
+    VP8_COMMON *cm = &cpi->common;
+
+    int best_err = 0;
+    int filt_err = 0;
+    int min_filter_level;
+    int max_filter_level;
+    int prediction_difference = (int)(100 * abs((int)(cpi->last_auto_filter_prediction_error - cpi->prediction_error)) / (1 + cpi->prediction_error));
+
+    int filter_step;
+    int filt_high = 0;
+    int filt_mid = cm->filter_level;      // Start search at previous frame filter level
+    int filt_low = 0;
+    int filt_best;
+    int filt_direction = 0;
+
+    int Bias = 0;                       // Bias against raising loop filter and in favour of lowering it
+
+    //  Make a copy of the unfiltered / processed recon buffer
+#if HAVE_ARMV7
+    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(cm->frame_to_show, &cpi->last_frame_uf);
+#else
+    vp8_yv12_copy_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf);
+#endif
+
+    if (cm->frame_type == KEY_FRAME)
+        cm->sharpness_level = 0;
+    else
+        cm->sharpness_level = cpi->oxcf.Sharpness;
+
+    // Enforce a minimum filter level based upon Q
+    min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
+    max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
+
+    // Start the search at the previous frame filter level unless it is now out of range.
+    filt_mid = cm->filter_level;
+
+    if (filt_mid < min_filter_level)
+        filt_mid = min_filter_level;
+    else if (filt_mid > max_filter_level)
+        filt_mid = max_filter_level;
+
+    // Define the initial step size
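+    //  (it is halved whenever the mid point remains best, giving a coarse-to-fine search)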
+    filter_step = (filt_mid < 16) ? 4 : filt_mid / 4;
+
+    // Get baseline error score
+    vp8cx_set_alt_lf_level(cpi, filt_mid);
+    vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_mid, 0);
+    cm->last_frame_type = cm->frame_type;
+    cm->last_filter_type = cm->filter_type;
+    cm->last_sharpness_level = cm->sharpness_level;
+
+    best_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+    filt_best = filt_mid;
+
+    //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+    vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+    vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+
+    while (filter_step > 0)
+    {
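+        // The bias favours lower filter levels: a lower level is accepted if it comes
+        // within Bias of the best error, while a higher level must beat it by more than Bias.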
+        Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; //PGW change 12/12/06 for small images
+
+        // jbb chg: 20100118 - in sections with lots of new material coming in don't bias as much to a low filter value
+        if (cpi->section_intra_rating < 20)
+            Bias = Bias * cpi->section_intra_rating / 20;
+
+        filt_high = ((filt_mid + filter_step) > max_filter_level) ? max_filter_level : (filt_mid + filter_step);
+        filt_low = ((filt_mid - filter_step) < min_filter_level) ? min_filter_level : (filt_mid - filter_step);
+
+        if ((filt_direction <= 0) && (filt_low != filt_mid))
+        {
+            // Get Low filter error score
+            vp8cx_set_alt_lf_level(cpi, filt_low);
+            vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_low, 0);
+            cm->last_frame_type = cm->frame_type;
+            cm->last_filter_type = cm->filter_type;
+            cm->last_sharpness_level = cm->sharpness_level;
+
+            filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+
+            //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+            vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+            vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+
+            // If value is close to the best so far then bias towards a lower loop filter value.
+            if ((filt_err - Bias) < best_err)
+            {
+                // Was it actually better than the previous best?
+                if (filt_err < best_err)
+                    best_err = filt_err;
+
+                filt_best = filt_low;
+            }
+        }
+
+        // Now look at filt_high
+        if ((filt_direction >= 0) && (filt_high != filt_mid))
+        {
+            vp8cx_set_alt_lf_level(cpi, filt_high);
+            vp8_loop_filter_frame_yonly(cm, &cpi->mb.e_mbd, filt_high, 0);
+            cm->last_frame_type = cm->frame_type;
+            cm->last_filter_type = cm->filter_type;
+            cm->last_sharpness_level = cm->sharpness_level;
+
+            filt_err = vp8_calc_ss_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance));
+
+            //  Re-instate the unfiltered frame
+#if HAVE_ARMV7
+            vp8_yv12_copy_frame_yonly_no_extend_frame_borders_neon(&cpi->last_frame_uf, cm->frame_to_show);
+#else
+            vp8_yv12_copy_frame_yonly_ptr(&cpi->last_frame_uf, cm->frame_to_show);
+#endif
+
+            // Was it better than the previous best?
+            if (filt_err < (best_err - Bias))
+            {
+                best_err = filt_err;
+                filt_best = filt_high;
+            }
+        }
+
+        // Halve the step distance if the best filter value was the same as last time
+        if (filt_best == filt_mid)
+        {
+            filter_step = filter_step / 2;
+            filt_direction = 0;
+        }
+        else
+        {
+            filt_direction = (filt_best < filt_mid) ? -1 : 1;
+            filt_mid = filt_best;
+        }
+    }
+
+    cm->filter_level = filt_best;
+    cpi->last_auto_filt_val = filt_best;
+    cpi->last_auto_filt_q  = cm->base_qindex;
+
+    cpi->last_auto_filter_prediction_error = cpi->prediction_error;
+    cpi->frames_since_auto_filter = 0;
+}
diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c
new file mode 100644
index 0000000..f99277f
--- /dev/null
+++ b/vp8/encoder/ppc/csystemdependent.c
@@ -0,0 +1,168 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *coeff, short *dqcoeff);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+void (*short_walsh4x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+
+unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
+
+// c imports
+extern int block_error_c(short *coeff, short *dqcoeff);
+extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
+
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern void short_fdct4x4_c(short *input, short *output, int pitch);
+extern void short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction sad16x16_c;
+extern SADFunction sad16x8_c;
+extern SADFunction sad8x16_c;
+extern SADFunction sad8x8_c;
+extern SADFunction sad4x4_c;
+
+extern variance_function variance16x16_c;
+extern variance_function variance8x16_c;
+extern variance_function variance16x8_c;
+extern variance_function variance8x8_c;
+extern variance_function variance4x4_c;
+extern variance_function mse16x16_c;
+
+extern sub_pixel_variance_function sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
+
+// ppc
+extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
+
+extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
+
+extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+
+extern SADFunction vp8_sad16x16_ppc;
+extern SADFunction vp8_sad16x8_ppc;
+extern SADFunction vp8_sad8x16_ppc;
+extern SADFunction vp8_sad8x8_ppc;
+extern SADFunction vp8_sad4x4_ppc;
+
+extern variance_function vp8_variance16x16_ppc;
+extern variance_function vp8_variance8x16_ppc;
+extern variance_function vp8_variance16x8_ppc;
+extern variance_function vp8_variance8x8_ppc;
+extern variance_function vp8_variance4x4_ppc;
+extern variance_function vp8_mse16x16_ppc;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
+
+extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+
+void vp8_cmachine_specific_config(void)
+{
+    // Mixture of C and AltiVec implementations:
+    vp8_mbuverror               = vp8_mbuverror_c;
+    vp8_fast_quantize_b           = vp8_fast_quantize_b_c;
+    vp8_short_fdct4x4            = vp8_short_fdct4x4_ppc;
+    vp8_short_fdct8x4            = vp8_short_fdct8x4_ppc;
+    vp8_fast_fdct4x4             = vp8_short_fdct4x4_ppc;
+    vp8_fast_fdct8x4             = vp8_short_fdct8x4_ppc;
+    short_walsh4x4               = vp8_short_walsh4x4_c;
+
+    vp8_variance4x4             = vp8_variance4x4_ppc;
+    vp8_variance8x8             = vp8_variance8x8_ppc;
+    vp8_variance8x16            = vp8_variance8x16_ppc;
+    vp8_variance16x8            = vp8_variance16x8_ppc;
+    vp8_variance16x16           = vp8_variance16x16_ppc;
+    vp8_mse16x16                = vp8_mse16x16_ppc;
+
+    vp8_sub_pixel_variance4x4     = vp8_sub_pixel_variance4x4_ppc;
+    vp8_sub_pixel_variance8x8     = vp8_sub_pixel_variance8x8_ppc;
+    vp8_sub_pixel_variance8x16    = vp8_sub_pixel_variance8x16_ppc;
+    vp8_sub_pixel_variance16x8    = vp8_sub_pixel_variance16x8_ppc;
+    vp8_sub_pixel_variance16x16   = vp8_sub_pixel_variance16x16_ppc;
+
+    vp8_get_mb_ss                 = vp8_get_mb_ss_c;
+    vp8_get16x16pred_error       = vp8_get16x16pred_error_c;
+    vp8_get8x8var               = vp8_get8x8var_ppc;
+    vp8_get16x16var             = vp8_get16x16var_ppc;
+    vp8_get4x4sse_cs            = vp8_get4x4sse_cs_c;
+
+    vp8_sad16x16                = vp8_sad16x16_ppc;
+    vp8_sad16x8                 = vp8_sad16x8_ppc;
+    vp8_sad8x16                 = vp8_sad8x16_ppc;
+    vp8_sad8x8                  = vp8_sad8x8_ppc;
+    vp8_sad4x4                  = vp8_sad4x4_ppc;
+
+    vp8_block_error              = vp8_block_error_ppc;
+    vp8_mbblock_error            = vp8_mbblock_error_c;
+
+    vp8_subtract_b               = vp8_subtract_b_c;
+    vp8_subtract_mby             = vp8_subtract_mby_ppc;
+    vp8_subtract_mbuv            = vp8_subtract_mbuv_ppc;
+}
diff --git a/vp8/encoder/ppc/encodemb_altivec.asm b/vp8/encoder/ppc/encodemb_altivec.asm
new file mode 100644
index 0000000..e0e976d
--- /dev/null
+++ b/vp8/encoder/ppc/encodemb_altivec.asm
@@ -0,0 +1,152 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_subtract_mbuv_ppc
+    .globl vp8_subtract_mby_ppc
+
+;# r3 short *diff
+;# r4 unsigned char *usrc
+;# r5 unsigned char *vsrc
+;# r6 unsigned char *pred
+;# r7 int stride
+vp8_subtract_mbuv_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf000
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r9, 256
+    add     r3, r3, r9
+    add     r3, r3, r9
+    add     r6, r6, r9
+
+    li      r10, 16
+    li      r9,  4
+    mtctr   r9
+
+    vspltisw v0, 0
+
+mbu_loop:
+    lvsl    v5, 0, r4           ;# permute vector for alignment
+    lvx     v1, 0, r4           ;# src
+    lvx     v2, 0, r6           ;# pred
+
+    add     r4, r4, r7
+    addi    r6, r6, 16
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    lvsl    v5, 0, r4           ;# permute vector for alignment
+    lvx     v1, 0, r4           ;# src
+
+    add     r4, r4, r7
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrglb  v4, v0, v2          ;# unpack high pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mbu_loop
+
+    mtctr   r9
+
+mbv_loop:
+    lvsl    v5, 0, r5           ;# permute vector for alignment
+    lvx     v1, 0, r5           ;# src
+    lvx     v2, 0, r6           ;# pred
+
+    add     r5, r5, r7
+    addi    r6, r6, 16
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    lvsl    v5, 0, r5           ;# permute vector for alignment
+    lvx     v1, 0, r5           ;# src
+
+    add     r5, r5, r7
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vperm   v1, v1, v0, v5
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrglb  v4, v0, v2          ;# unpack high pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mbv_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+;# r3 short *diff
+;# r4 unsigned char *src
+;# r5 unsigned char *pred
+;# r6 int stride
+vp8_subtract_mby_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf800
+    mtspr   256, r12            ;# set VRSAVE
+
+    li      r10, 16
+    mtctr   r10
+
+    vspltisw v0, 0
+
+mby_loop:
+    lvx     v1, 0, r4           ;# src
+    lvx     v2, 0, r5           ;# pred
+
+    add     r4, r4, r6
+    addi    r5, r5, 16
+
+    vmrghb  v3, v0, v1          ;# unpack high src  to short
+    vmrghb  v4, v0, v2          ;# unpack high pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, 0, r3           ;# store out diff
+
+    vmrglb  v3, v0, v1          ;# unpack low src  to short
+    vmrglb  v4, v0, v2          ;# unpack low pred to short
+
+    vsubshs v3, v3, v4
+
+    stvx    v3, r10, r3         ;# store out diff
+
+    addi    r3, r3, 32
+
+    bdnz    mby_loop
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm
new file mode 100644
index 0000000..eaab14c
--- /dev/null
+++ b/vp8/encoder/ppc/fdct_altivec.asm
@@ -0,0 +1,204 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_short_fdct4x4_ppc
+    .globl vp8_short_fdct8x4_ppc
+
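+;# Load vector \V from the constant table at \LABEL (offset \OFF), using \R0 and \R1 as scratch registers.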
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+;# Forward and inverse DCTs are nearly identical; the only differences are
+;#   in normalization (fwd is twice unitary, inv is half unitary)
+;#   and that they are, of course, transposes of each other.
+;#
+;#   The following three macros accomplish most of the implementation and
+;#   are used only by ppc_idct.c and ppc_fdct.c.
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    li      r6, 16
+
+    load_c v0, dct_tab, 0, r9, r10
+    lvx     v1,   r6, r10
+    addi    r10, r10, 32
+    lvx     v2,    0, r10
+    lvx     v3,   r6, r10
+
+    load_c v4, ppc_dctperm_tab,  0, r9, r10
+    load_c v5, ppc_dctperm_tab, r6, r9, r10
+
+    load_c v6, round_tab, 0, r10, r9
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+;# Do horiz xf on two rows of coeffs  v8 = a0 a1 a2 a3  b0 b1 b2 b3.
+;#   a/A are the even rows 0,2   b/B are the odd rows 1,3
+;#   For fwd transform, indices are horizontal positions, then frequencies.
+;#   For inverse transform, frequencies then positions.
+;#   The two resulting  A0..A3  B0..B3  are later combined
+;#   and vertically transformed.
+
+.macro two_rows_horiz Dst
+    vperm   v9, v8, v8, v4      ;# v9 = a2 a3 a0 a1  b2 b3 b0 b1
+
+    vmsumshm v10, v0, v8, v6
+    vmsumshm v10, v1, v9, v10
+    vsraw   v10, v10, v7        ;# v10 = A0 A1  B0 B1
+
+    vmsumshm v11, v2, v8, v6
+    vmsumshm v11, v3, v9, v11
+    vsraw   v11, v11, v7        ;# v11 = A2 A3  B2 B3
+
+    vpkuwum v10, v10, v11       ;# v10  = A0 A1  B0 B1  A2 A3  B2 B3
+    vperm   \Dst, v10, v10, v5  ;# Dest = A0 B0  A1 B1  A2 B2  A3 B3
+.endm
+
+;# Vertical xf on two rows. DCT values in comments are for inverse transform;
+;#   forward transform uses transpose.
+
+.macro two_rows_vert Ceven, Codd
+    vspltw  v8, \Ceven, 0       ;# v8 = c00 c10  or  c02 c12 four times
+    vspltw  v9, \Codd,  0       ;# v9 = c20 c30  or  c22 c32 ""
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v10, v8, v7
+
+    vspltw  v8, \Codd,  1       ;# v8 = c01 c11  or  c03 c13
+    vspltw  v9, \Ceven, 1       ;# v9 = c21 c31  or  c23 c33
+    vmsumshm v8, v8, v12, v6
+    vmsumshm v8, v9, v13, v8
+    vsraw   v8, v8, v7
+
+    vpkuwum v8, v10, v8         ;# v8 = rows 0,1  or 2,3
+.endm
+
+.macro two_rows_h Dest
+    stw     r0,  0(r8)
+    lwz     r0,  4(r3)
+    stw     r0,  4(r8)
+    lwzux   r0, r3,r5
+    stw     r0,  8(r8)
+    lwz     r0,  4(r3)
+    stw     r0, 12(r8)
+    lvx     v8,  0,r8
+    two_rows_horiz \Dest
+.endm
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct4x4_ppc:
+
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 short *input
+;# r4 short *output
+;# r5 int pitch
+vp8_short_fdct8x4_ppc:
+    prologue
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8,  r1, 0
+    addi    r10, r3, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    ;# Next block
+    addi    r3, r10, 8
+    addi    r4, r4, 32
+    lvx     v6, 0, r9           ;# v6 = Hround
+
+    vspltisw v7, 14             ;# == 14, fits in 5 signed bits
+    addi    r8, r1, 0
+
+    lwz     r0, 0(r3)
+    two_rows_h v12                ;# v12 = H00 H10  H01 H11  H02 H12  H03 H13
+
+    lwzux   r0, r3, r5
+    two_rows_h v13                ;# v13 = H20 H30  H21 H31  H22 H32  H23 H33
+
+    lvx     v6, r6, r9          ;# v6 = Vround
+    vspltisw v7, -16            ;# == 16 == -16, only low 5 bits matter
+
+    two_rows_vert v0, v1
+    stvx    v8, 0, r4
+    two_rows_vert v2, v3
+    stvx    v8, r6, r4
+
+    epilogue
+
+    blr
+
+    .data
+    .align 4
+ppc_dctperm_tab:
+    .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
+    .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
+
+    .align 4
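+;# The Q15 constants below appear to be cos(pi/4)*2^15 = 23170,
+;#  cos(pi/8)*2^15 = 30274 and sin(pi/8)*2^15 = 12540.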
+dct_tab:
+    .short  23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
+    .short  23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
+
+    .short  23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
+    .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
+
+    .align 4
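+;# Rounding terms (1 << (shift-1)) for the >>14 horizontal and >>16 vertical passes.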
+round_tab:
+    .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
+    .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
diff --git a/vp8/encoder/ppc/rdopt_altivec.asm b/vp8/encoder/ppc/rdopt_altivec.asm
new file mode 100644
index 0000000..917bfe0
--- /dev/null
+++ b/vp8/encoder/ppc/rdopt_altivec.asm
@@ -0,0 +1,50 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_block_error_ppc
+
+    .align 2
+;# r3 short *Coeff
+;# r4 short *dqcoeff
+vp8_block_error_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf800
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    stw     r5, 12(r1)          ;# transfer dc to vector register
+
+    lvx     v0, 0, r3           ;# Coeff
+    lvx     v1, 0, r4           ;# dqcoeff
+
+    li      r10, 16
+
+    vspltisw v3, 0
+
+    vsubshs v0, v0, v1
+
+    vmsumshm v2, v0, v0, v3     ;# multiply differences
+
+    lvx     v0, r10, r3         ;# Coeff
+    lvx     v1, r10, r4         ;# dqcoeff
+
+    vsubshs v0, v0, v1
+
+    vmsumshm v1, v0, v0, v2     ;# multiply differences
+    vsumsws v1, v1, v3          ;# sum up
+
+    stvx    v1, 0, r1
+    lwz     r3, 12(r1)          ;# return value
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
diff --git a/vp8/encoder/ppc/sad_altivec.asm b/vp8/encoder/ppc/sad_altivec.asm
new file mode 100644
index 0000000..1102ccf
--- /dev/null
+++ b/vp8/encoder/ppc/sad_altivec.asm
@@ -0,0 +1,276 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_sad16x16_ppc
+    .globl vp8_sad16x8_ppc
+    .globl vp8_sad8x16_ppc
+    .globl vp8_sad8x8_ppc
+    .globl vp8_sad4x4_ppc
+
+.macro load_aligned_16 V R O
+    lvsl    v3,  0, \R          ;# permute vector for alignment
+
+    lvx     v1,  0, \R
+    lvx     v2, \O, \R
+
+    vperm   \V, v1, v2, v3
+.endm
+
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    li      r10, 16             ;# load offset and loop counter
+
+    vspltisw v8, 0              ;# zero out total to start
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+.macro SAD_16
+    ;# v6 = abs (v4 - v5)
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+    vor     v6, v6, v7
+
+    ;# v8 += abs (v4 - v5)
+    vsum4ubs v8, v6, v8
+.endm
+
+.macro sad_16_loop loop_label
+    lvsl    v3,  0, r5          ;# only needs to be done once per block
+
+    ;# preload a line of data before getting into the loop
+    lvx     v4, 0, r3
+    lvx     v1,  0, r5
+    lvx     v2, r10, r5
+
+    add     r5, r5, r6
+    add     r3, r3, r4
+
+    vperm   v5, v1, v2, v3
+
+    .align 4
+\loop_label:
+    ;# compute difference on first row
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+
+    ;# load up next set of data
+    lvx     v9, 0, r3
+    lvx     v1,  0, r5
+    lvx     v2, r10, r5
+
+    ;# perform abs() of difference
+    vor     v6, v6, v7
+    add     r3, r3, r4
+
+    ;# add to the running tally
+    vsum4ubs v8, v6, v8
+
+    ;# now onto the next line
+    vperm   v5, v1, v2, v3
+    add     r5, r5, r6
+    lvx     v4, 0, r3
+
+    ;# compute difference on second row
+    vsububs v6, v9, v5
+    lvx     v1,  0, r5
+    vsububs v7, v5, v9
+    lvx     v2, r10, r5
+    vor     v6, v6, v7
+    add     r3, r3, r4
+    vsum4ubs v8, v6, v8
+    vperm   v5, v1, v2, v3
+    add     r5, r5, r6
+
+    bdnz    \loop_label
+
+    vspltisw v7, 0
+
+    vsumsws v8, v8, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+.endm
+
+.macro sad_8_loop loop_label
+    .align 4
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v7, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    vmrghb  v4, v4, v6
+    vmrghb  v5, v5, v7
+
+    SAD_16
+
+    bdnz    \loop_label
+
+    vspltisw v7, 0
+
+    vsumsws v8, v8, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad16x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    sad_16_loop sad16x16_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad16x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    sad_16_loop sad16x8_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad8x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    sad_8_loop sad8x16_loop
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad8x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    sad_8_loop sad8x8_loop
+
+    epilogue
+
+    blr
+
+.macro transfer_4x4 I P
+    lwz     r0, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r7, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r8, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r9, 0(\I)
+
+    stw     r0,  0(r1)
+    stw     r7,  4(r1)
+    stw     r8,  8(r1)
+    stw     r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  src_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  ref_stride
+;#
+;# r3 return value
+vp8_sad4x4_ppc:
+
+    prologue
+
+    transfer_4x4 r3, r4
+    lvx     v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx     v5, 0, r1
+
+    vspltisw v8, 0              ;# zero out total to start
+
+    ;# v6 = abs (v4 - v5)
+    vsububs v6, v4, v5
+    vsububs v7, v5, v4
+    vor     v6, v6, v7
+
+    ;# v8 += abs (v4 - v5)
+    vsum4ubs v7, v6, v8
+    vsumsws v7, v7, v8
+
+    stvx    v7, 0, r1
+    lwz     r3, 12(r1)
+
+    epilogue
+
+    blr
diff --git a/vp8/encoder/ppc/variance_altivec.asm b/vp8/encoder/ppc/variance_altivec.asm
new file mode 100644
index 0000000..952bf72
--- /dev/null
+++ b/vp8/encoder/ppc/variance_altivec.asm
@@ -0,0 +1,374 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_get8x8var_ppc
+    .globl vp8_get16x16var_ppc
+    .globl vp8_mse16x16_ppc
+    .globl vp8_variance16x16_ppc
+    .globl vp8_variance16x8_ppc
+    .globl vp8_variance8x16_ppc
+    .globl vp8_variance8x8_ppc
+    .globl vp8_variance4x4_ppc
+
+.macro load_aligned_16 V R O
+    lvsl    v3,  0, \R          ;# permute vector for alignment
+
+    lvx     v1,  0, \R
+    lvx     v2, \O, \R
+
+    vperm   \V, v1, v2, v3
+.endm
+
+.macro prologue
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffc0
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    li      r10, 16             ;# load offset and loop counter
+
+    vspltisw v7, 0              ;# zero for merging
+    vspltisw v8, 0              ;# zero out total to start
+    vspltisw v9, 0              ;# zero out total for dif^2
+.endm
+
+.macro epilogue
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+.endm
+
+.macro compute_sum_sse
+    ;# Compute the sum first.  Unpack so that a signed subtract
+    ;#  can be used; only a halfword signed subtract is
+    ;#  available.  Do high, then low.
+    vmrghb  v2, v7, v4
+    vmrghb  v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    vmrglb  v2, v7, v4
+    vmrglb  v3, v7, v5
+    vsubshs v2, v2, v3
+    vsum4shs v8, v2, v8
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor     v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+.endm
+
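+;# variance = sse - ((sum*sum) >> DS), where DS = log2(pixel count):
+;#  8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8.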
+.macro variance_16 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    compute_sum_sse
+
+    bdnz    \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+.if \store_sum
+    stw     r3, 0(r8)           ;# sum
+.endif
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+.macro variance_8 DS loop_label store_sum
+\loop_label:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v6, r3, r10
+    load_aligned_16 v0, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    vmrghb  v4, v4, v6
+    vmrghb  v5, v5, v0
+
+    compute_sum_sse
+
+    bdnz    \loop_label
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+.if \store_sum
+    stw     r3, 0(r8)           ;# sum
+.endif
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get8x8var_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    variance_8 6, get8x8var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *SSE
+;# r8 int *Sum
+;#
+;# r3 return value
+vp8_get16x16var_ppc:
+
+    prologue
+
+    mtctr   r10
+
+    variance_16 8, get16x16var_loop, 1
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_mse16x16_ppc:
+    prologue
+
+    mtctr   r10
+
+mse16x16_loop:
+    ;# only one of the inputs should need to be aligned.
+    load_aligned_16 v4, r3, r10
+    load_aligned_16 v5, r5, r10
+
+    ;# move onto the next line
+    add     r3, r3, r4
+    add     r5, r5, r6
+
+    ;# Now compute sse.
+    vsububs v2, v4, v5
+    vsububs v3, v5, v4
+    vor     v2, v2, v3
+
+    vmsumubm v9, v2, v2, v9
+
+    bdnz    mse16x16_loop
+
+    vsumsws v9, v9, v7
+
+    stvx    v9, 0, r1
+    lwz     r3, 12(r1)
+
+    stw     r3, 0(r7)           ;# sse
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x16_ppc:
+
+    prologue
+
+    mtctr   r10
+
+    variance_16 8, variance16x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance16x8_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    variance_16 7, variance16x8_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x16_ppc:
+
+    prologue
+
+    li      r9, 8
+    mtctr   r9
+
+    variance_8 7, variance8x16_loop, 0
+
+    epilogue
+
+    blr
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance8x8_ppc:
+
+    prologue
+
+    li      r9, 4
+    mtctr   r9
+
+    variance_8 6, variance8x8_loop, 0
+
+    epilogue
+
+    blr
+
+.macro transfer_4x4 I P
+    lwz     r0, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r10,0(\I)
+    add     \I, \I, \P
+
+    lwz     r8, 0(\I)
+    add     \I, \I, \P
+
+    lwz     r9, 0(\I)
+
+    stw     r0,  0(r1)
+    stw     r10, 4(r1)
+    stw     r8,  8(r1)
+    stw     r9, 12(r1)
+.endm
+
+    .align 2
+;# r3 unsigned char *src_ptr
+;# r4 int  source_stride
+;# r5 unsigned char *ref_ptr
+;# r6 int  recon_stride
+;# r7 unsigned int *sse
+;#
+;# r3 return value
+vp8_variance4x4_ppc:
+
+    prologue
+
+    transfer_4x4 r3, r4
+    lvx     v4, 0, r1
+
+    transfer_4x4 r5, r6
+    lvx     v5, 0, r1
+
+    compute_sum_sse
+
+    vsumsws v8, v8, v7
+    vsumsws v9, v9, v7
+
+    stvx    v8, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    v9, 0, r1
+    lwz     r4, 12(r1)
+
+    stw     r4, 0(r7)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, 4           ;# (sum*sum) >> 4
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> 4)
+
+    epilogue
+
+    blr
diff --git a/vp8/encoder/ppc/variance_subpixel_altivec.asm b/vp8/encoder/ppc/variance_subpixel_altivec.asm
new file mode 100644
index 0000000..148a8d2
--- /dev/null
+++ b/vp8/encoder/ppc/variance_subpixel_altivec.asm
@@ -0,0 +1,864 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+    .globl vp8_sub_pixel_variance4x4_ppc
+    .globl vp8_sub_pixel_variance8x8_ppc
+    .globl vp8_sub_pixel_variance8x16_ppc
+    .globl vp8_sub_pixel_variance16x8_ppc
+    .globl vp8_sub_pixel_variance16x16_ppc
+
+.macro load_c V, LABEL, OFF, R0, R1
+    lis     \R0, \LABEL@ha
+    la      \R1, \LABEL@l(\R0)
+    lvx     \V, \OFF, \R1
+.endm
+
+.macro load_vfilter V0, V1
+    load_c \V0, vfilter_b, r6, r12, r10
+
+    addi    r6,  r6, 16
+    lvx     \V1, r6, r10
+.endm
+
+.macro HProlog jump_label
+    ;# load up horizontal filter
+    slwi.   r5, r5, 4           ;# index into horizontal filter array
+
+    ;# index to the next set of vectors in the row.
+    li      r10, 16
+
+    ;# downshift by 7 ( divide by 128 ) at the end
+    vspltish v19, 7
+
+    ;# If there isn't any filtering to be done for the horizontal, then
+    ;#  just skip to the second pass.
+    beq     \jump_label
+
+    load_c v20, hfilter_b, r5, r12, r0
+
+    ;# setup constants
+    ;# v28 permutation value for output reordering
+    load_c v28, b_hperm_b, 0, r12, r0
+
+    ;# index to the next set of vectors in the row.
+    li      r12, 32
+
+    ;# rounding added in on the multiply
+    vspltisw v21, 8
+    vspltisw v18, 3
+    vslw    v18, v21, v18       ;# 0x00000040000000400000004000000040
+
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+.endm
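+
+;# Note on the constants above: the two bilinear taps in each filter
+;#  row sum to 128, so v18 carries a rounding bias of 64 (built as
+;#  8 << 3) and v19 the downshift of 7; together they compute
+;#  (f0*a + f1*b + 64) >> 7.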
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+
+.macro hfilter_8 V, hp, lp, increment_counter
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# input to filter is 9 bytes wide, output is 8 bytes.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+
+    vperm   v24, v21, v21, \hp  ;# v24 = 0123 1234 2345 3456
+    vperm   v25, v21, v21, \lp  ;# v25 = 4567 5678 6789 789A
+
+    vmsummbm v24, v20, v24, v18
+    vmsummbm v25, v20, v25, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+
+    vsrh    v24, v24, v19       ;# divide by 128
+
+    vpkuhus \V, v24, v24        ;# \V = scrambled 8-bit result
+.endm
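+
+;# Scalar sketch of hfilter_8 (illustrative only): with f0 and f1 the
+;#  two nonzero taps of the selected hfilter_b row (f0 + f1 == 128),
+;#      out[i] = (f0 * src[i] + f1 * src[i + 1] + 64) >> 7;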
+
+.macro vfilter_16 P0 P1
+    vmuleub v22, \P0, v20       ;# even bytes * first tap
+    vadduhm v22, v18, v22
+    vmuloub v23, \P0, v20
+    vadduhm v23, v18, v23
+
+    vmuleub v24, \P1, v21
+    vadduhm v22, v22, v24       ;# Re = evens, saturation unnecessary
+    vmuloub v25, \P1, v21
+    vadduhm v23, v23, v25       ;# Ro = odds
+
+    vsrh    v22, v22, v19       ;# divide by 128
+    vsrh    v23, v23, v19       ;# v22 v23 = evens, odds
+    vmrghh  \P0, v22, v23       ;# 16-bit result back in order
+    vmrglh  v23, v22, v23
+    vpkuhus \P0, \P0, v23       ;# P0 = 8-bit result
+.endm
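+
+;# Scalar sketch of vfilter_16 (illustrative only): with vertical taps
+;#  f0 + f1 == 128 applied across two rows,
+;#      out[i] = (f0 * row0[i] + f1 * row1[i] + 64) >> 7;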
+
+.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
+    ;# Compute the sum first.  Unpack so that a signed subtract
+    ;#  can be used.  Only a halfword signed subtract is
+    ;#  available.  Do high, then low.
+    vmrghb  \t1, \z0, \src
+    vmrghb  \t2, \z0, \ref
+    vsubshs \t1, \t1, \t2
+    vsum4shs \sum, \t1, \sum
+
+    vmrglb  \t1, \z0, \src
+    vmrglb  \t2, \z0, \ref
+    vsubshs \t1, \t1, \t2
+    vsum4shs \sum, \t1, \sum
+
+    ;# Now compute sse.
+    vsububs \t1, \src, \ref
+    vsububs \t2, \ref, \src
+    vor     \t1, \t1, \t2
+
+    vmsumubm \sse, \t1, \t1, \sse
+.endm
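+
+;# Scalar equivalent of compute_sum_sse (illustrative only):
+;#      diff = src[i] - ref[i];
+;#      sum += diff;
+;#      sse += diff * diff;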
+
+.macro variance_final sum, sse, z0, DS
+    vsumsws \sum, \sum, \z0
+    vsumsws \sse, \sse, \z0
+
+    stvx    \sum, 0, r1
+    lwz     r3, 12(r1)
+
+    stvx    \sse, 0, r1
+    lwz     r4, 12(r1)
+
+    stw     r4, 0(r9)           ;# sse
+
+    mullw   r3, r3, r3          ;# sum*sum
+    srawi   r3, r3, \DS         ;# (sum*sum) >> DS
+    subf    r3, r3, r4          ;# sse - ((sum*sum) >> DS)
+.endm
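+
+;# DS is log2 of the block's pixel count (4, 6, 7 or 8), so r3 returns
+;#  the usual variance, sse - (sum * sum) / N.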
+
+.macro compute_sum_sse_16 V, increment_counter
+    load_and_align_16  v16, r7, r8, \increment_counter
+    compute_sum_sse \V, v16, v18, v19, v20, v21, v23
+.endm
+
+.macro load_and_align_16 V, R, P, increment_counter
+    lvsl    v17,  0, \R         ;# permute vector for alignment
+
+    ;# the 16 output bytes can start at an unaligned address, so
+    ;#  they can span two source vectors.
+    lvx     v21,   0, \R
+    lvx     v22, r10, \R
+
+.if \increment_counter
+    add     \R, \R, \P
+.endif
+
+    vperm   \V, v21, v22, v17
+.endm
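+
+;# lvx ignores the low four bits of the address, so an unaligned row is
+;#  fetched as the two covering quadwords and stitched together with
+;#  the permute vector produced by lvsl.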
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance4x4_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xf830
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_4x4_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r12, r0
+    load_c v11, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0, v10, v11, 1
+    hfilter_8 v1, v10, v11, 1
+    hfilter_8 v2, v10, v11, 1
+    hfilter_8 v3, v10, v11, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to computing the sum and sse.
+    ;#  Otherwise load up and filter the additional line that is
+    ;#  needed for the vertical filter.
+    beq     compute_sum_sse_4x4_b
+
+    hfilter_8 v4, v10, v11, 0
+
+    b   second_pass_4x4_b
+
+second_pass_4x4_pre_copy_b:
+    slwi    r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0, r3, r4, 1
+    load_and_align_16 v1, r3, r4, 1
+    load_and_align_16 v2, r3, r4, 1
+    load_and_align_16 v3, r3, r4, 1
+    load_and_align_16 v4, r3, r4, 0
+
+second_pass_4x4_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18       ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+
+compute_sum_sse_4x4_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    load_and_align_16 v4, r7, r8, 1
+    load_and_align_16 v5, r7, r8, 1
+    load_and_align_16 v6, r7, r8, 1
+    load_and_align_16 v7, r7, r8, 1
+
+    vmrghb  v0, v0, v1
+    vmrghb  v1, v2, v3
+
+    vmrghb  v2, v4, v5
+    vmrghb  v3, v6, v7
+
+    load_c v10, b_hilo_b, 0, r12, r0
+
+    vperm   v0, v0, v1, v10
+    vperm   v1, v2, v3, v10
+
+    compute_sum_sse v0, v1, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 4
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xfff0
+    ori     r12, r12, 0xffff
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x8_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v10, b_0123_b, 0, r12, r0
+    load_c v11, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0, v10, v11, 1
+    hfilter_8 v1, v10, v11, 1
+    hfilter_8 v2, v10, v11, 1
+    hfilter_8 v3, v10, v11, 1
+    hfilter_8 v4, v10, v11, 1
+    hfilter_8 v5, v10, v11, 1
+    hfilter_8 v6, v10, v11, 1
+    hfilter_8 v7, v10, v11, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to computing the sum and sse.
+    ;#  Otherwise load up and filter the additional line that is
+    ;#  needed for the vertical filter.
+    beq     compute_sum_sse_8x8_b
+
+    hfilter_8 v8, v10, v11, 0
+
+    b   second_pass_8x8_b
+
+second_pass_8x8_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0, r3, r4, 1
+    load_and_align_16 v1, r3, r4, 1
+    load_and_align_16 v2, r3, r4, 1
+    load_and_align_16 v3, r3, r4, 1
+    load_and_align_16 v4, r3, r4, 1
+    load_and_align_16 v5, r3, r4, 1
+    load_and_align_16 v6, r3, r4, 1
+    load_and_align_16 v7, r3, r4, 1
+    load_and_align_16 v8, r3, r4, 0
+
+    beq     compute_sum_sse_8x8_b
+
+second_pass_8x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0, v1
+    vfilter_16 v1, v2
+    vfilter_16 v2, v3
+    vfilter_16 v3, v4
+    vfilter_16 v4, v5
+    vfilter_16 v5, v6
+    vfilter_16 v6, v7
+    vfilter_16 v7, v8
+
+compute_sum_sse_8x8_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    vmrghb  v0, v0, v1
+    vmrghb  v1, v2, v3
+    vmrghb  v2, v4, v5
+    vmrghb  v3, v6, v7
+
+    load_and_align_16 v4,  r7, r8, 1
+    load_and_align_16 v5,  r7, r8, 1
+    load_and_align_16 v6,  r7, r8, 1
+    load_and_align_16 v7,  r7, r8, 1
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 0
+
+    vmrghb  v4, v4,  v5
+    vmrghb  v5, v6,  v7
+    vmrghb  v6, v8,  v9
+    vmrghb  v7, v10, v11
+
+    compute_sum_sse v0, v4, v18, v19, v20, v21, v23
+    compute_sum_sse v1, v5, v18, v19, v20, v21, v23
+    compute_sum_sse v2, v6, v18, v19, v20, v21, v23
+    compute_sum_sse v3, v7, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 6
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance8x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfffc
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1,-32(r1)          ;# create space on the stack
+
+    HProlog second_pass_8x16_pre_copy_b
+
+    ;# Load up permutation constants
+    load_c v29, b_0123_b, 0, r12, r0
+    load_c v30, b_4567_b, 0, r12, r0
+
+    hfilter_8 v0,  v29, v30, 1
+    hfilter_8 v1,  v29, v30, 1
+    hfilter_8 v2,  v29, v30, 1
+    hfilter_8 v3,  v29, v30, 1
+    hfilter_8 v4,  v29, v30, 1
+    hfilter_8 v5,  v29, v30, 1
+    hfilter_8 v6,  v29, v30, 1
+    hfilter_8 v7,  v29, v30, 1
+    hfilter_8 v8,  v29, v30, 1
+    hfilter_8 v9,  v29, v30, 1
+    hfilter_8 v10, v29, v30, 1
+    hfilter_8 v11, v29, v30, 1
+    hfilter_8 v12, v29, v30, 1
+    hfilter_8 v13, v29, v30, 1
+    hfilter_8 v14, v29, v30, 1
+    hfilter_8 v15, v29, v30, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to computing the sum and sse.
+    ;#  Otherwise load up and filter the additional line that is
+    ;#  needed for the vertical filter.
+    beq     compute_sum_sse_8x16_b
+
+    hfilter_8 v16, v29, v30, 0
+
+    b   second_pass_8x16_b
+
+second_pass_8x16_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16 v0,  r3, r4, 1
+    load_and_align_16 v1,  r3, r4, 1
+    load_and_align_16 v2,  r3, r4, 1
+    load_and_align_16 v3,  r3, r4, 1
+    load_and_align_16 v4,  r3, r4, 1
+    load_and_align_16 v5,  r3, r4, 1
+    load_and_align_16 v6,  r3, r4, 1
+    load_and_align_16 v7,  r3, r4, 1
+    load_and_align_16 v8,  r3, r4, 1
+    load_and_align_16 v9,  r3, r4, 1
+    load_and_align_16 v10, r3, r4, 1
+    load_and_align_16 v11, r3, r4, 1
+    load_and_align_16 v12, r3, r4, 1
+    load_and_align_16 v13, r3, r4, 1
+    load_and_align_16 v14, r3, r4, 1
+    load_and_align_16 v15, r3, r4, 1
+    load_and_align_16 v16, r3, r4, 0
+
+    beq     compute_sum_sse_8x16_b
+
+second_pass_8x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+compute_sum_sse_8x16_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    vmrghb  v0, v0,  v1
+    vmrghb  v1, v2,  v3
+    vmrghb  v2, v4,  v5
+    vmrghb  v3, v6,  v7
+    vmrghb  v4, v8,  v9
+    vmrghb  v5, v10, v11
+    vmrghb  v6, v12, v13
+    vmrghb  v7, v14, v15
+
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 1
+    load_and_align_16 v12, r7, r8, 1
+    load_and_align_16 v13, r7, r8, 1
+    load_and_align_16 v14, r7, r8, 1
+    load_and_align_16 v15, r7, r8, 1
+
+    vmrghb  v8,  v8,  v9
+    vmrghb  v9,  v10, v11
+    vmrghb  v10, v12, v13
+    vmrghb  v11, v14, v15
+
+    compute_sum_sse v0, v8,  v18, v19, v20, v21, v23
+    compute_sum_sse v1, v9,  v18, v19, v20, v21, v23
+    compute_sum_sse v2, v10, v18, v19, v20, v21, v23
+    compute_sum_sse v3, v11, v18, v19, v20, v21, v23
+
+    load_and_align_16 v8,  r7, r8, 1
+    load_and_align_16 v9,  r7, r8, 1
+    load_and_align_16 v10, r7, r8, 1
+    load_and_align_16 v11, r7, r8, 1
+    load_and_align_16 v12, r7, r8, 1
+    load_and_align_16 v13, r7, r8, 1
+    load_and_align_16 v14, r7, r8, 1
+    load_and_align_16 v15, r7, r8, 0
+
+    vmrghb  v8,  v8,  v9
+    vmrghb  v9,  v10, v11
+    vmrghb  v10, v12, v13
+    vmrghb  v11, v14, v15
+
+    compute_sum_sse v4, v8,  v18, v19, v20, v21, v23
+    compute_sum_sse v5, v9,  v18, v19, v20, v21, v23
+    compute_sum_sse v6, v10, v18, v19, v20, v21, v23
+    compute_sum_sse v7, v11, v18, v19, v20, v21, v23
+
+    variance_final v18, v19, v23, 7
+
+    addi    r1, r1, 32          ;# recover stack
+    mtspr   256, r11            ;# reset old VRSAVE
+    blr
+
+;# Filters a horizontal line
+;# expects:
+;#  r3  src_ptr
+;#  r4  pitch
+;#  r10 16
+;#  r12 32
+;#  v17 perm input
+;#  v18 rounding
+;#  v19 shift
+;#  v20 filter taps
+;#  v21 tmp
+;#  v22 tmp
+;#  v23 tmp
+;#  v24 tmp
+;#  v25 tmp
+;#  v26 tmp
+;#  v27 tmp
+;#  v28 perm output
+;#
+.macro hfilter_16 V, increment_counter
+
+    lvsl    v17,  0, r3         ;# permute vector for alignment
+
+    ;# input to filter is 21 bytes wide, output is 16 bytes.
+    ;#  input can span three vectors if not aligned correctly.
+    lvx     v21,   0, r3
+    lvx     v22, r10, r3
+    lvx     v23, r12, r3
+
+.if \increment_counter
+    add     r3, r3, r4
+.endif
+    vperm   v21, v21, v22, v17
+    vperm   v22, v22, v23, v17  ;# v21 v22 = 21 input pixels left-justified
+
+    ;# set 0
+    vmsummbm v24, v20, v21, v18 ;# taps times elements
+
+    ;# set 1
+    vsldoi  v23, v21, v22, 1
+    vmsummbm v25, v20, v23, v18
+
+    ;# set 2
+    vsldoi  v23, v21, v22, 2
+    vmsummbm v26, v20, v23, v18
+
+    ;# set 3
+    vsldoi  v23, v21, v22, 3
+    vmsummbm v27, v20, v23, v18
+
+    vpkswus v24, v24, v25       ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
+    vpkswus v25, v26, v27       ;# v25 = 2 6 A E 3 7 B F
+
+    vsrh    v24, v24, v19       ;# divide by 128
+    vsrh    v25, v25, v19
+
+    vpkuhus \V, v24, v25        ;# \V = scrambled 8-bit result
+    vperm   \V, \V, v0, v28     ;# \V = correctly-ordered result
+.endm
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x8_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    HProlog second_pass_16x8_pre_copy_b
+
+    hfilter_16 v0, 1
+    hfilter_16 v1, 1
+    hfilter_16 v2, 1
+    hfilter_16 v3, 1
+    hfilter_16 v4, 1
+    hfilter_16 v5, 1
+    hfilter_16 v6, 1
+    hfilter_16 v7, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to computing the sum and sse.
+    ;#  Otherwise load up and filter the additional line that is
+    ;#  needed for the vertical filter.
+    beq     compute_sum_sse_16x8_b
+
+    hfilter_16 v8, 0
+
+    b   second_pass_16x8_b
+
+second_pass_16x8_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  r3, r4, 1
+    load_and_align_16  v1,  r3, r4, 1
+    load_and_align_16  v2,  r3, r4, 1
+    load_and_align_16  v3,  r3, r4, 1
+    load_and_align_16  v4,  r3, r4, 1
+    load_and_align_16  v5,  r3, r4, 1
+    load_and_align_16  v6,  r3, r4, 1
+    load_and_align_16  v7,  r3, r4, 1
+    load_and_align_16  v8,  r3, r4, 1
+
+    beq     compute_sum_sse_16x8_b
+
+second_pass_16x8_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+
+compute_sum_sse_16x8_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    compute_sum_sse_16 v0, 1
+    compute_sum_sse_16 v1, 1
+    compute_sum_sse_16 v2, 1
+    compute_sum_sse_16 v3, 1
+    compute_sum_sse_16 v4, 1
+    compute_sum_sse_16 v5, 1
+    compute_sum_sse_16 v6, 1
+    compute_sum_sse_16 v7, 0
+
+    variance_final v18, v19, v23, 7
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .align 2
+;# r3 unsigned char  *src_ptr
+;# r4 int  src_pixels_per_line
+;# r5 int  xoffset
+;# r6 int  yoffset
+;# r7 unsigned char *dst_ptr
+;# r8 int dst_pixels_per_line
+;# r9 unsigned int *sse
+;#
+;# r3 return value
+vp8_sub_pixel_variance16x16_ppc:
+    mfspr   r11, 256            ;# get old VRSAVE
+    oris    r12, r11, 0xffff
+    ori     r12, r12, 0xfff8
+    mtspr   256, r12            ;# set VRSAVE
+
+    stwu    r1, -32(r1)         ;# create space on the stack
+
+    HProlog second_pass_16x16_pre_copy_b
+
+    hfilter_16 v0,  1
+    hfilter_16 v1,  1
+    hfilter_16 v2,  1
+    hfilter_16 v3,  1
+    hfilter_16 v4,  1
+    hfilter_16 v5,  1
+    hfilter_16 v6,  1
+    hfilter_16 v7,  1
+    hfilter_16 v8,  1
+    hfilter_16 v9,  1
+    hfilter_16 v10, 1
+    hfilter_16 v11, 1
+    hfilter_16 v12, 1
+    hfilter_16 v13, 1
+    hfilter_16 v14, 1
+    hfilter_16 v15, 1
+
+    ;# Finished filtering main horizontal block.  If there is no
+    ;#  vertical filtering, jump to computing the sum and sse.
+    ;#  Otherwise load up and filter the additional line that is
+    ;#  needed for the vertical filter.
+    beq     compute_sum_sse_16x16_b
+
+    hfilter_16 v16, 0
+
+    b   second_pass_16x16_b
+
+second_pass_16x16_pre_copy_b:
+    slwi.   r6, r6, 5           ;# index into vertical filter array
+
+    load_and_align_16  v0,  r3, r4, 1
+    load_and_align_16  v1,  r3, r4, 1
+    load_and_align_16  v2,  r3, r4, 1
+    load_and_align_16  v3,  r3, r4, 1
+    load_and_align_16  v4,  r3, r4, 1
+    load_and_align_16  v5,  r3, r4, 1
+    load_and_align_16  v6,  r3, r4, 1
+    load_and_align_16  v7,  r3, r4, 1
+    load_and_align_16  v8,  r3, r4, 1
+    load_and_align_16  v9,  r3, r4, 1
+    load_and_align_16  v10, r3, r4, 1
+    load_and_align_16  v11, r3, r4, 1
+    load_and_align_16  v12, r3, r4, 1
+    load_and_align_16  v13, r3, r4, 1
+    load_and_align_16  v14, r3, r4, 1
+    load_and_align_16  v15, r3, r4, 1
+    load_and_align_16  v16, r3, r4, 0
+
+    beq     compute_sum_sse_16x16_b
+
+second_pass_16x16_b:
+    vspltish v20, 8
+    vspltish v18, 3
+    vslh    v18, v20, v18   ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
+
+    load_vfilter v20, v21
+
+    vfilter_16 v0,  v1
+    vfilter_16 v1,  v2
+    vfilter_16 v2,  v3
+    vfilter_16 v3,  v4
+    vfilter_16 v4,  v5
+    vfilter_16 v5,  v6
+    vfilter_16 v6,  v7
+    vfilter_16 v7,  v8
+    vfilter_16 v8,  v9
+    vfilter_16 v9,  v10
+    vfilter_16 v10, v11
+    vfilter_16 v11, v12
+    vfilter_16 v12, v13
+    vfilter_16 v13, v14
+    vfilter_16 v14, v15
+    vfilter_16 v15, v16
+
+compute_sum_sse_16x16_b:
+    vspltish v18, 0             ;# sum
+    vspltish v19, 0             ;# sse
+    vspltish v23, 0             ;# unpack
+    li      r10, 16
+
+    compute_sum_sse_16 v0,  1
+    compute_sum_sse_16 v1,  1
+    compute_sum_sse_16 v2,  1
+    compute_sum_sse_16 v3,  1
+    compute_sum_sse_16 v4,  1
+    compute_sum_sse_16 v5,  1
+    compute_sum_sse_16 v6,  1
+    compute_sum_sse_16 v7,  1
+    compute_sum_sse_16 v8,  1
+    compute_sum_sse_16 v9,  1
+    compute_sum_sse_16 v10, 1
+    compute_sum_sse_16 v11, 1
+    compute_sum_sse_16 v12, 1
+    compute_sum_sse_16 v13, 1
+    compute_sum_sse_16 v14, 1
+    compute_sum_sse_16 v15, 0
+
+    variance_final v18, v19, v23, 8
+
+    addi    r1, r1, 32          ;# recover stack
+
+    mtspr   256, r11            ;# reset old VRSAVE
+
+    blr
+
+    .data
+
+    .align 4
+hfilter_b:
+    .byte   128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0,128,  0,  0,  0
+    .byte   112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0,112, 16,  0,  0
+    .byte    96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0, 96, 32,  0,  0
+    .byte    80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0, 80, 48,  0,  0
+    .byte    64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0, 64, 64,  0,  0
+    .byte    48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0, 48, 80,  0,  0
+    .byte    32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0, 32, 96,  0,  0
+    .byte    16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0, 16,112,  0,  0
+
+    .align 4
+vfilter_b:
+    .byte   128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
+    .byte     0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
+    .byte    48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
+    .byte    80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
+    .byte    32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
+    .byte    96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
+    .byte    16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
+    .byte   112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
+
+    .align 4
+b_hperm_b:
+    .byte     0,  4,  8, 12,  1,  5,  9, 13,  2,  6, 10, 14,  3,  7, 11, 15
+
+    .align 4
+b_0123_b:
+    .byte     0,  1,  2,  3,  1,  2,  3,  4,  2,  3,  4,  5,  3,  4,  5,  6
+
+    .align 4
+b_4567_b:
+    .byte     4,  5,  6,  7,  5,  6,  7,  8,  6,  7,  8,  9,  7,  8,  9, 10
+
+b_hilo_b:
+    .byte     0,  1,  2,  3,  4,  5,  6,  7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/encoder/preproc.c b/vp8/encoder/preproc.c
new file mode 100644
index 0000000..d2a13dc
--- /dev/null
+++ b/vp8/encoder/preproc.c
@@ -0,0 +1,250 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+/****************************************************************************
+*
+*   Module Title :     preproc.c
+*
+*   Description  :     Simple pre-processor.
+*
+****************************************************************************/
+
+/****************************************************************************
+*  Header Files
+****************************************************************************/
+
+#include "memory.h"
+#include "preproc7.h"
+#include "vpx_mem/vpx_mem.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
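+
+/* ROUNDUP32 aligns an address up to the next 32-byte boundary, e.g.
+ * 0x1001 -> 0x1020; the unsigned long cast assumes 32-bit pointers.
+ */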
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+extern void vp8_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+*  Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+void temp_filter_mmx
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+);
+void temp_filter_wmt
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_c
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter.
+ *                  int strength         : Strength of filter to apply.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur.
+ *
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_c
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;
+    unsigned char *frameptr = ppi->frame_buffer;
+
+    if (ppi->frame == 0)
+    {
+        do
+        {
+            int frame = 0;
+
+            do
+            {
+                *frameptr = s[byte];
+                ++frameptr;
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            d[byte] = s[byte];
+
+            ++byte;
+        }
+        while (byte < bytes);
+    }
+    else
+    {
+        int modifier;
+        int offset = (ppi->frame % FRAMECOUNT);
+
+        do
+        {
+            int accumulator = 0;
+            int count = 0;
+            int frame = 0;
+
+            frameptr[offset] = s[byte];
+
+            do
+            {
+                int pixel_value = *frameptr;
+
+                modifier   = s[byte];
+                modifier  -= pixel_value;
+                modifier  *= modifier;
+                modifier >>= strength;
+                modifier  *= 3;
+
+                if (modifier > 16)
+                    modifier = 16;
+
+                modifier = 16 - modifier;
+
+                accumulator += modifier * pixel_value;
+
+                count += modifier;
+
+                frameptr++;
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            accumulator += (count >> 1);
+            accumulator *= ppi->fixed_divide[count];
+            accumulator >>= 16;
+
+            d[byte] = accumulator;
+
+            ++byte;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;
+}
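+
+/* The weighting above, in scalar form (illustrative only): each of the
+ * FRAMECOUNT history pixels p is compared against the source pixel s,
+ *
+ *     w            = 16 - min(16, (3 * (s - p) * (s - p)) >> strength);
+ *     accumulator += w * p;
+ *     count       += w;
+ *
+ * and the output is the rounded weighted mean, computed with the
+ * fixed_divide reciprocal table as
+ * (accumulator + count / 2) * (0x10000 / count) >> 16.
+ */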
+/****************************************************************************
+ *
+ *  ROUTINE       : delete_pre_proc
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Deletes a pre-processing instance.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+void delete_pre_proc(pre_proc_instance *ppi)
+{
+    if (ppi->frame_buffer_alloc)
+        vpx_free(ppi->frame_buffer_alloc);
+
+    ppi->frame_buffer_alloc = 0;
+    ppi->frame_buffer      = 0;
+
+    if (ppi->fixed_divide_alloc)
+        vpx_free(ppi->fixed_divide_alloc);
+
+    ppi->fixed_divide_alloc = 0;
+    ppi->fixed_divide      = 0;
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : init_pre_proc
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  int frame_size        : Number of bytes in one frame.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : int: 1 if successful, 0 if failed.
+ *
+ *  FUNCTION      : Initializes a pre-processor instance.
+ *
+ *  SPECIAL NOTES : None.
+ *
+ ****************************************************************************/
+int init_pre_proc7(pre_proc_instance *ppi, int frame_size)
+{
+    int i;
+    int mmx_enabled;
+    int xmm_enabled;
+    int wmt_enabled;
+
+    vp8_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+    if (wmt_enabled)
+        temp_filter = temp_filter_wmt;
+    else if (mmx_enabled)
+        temp_filter = temp_filter_mmx;
+    else
+        temp_filter = temp_filter_c;
+
+
+    delete_pre_proc(ppi);
+
+    ppi->frame_buffer_alloc = vpx_malloc(32 + frame_size * FRAMECOUNT * sizeof(unsigned char));
+
+    if (!ppi->frame_buffer_alloc)
+    {
+        delete_pre_proc(ppi);
+        return 0;
+    }
+
+    ppi->frame_buffer = (unsigned char *) ROUNDUP32(ppi->frame_buffer_alloc);
+
+    ppi->fixed_divide_alloc = vpx_malloc(32 + 255 * sizeof(unsigned int));
+
+    if (!ppi->fixed_divide_alloc)
+    {
+        delete_pre_proc(ppi);
+        return 0;
+    }
+
+    ppi->fixed_divide = (unsigned int *) ROUNDUP32(ppi->fixed_divide_alloc);
+
+    for (i = 1; i < 255; i++)
+        ppi->fixed_divide[i] = 0x10000 / i;
+
+    return 1;
+}
diff --git a/vp8/encoder/psnr.c b/vp8/encoder/psnr.c
new file mode 100644
index 0000000..0e34cec
--- /dev/null
+++ b/vp8/encoder/psnr.c
@@ -0,0 +1,116 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+#include "systemdependent.h" /* for vp8_clear_system_state() */
+
+#define MAX_PSNR 60
+
+double vp8_mse2psnr(double Samples, double Peak, double Mse)
+{
+    double psnr;
+
+    if ((double)Mse > 0.0)
+        psnr = 10.0 * log10(Peak * Peak * Samples / Mse);
+    else
+        psnr = MAX_PSNR;      // Limit to prevent / 0
+
+    if (psnr > MAX_PSNR)
+        psnr = MAX_PSNR;
+
+    return psnr;
+}
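+
+/* Note that the Mse argument is the total squared error over Samples
+ * values rather than a mean, so the expression above is equivalent to
+ * 10 * log10(Peak^2 / (Mse / Samples)).  For example, a 640x480 Y plane
+ * (Samples = 307200, Peak = 255) with a total squared error of 307200
+ * works out to 10 * log10(65025), roughly 48.1 dB.
+ */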
+
+double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error)
+{
+    int i, j;
+    int Diff;
+    double frame_psnr;
+    double Total;
+    double grand_total;
+    unsigned char *src = source->y_buffer;
+    unsigned char *dst = dest->y_buffer;
+
+    Total = 0.0;
+    grand_total = 0.0;
+
+    // Loop through the Y plane raw and reconstruction data, summing the squared differences
+    for (i = 0; i < source->y_height; i++)
+    {
+
+        for (j = 0; j < source->y_width; j++)
+        {
+            Diff        = (int)(src[j]) - (int)(dst[j]);
+            Total      += Diff * Diff;
+        }
+
+        src += source->y_stride;
+        dst += dest->y_stride;
+    }
+
+    // Work out Y PSNR
+    *YPsnr = vp8_mse2psnr(source->y_height * source->y_width, 255.0, Total);
+    grand_total += Total;
+    Total = 0;
+
+
+    // Loop through the U plane
+    src = source->u_buffer;
+    dst = dest->u_buffer;
+
+    for (i = 0; i < source->uv_height; i++)
+    {
+
+        for (j = 0; j < source->uv_width; j++)
+        {
+            Diff        = (int)(src[j]) - (int)(dst[j]);
+            Total      += Diff * Diff;
+        }
+
+        src += source->uv_stride;
+        dst += dest->uv_stride;
+    }
+
+    // Work out U PSNR
+    *UPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
+    grand_total += Total;
+    Total = 0;
+
+
+    // V PSNR
+    src = source->v_buffer;
+    dst = dest->v_buffer;
+
+    for (i = 0; i < source->uv_height; i++)
+    {
+
+        for (j = 0; j < source->uv_width; j++)
+        {
+            Diff        = (int)(src[j]) - (int)(dst[j]);
+            Total      += Diff * Diff;
+        }
+
+        src += source->uv_stride;
+        dst += dest->uv_stride;
+    }
+
+    // Work out V PSNR
+    *VPsnr = vp8_mse2psnr(source->uv_height * source->uv_width, 255.0, Total);
+    grand_total += Total;
+    Total = 0;
+
+    // Work out total PSNR
+    frame_psnr = vp8_mse2psnr(source->y_height * source->y_width * 3 / 2 , 255.0, grand_total);
+
+    *sq_error = 1.0 * grand_total;
+
+    return frame_psnr;
+}
diff --git a/vp8/encoder/psnr.h b/vp8/encoder/psnr.h
new file mode 100644
index 0000000..9f6ca0b
--- /dev/null
+++ b/vp8/encoder/psnr.h
@@ -0,0 +1,17 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_PSNR_H
+#define __INC_PSNR_H
+
+extern double vp8_mse2psnr(double Samples, double Peak, double Mse);
+extern double vp8_calc_psnr(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, double *YPsnr, double *UPsnr, double *VPsnr, double *sq_error);
+
+#endif
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
new file mode 100644
index 0000000..6028ebf
--- /dev/null
+++ b/vp8/encoder/quantize.c
@@ -0,0 +1,249 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include "vpx_mem/vpx_mem.h"
+
+#include "quantize.h"
+#include "entropy.h"
+#include "predictdc.h"
+
+void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
+{
+    int i, rc, eob;
+    int zbin;
+    int x, y, z, sz;
+    short *coeff_ptr  = &b->coeff[0];
+    short *zbin_ptr   = &b->zbin[0][0];
+    short *round_ptr  = &b->round[0][0];
+    short *quant_ptr  = &b->quant[0][0];
+    short *qcoeff_ptr = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = &d->dequant[0][0];
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+    vpx_memset(dqcoeff_ptr, 0, 32);
+
+    eob = -1;
+
+    for (i = 0; i < 16; i++)
+    {
+        rc   = vp8_default_zig_zag1d[i];
+        z    = coeff_ptr[rc];
+        zbin = zbin_ptr[rc] ;
+
+        sz = (z >> 31);                                 // sign of z
+        x  = (z ^ sz) - sz;                             // x = abs(z)
+
+        if (x >= zbin)
+        {
+            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+            x  = (y ^ sz) - sz;                         // get the sign back
+            qcoeff_ptr[rc] = x;                          // write to destination
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+            if (y)
+            {
+                eob = i;                                // last nonzero coeffs
+            }
+        }
+    }
+
+    d->eob = eob + 1;
+
+}
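+
+/* The loop above implements, per coefficient z (illustrative scalar
+ * form):
+ *
+ *     x = abs(z);
+ *     if (x >= zbin)
+ *     {
+ *         qcoeff  = sign(z) * (((x + round) * quant) >> 16);
+ *         dqcoeff = qcoeff * dequant;
+ *     }
+ *
+ * with quant acting as a Q16 fixed-point reciprocal of the step size.
+ */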
+
+void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d)
+{
+    int i, rc, eob;
+    int zbin;
+    int x, y, z, sz;
+    short *zbin_boost_ptr = &b->zrun_zbin_boost[0];
+    short *coeff_ptr  = &b->coeff[0];
+    short *zbin_ptr   = &b->zbin[0][0];
+    short *round_ptr  = &b->round[0][0];
+    short *quant_ptr  = &b->quant[0][0];
+    short *qcoeff_ptr = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = &d->dequant[0][0];
+    short zbin_oq_value = b->zbin_extra;
+
+    vpx_memset(qcoeff_ptr, 0, 32);
+    vpx_memset(dqcoeff_ptr, 0, 32);
+
+    eob = -1;
+
+    for (i = 0; i < 16; i++)
+    {
+        rc   = vp8_default_zig_zag1d[i];
+        z    = coeff_ptr[rc];
+
+        //if ( i == 0 )
+        //    zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value/2;
+        //else
+        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
+
+        zbin_boost_ptr ++;
+        sz = (z >> 31);                                 // sign of z
+        x  = (z ^ sz) - sz;                             // x = abs(z)
+
+        if (x >= zbin)
+        {
+            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; // quantize (x)
+            x  = (y ^ sz) - sz;                         // get the sign back
+            qcoeff_ptr[rc]  = x;                         // write to destination
+            dqcoeff_ptr[rc] = x * dequant_ptr[rc];        // dequantized value
+
+            if (y)
+            {
+                eob = i;                                // last nonzero coeffs
+                zbin_boost_ptr = &b->zrun_zbin_boost[0];    // reset zero runlength
+            }
+        }
+    }
+
+    d->eob = eob + 1;
+}
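+
+/* The regular quantizer differs from the fast path only in the dead
+ * zone: zbin is widened by zbin_extra and by a boost that grows with
+ * the run of zeros coded so far, and the boost resets to the start of
+ * zrun_zbin_boost whenever a nonzero coefficient is emitted.
+ */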
+void vp8_quantize_mby(MACROBLOCK *x)
+{
+    int i;
+
+    if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+        }
+
+        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
+        x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob);
+
+    }
+    else
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+}
+
+void vp8_quantize_mb(MACROBLOCK *x)
+{
+    int i;
+
+    x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+    if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+        }
+
+        for (i = 16; i < 25; i++)
+        {
+            x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+    else
+    {
+        for (i = 0; i < 24; i++)
+        {
+            x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+
+}
+
+
+void vp8_quantize_mbuv(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 16; i < 24; i++)
+    {
+        x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
+        x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+    }
+}
+
+// This function is not currently called
+void vp8_quantize_mbrd(MACROBLOCK *x)
+{
+    int i;
+
+    x->e_mbd.mbmi.mb_skip_coeff = 1;
+
+    if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+        }
+
+        for (i = 16; i < 25; i++)
+        {
+            x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+    else
+    {
+        for (i = 0; i < 24; i++)
+        {
+            x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+}
+
+void vp8_quantize_mbuvrd(MACROBLOCK *x)
+{
+    int i;
+
+    for (i = 16; i < 24; i++)
+    {
+        x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+        x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+    }
+}
+
+void vp8_quantize_mbyrd(MACROBLOCK *x)
+{
+    int i;
+
+    if (x->e_mbd.mbmi.mode != B_PRED && x->e_mbd.mbmi.mode != SPLITMV)
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (x->e_mbd.block[i].eob < 2);
+        }
+
+        x->quantize_brd(&x->block[24], &x->e_mbd.block[24]);
+        x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[24].eob);
+
+    }
+    else
+    {
+        for (i = 0; i < 16; i++)
+        {
+            x->quantize_brd(&x->block[i], &x->e_mbd.block[i]);
+            x->e_mbd.mbmi.mb_skip_coeff &= (!x->e_mbd.block[i].eob);
+        }
+    }
+}
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
new file mode 100644
index 0000000..868e8e3
--- /dev/null
+++ b/vp8/encoder/quantize.h
@@ -0,0 +1,52 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_QUANTIZE_H
+#define __INC_QUANTIZE_H
+
+#include "block.h"
+
+#define prototype_quantize_block(sym) \
+    void (sym)(BLOCK *b,BLOCKD *d)
+
+#if ARCH_ARM
+#include "arm/quantize_arm.h"
+#endif
+
+#ifndef vp8_quantize_quantb
+#define vp8_quantize_quantb vp8_regular_quantize_b
+#endif
+extern prototype_quantize_block(vp8_quantize_quantb);
+
+#ifndef vp8_quantize_fastquantb
+#define vp8_quantize_fastquantb vp8_fast_quantize_b_c
+#endif
+extern prototype_quantize_block(vp8_quantize_fastquantb);
+
+typedef struct
+{
+    prototype_quantize_block(*quantb);
+    prototype_quantize_block(*fastquantb);
+} vp8_quantize_rtcd_vtable_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define QUANTIZE_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define QUANTIZE_INVOKE(ctx,fn) vp8_quantize_##fn
+#endif
+
+extern void vp8_quantize_mb(MACROBLOCK *x);
+extern void vp8_quantize_mbuv(MACROBLOCK *x);
+extern void vp8_quantize_mby(MACROBLOCK *x);
+extern void vp8_quantize_mbyrd(MACROBLOCK *x);
+extern void vp8_quantize_mbuvrd(MACROBLOCK *x);
+extern void vp8_quantize_mbrd(MACROBLOCK *x);
+
+#endif
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
new file mode 100644
index 0000000..05040d3
--- /dev/null
+++ b/vp8/encoder/ratectrl.c
@@ -0,0 +1,1552 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+#include <assert.h>
+
+#include "math.h"
+#include "common.h"
+#include "ratectrl.h"
+#include "entropymode.h"
+#include "vpx_mem/vpx_mem.h"
+#include "systemdependent.h"
+#include "encodemv.h"
+
+
+#define MIN_BPB_FACTOR          0.01
+#define MAX_BPB_FACTOR          50
+
+extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
+extern const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES];
+
+
+
+#ifdef MODE_STATS
+extern int y_modes[5];
+extern int uv_modes[4];
+extern int b_modes[10];
+
+extern int inter_y_modes[10];
+extern int inter_uv_modes[4];
+extern int inter_b_modes[10];
+#endif
+
+// Bits Per MB at different Q (Multiplied by 512)
+#define BPER_MB_NORMBITS    9
+
+const int vp8_bits_per_mb[2][QINDEX_RANGE] =
+{
+    // (Updated 19 March 08) Baseline estimate of INTRA-frame Bits Per MB at each Q:
+    {
+        674781, 606845, 553905, 524293, 500428, 452540, 435379, 414719,
+        390970, 371082, 359416, 341807, 336957, 317263, 303724, 298402,
+        285688, 275237, 268455, 262560, 256038, 248734, 241087, 237615,
+        229247, 225211, 219112, 213920, 211559, 202714, 198482, 193401,
+        187866, 183453, 179212, 175965, 171852, 167235, 163972, 160560,
+        156032, 154349, 151390, 148725, 145708, 142311, 139981, 137700,
+        134084, 131863, 129746, 128498, 126077, 123461, 121290, 117782,
+        114883, 112332, 108410, 105685, 103434, 101192,  98587,  95959,
+        94059,  92017,  89970,  87936,  86142,  84801,  82736,  81106,
+        79668,  78135,  76641,  75103,  73943,  72693,  71401,  70098,
+        69165,  67901,  67170,  65987,  64923,  63534,  62378,  61302,
+        59921,  58941,  57844,  56782,  55960,  54973,  54257,  53454,
+        52230,  50938,  49962,  49190,  48288,  47270,  46738,  46037,
+        45020,  44027,  43216,  42287,  41594,  40702,  40081,  39414,
+        38282,  37627,  36987,  36375,  35808,  35236,  34710,  34162,
+        33659,  33327,  32751,  32384,  31936,  31461,  30982,  30582,
+    },
+
+    // (Updated 19 March 08) Baseline estimate of INTER-frame Bits Per MB at each Q:
+    {
+        497401, 426316, 372064, 352732, 335763, 283921, 273848, 253321,
+        233181, 217727, 210030, 196685, 194836, 178396, 167753, 164116,
+        154119, 146929, 142254, 138488, 133591, 127741, 123166, 120226,
+        114188, 111756, 107882, 104749, 102522,  96451,  94424,  90905,
+        87286,  84931,  82111,  80534,  77610,  74700,  73037,  70715,
+        68006,  67235,  65374,  64009,  62134,  60180,  59105,  57691,
+        55509,  54512,  53318,  52693,  51194,  49840,  48944,  46980,
+        45668,  44177,  42348,  40994,  39859,  38889,  37717,  36391,
+        35482,  34622,  33795,  32756,  32002,  31492,  30573,  29737,
+        29152,  28514,  27941,  27356,  26859,  26329,  25874,  25364,
+        24957,  24510,  24290,  23689,  23380,  22845,  22481,  22066,
+        21587,  21219,  20880,  20452,  20260,  19926,  19661,  19334,
+        18915,  18391,  18046,  17833,  17441,  17105,  16888,  16729,
+        16383,  16023,  15706,  15442,  15222,  14938,  14673,  14452,
+        14005,  13807,  13611,  13447,  13223,  13102,  12963,  12801,
+        12627,  12534,  12356,  12228,  12056,  11907,  11746,  11643,
+    }
+};
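+
+// The entries are scaled by 512 (1 << BPER_MB_NORMBITS); for example,
+// the lowest-Q intra entry predicts 674781 >> 9, roughly 1318 bits per
+// macroblock.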
+
+const int vp8_kf_boost_qadjustment[QINDEX_RANGE] =
+{
+    128, 129, 130, 131, 132, 133, 134, 135,
+    136, 137, 138, 139, 140, 141, 142, 143,
+    144, 145, 146, 147, 148, 149, 150, 151,
+    152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167,
+    168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183,
+    184, 185, 186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199,
+    200, 200, 201, 201, 202, 203, 203, 203,
+    204, 204, 205, 205, 206, 206, 207, 207,
+    208, 208, 209, 209, 210, 210, 211, 211,
+    212, 212, 213, 213, 214, 214, 215, 215,
+    216, 216, 217, 217, 218, 218, 219, 219,
+    220, 220, 220, 220, 220, 220, 220, 220,
+    220, 220, 220, 220, 220, 220, 220, 220,
+};
+
+//#define GFQ_ADJUSTMENT (Q+100)
+#define GFQ_ADJUSTMENT vp8_gf_boost_qadjustment[Q]
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+    80, 82, 84, 86, 88, 90, 92, 94,
+    96, 97, 98, 99, 100, 101, 102, 103,
+    104, 105, 106, 107, 108, 109, 110, 111,
+    112, 113, 114, 115, 116, 117, 118, 119,
+    120, 121, 122, 123, 124, 125, 126, 127,
+    128, 129, 130, 131, 132, 133, 134, 135,
+    136, 137, 138, 139, 140, 141, 142, 143,
+    144, 145, 146, 147, 148, 149, 150, 151,
+    152, 153, 154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167,
+    168, 169, 170, 171, 172, 173, 174, 175,
+    176, 177, 178, 179, 180, 181, 182, 183,
+    184, 184, 185, 185, 186, 186, 187, 187,
+    188, 188, 189, 189, 190, 190, 191, 191,
+    192, 192, 193, 193, 194, 194, 194, 194,
+    195, 195, 196, 196, 197, 197, 198, 198
+};
+
+/*
+const int vp8_gf_boost_qadjustment[QINDEX_RANGE] =
+{
+    100,101,102,103,104,105,105,106,
+    106,107,107,108,109,109,110,111,
+    112,113,114,115,116,117,118,119,
+    120,121,122,123,124,125,126,127,
+    128,129,130,131,132,133,134,135,
+    136,137,138,139,140,141,142,143,
+    144,145,146,147,148,149,150,151,
+    152,153,154,155,156,157,158,159,
+    160,161,162,163,164,165,166,167,
+    168,169,170,170,171,171,172,172,
+    173,173,173,174,174,174,175,175,
+    175,176,176,176,177,177,177,177,
+    178,178,179,179,180,180,181,181,
+    182,182,183,183,184,184,185,185,
+    186,186,187,187,188,188,189,189,
+    190,190,191,191,192,192,193,193,
+};
+*/
+
+const int vp8_kf_gf_boost_qlimits[QINDEX_RANGE] =
+{
+    150, 155, 160, 165, 170, 175, 180, 185,
+    190, 195, 200, 205, 210, 215, 220, 225,
+    230, 235, 240, 245, 250, 255, 260, 265,
+    270, 275, 280, 285, 290, 295, 300, 305,
+    310, 320, 330, 340, 350, 360, 370, 380,
+    390, 400, 410, 420, 430, 440, 450, 460,
+    470, 480, 490, 500, 510, 520, 530, 540,
+    550, 560, 570, 580, 590, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+    600, 600, 600, 600, 600, 600, 600, 600,
+};
+
+// % adjustment to target kf size based on separation from the previous key frame
+const int vp8_kf_boost_seperationt_adjustment[16] =
+{
+    30,   40,   50,   55,   60,   65,   70,   75,
+    80,   85,   90,   95,  100,  100,  100,  100,
+};
+
+
+const int vp8_gf_adjust_table[101] =
+{
+    100,
+    115, 130, 145, 160, 175, 190, 200, 210, 220, 230,
+    240, 260, 270, 280, 290, 300, 310, 320, 330, 340,
+    350, 360, 370, 380, 390, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+    400, 400, 400, 400, 400, 400, 400, 400, 400, 400,
+};
+
+const int vp8_gf_intra_useage_adjustment[20] =
+{
+    125, 120, 115, 110, 105, 100,  95,  85,  80,  75,
+    70,  65,  60,  55,  50,  50,  50,  50,  50,  50,
+};
+
+const int vp8_gf_interval_table[101] =
+{
+    7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
+    11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+};
+
+static const unsigned int prior_key_frame_weight[KEY_FRAME_CONTEXT] = { 1, 2, 3, 4, 5 };
+
+
+void vp8_save_coding_context(VP8_COMP *cpi)
+{
+    CODING_CONTEXT *const cc = & cpi->coding_context;
+
+    // Stores a snapshot of key state variables which can subsequently be
+    // restored with a call to vp8_restore_coding_context. These functions are
+    // intended for use in a re-code loop in vp8_compress_frame where the
+    // quantizer value is adjusted between loop iterations.
+
+    cc->frames_since_key          = cpi->frames_since_key;
+    cc->filter_level             = cpi->common.filter_level;
+    cc->frames_till_gf_update_due   = cpi->frames_till_gf_update_due;
+    cc->frames_since_golden       = cpi->common.frames_since_golden;
+
+    vp8_copy(cc->mvc,      cpi->common.fc.mvc);
+    vp8_copy(cc->mvcosts,  cpi->mb.mvcosts);
+
+    vp8_copy(cc->kf_ymode_prob,   cpi->common.kf_ymode_prob);
+    vp8_copy(cc->ymode_prob,   cpi->common.fc.ymode_prob);
+    vp8_copy(cc->kf_uv_mode_prob,  cpi->common.kf_uv_mode_prob);
+    vp8_copy(cc->uv_mode_prob,  cpi->common.fc.uv_mode_prob);
+
+    vp8_copy(cc->ymode_count, cpi->ymode_count);
+    vp8_copy(cc->uv_mode_count, cpi->uv_mode_count);
+
+
+    // Stats
+#ifdef MODE_STATS
+    vp8_copy(cc->y_modes,       y_modes);
+    vp8_copy(cc->uv_modes,      uv_modes);
+    vp8_copy(cc->b_modes,       b_modes);
+    vp8_copy(cc->inter_y_modes,  inter_y_modes);
+    vp8_copy(cc->inter_uv_modes, inter_uv_modes);
+    vp8_copy(cc->inter_b_modes,  inter_b_modes);
+#endif
+
+    cc->this_frame_percent_intra = cpi->this_frame_percent_intra;
+}
+
+
+void vp8_restore_coding_context(VP8_COMP *cpi)
+{
+    CODING_CONTEXT *const cc = & cpi->coding_context;
+
+    // Restore key state variables to the snapshot state stored in the
+    // previous call to vp8_save_coding_context.
+
+    cpi->frames_since_key         =   cc->frames_since_key;
+    cpi->common.filter_level     =   cc->filter_level;
+    cpi->frames_till_gf_update_due  =   cc->frames_till_gf_update_due;
+    cpi->common.frames_since_golden       =   cc->frames_since_golden;
+
+    vp8_copy(cpi->common.fc.mvc, cc->mvc);
+
+    vp8_copy(cpi->mb.mvcosts, cc->mvcosts);
+
+    vp8_copy(cpi->common.kf_ymode_prob,   cc->kf_ymode_prob);
+    vp8_copy(cpi->common.fc.ymode_prob,   cc->ymode_prob);
+    vp8_copy(cpi->common.kf_uv_mode_prob,  cc->kf_uv_mode_prob);
+    vp8_copy(cpi->common.fc.uv_mode_prob,  cc->uv_mode_prob);
+
+    vp8_copy(cpi->ymode_count, cc->ymode_count);
+    vp8_copy(cpi->uv_mode_count, cc->uv_mode_count);
+
+    // Stats
+#ifdef MODE_STATS
+    vp8_copy(y_modes, cc->y_modes);
+    vp8_copy(uv_modes, cc->uv_modes);
+    vp8_copy(b_modes, cc->b_modes);
+    vp8_copy(inter_y_modes, cc->inter_y_modes);
+    vp8_copy(inter_uv_modes, cc->inter_uv_modes);
+    vp8_copy(inter_b_modes, cc->inter_b_modes);
+#endif
+
+
+    cpi->this_frame_percent_intra = cc->this_frame_percent_intra;
+}
+
+
+void vp8_setup_key_frame(VP8_COMP *cpi)
+{
+    // Setup for Key frame:
+
+    vp8_default_coef_probs(& cpi->common);
+    vp8_kf_default_bmode_probs(cpi->common.kf_bmode_prob);
+
+    vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
+    {
+        int flag[2] = {1, 1};
+        vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
+    }
+
+    vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc));  //initialize pre_mvc to all zero.
+
+    //cpi->common.filter_level = 0;      // Reset every key frame.
+    cpi->common.filter_level = cpi->common.base_qindex * 3 / 8 ;
+
+    // Provisional interval before next GF
+    if (cpi->auto_gold)
+        //cpi->frames_till_gf_update_due = DEFAULT_GF_INTERVAL;
+        cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+    else
+        cpi->frames_till_gf_update_due = cpi->goldfreq;
+
+    cpi->common.refresh_golden_frame = TRUE;
+}
+
+void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi)
+{
+    // boost defaults to half second
+    int kf_boost;
+
+    // Clear down mmx registers to allow floating point in what follows
+    vp8_clear_system_state();  //__asm emms;
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        vp8_calc_iframe_target_size(cpi);
+        return;
+    }
+
+    if (cpi->pass == 2)
+    {
+        cpi->this_frame_target = cpi->per_frame_bandwidth;      // New Two pass RC
+    }
+    else
+    {
+        // Boost depends somewhat on frame rate
+        kf_boost = (int)(2 * cpi->output_frame_rate - 16);
+
+        // adjustment up based on q
+        kf_boost = kf_boost * vp8_kf_boost_qadjustment[cpi->ni_av_qi] / 100;
+
+        // frame separation adjustment (down)
+        if (cpi->frames_since_key  < cpi->output_frame_rate / 2)
+            kf_boost = (int)(kf_boost * cpi->frames_since_key / (cpi->output_frame_rate / 2));
+
+        if (kf_boost < 16)
+            kf_boost = 16;
+
+        // Reset the active worst quality to the baseline value for key frames.
+        cpi->active_worst_quality = cpi->worst_quality;
+
+        cpi->this_frame_target = ((16 + kf_boost)  * cpi->per_frame_bandwidth) >> 4;
+    }
+
+
+    // Should the next frame be an altref frame
+    if (cpi->pass != 2)
+    {
+        // For now Alt ref is not allowed except in 2 pass modes.
+        cpi->source_alt_ref_pending = FALSE;
+
+        /*if ( cpi->oxcf.fixed_q == -1)
+        {
+            if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+                cpi->source_alt_ref_pending = TRUE;
+            else
+                cpi->source_alt_ref_pending = FALSE;
+        }*/
+    }
+
+    if (0)
+    {
+        FILE *f;
+
+        f = fopen("kf_boost.stt", "a");
+        //fprintf(f, " %8d %10d %10d %10d %10d %10d %10d\n",
+        //  cpi->common.current_video_frame,  cpi->target_bandwidth, cpi->frames_to_key, kf_boost_qadjustment[cpi->ni_av_qi], cpi->kf_boost, (cpi->this_frame_target *100 / cpi->per_frame_bandwidth), cpi->this_frame_target );
+
+        fprintf(f, " %8u %10d %10d %10d\n",
+                cpi->common.current_video_frame,  cpi->gfu_boost, cpi->baseline_gf_interval, cpi->source_alt_ref_pending);
+
+        fclose(f);
+    }
+}
+
+//  Do the best we can to define the parameters for the next GF based on what information we have available.
+static void calc_gf_params(VP8_COMP *cpi)
+{
+    int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+    int Boost = 0;
+
+    int gf_frame_useage = 0;      // Golden frame usage since last GF
+    int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME]  +
+                  cpi->recent_ref_frame_usage[LAST_FRAME]   +
+                  cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                  cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+    int pct_gf_active = (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+    // Reset the last boost indicator
+    //cpi->last_boost = 100;
+
+    if (tot_mbs)
+        gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
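+    // e.g. (illustrative): if 200 of 1000 recently counted MBs referenced the
+    // golden or alt-ref frame, gf_frame_useage is 20 (percent).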
+
+    if (pct_gf_active > gf_frame_useage)
+        gf_frame_useage = pct_gf_active;
+
+    // Not two pass
+    if (cpi->pass != 2)
+    {
+        // Single Pass lagged mode: TBD
+        if (FALSE)
+        {
+        }
+
+        // Single Pass compression: Has to use current and historical data
+        else
+        {
+#if 0
+            // Experimental code
+            int index = cpi->one_pass_frame_index;
+            int frames_to_scan = (cpi->max_gf_interval <= MAX_LAG_BUFFERS) ? cpi->max_gf_interval : MAX_LAG_BUFFERS;
+
+            /*
+            // *************** Experimental code - incomplete
+            double decay_val = 1.0;
+            double IIAccumulator = 0.0;
+            double last_iiaccumulator = 0.0;
+            double IIRatio;
+
+            cpi->one_pass_frame_index = cpi->common.current_video_frame%MAX_LAG_BUFFERS;
+
+            for ( i = 0; i < (frames_to_scan - 1); i++ )
+            {
+                if ( index < 0 )
+                    index = MAX_LAG_BUFFERS;
+                index --;
+
+                if ( cpi->one_pass_frame_stats[index].frame_coded_error > 0.0 )
+                {
+                    IIRatio = cpi->one_pass_frame_stats[index].frame_intra_error / cpi->one_pass_frame_stats[index].frame_coded_error;
+
+                    if ( IIRatio > 30.0 )
+                        IIRatio = 30.0;
+                }
+                else
+                    IIRatio = 30.0;
+
+                IIAccumulator += IIRatio * decay_val;
+
+                decay_val = decay_val * cpi->one_pass_frame_stats[index].frame_pcnt_inter;
+
+                if (    (i > MIN_GF_INTERVAL) &&
+                        ((IIAccumulator - last_iiaccumulator) < 2.0) )
+                {
+                    break;
+                }
+                last_iiaccumulator = IIAccumulator;
+            }
+
+            Boost = IIAccumulator*100.0/16.0;
+            cpi->baseline_gf_interval = i;
+
+            */
+#else
+
+            /*************************************************************/
+            // OLD code
+
+            // Adjust boost based upon ambient Q
+            Boost = GFQ_ADJUSTMENT;
+
+            // Adjust based upon the most recently measured intra usage
+            Boost = Boost * vp8_gf_intra_useage_adjustment[(cpi->this_frame_percent_intra < 15) ? cpi->this_frame_percent_intra : 14] / 100;
+
+            // Adjust gf boost based upon GF usage since last GF
+            Boost = Boost * vp8_gf_adjust_table[gf_frame_useage] / 100;
+#endif
+        }
+
+        // Golden frame boost without recode loop often goes awry. Be safe by keeping numbers down.
+        if (!cpi->sf.recode_loop)
+        {
+            if (cpi->compressor_speed == 2)
+                Boost = Boost / 2;
+        }
+
+        // Apply an upper limit based on Q for 1 pass encodes
+        if (Boost > vp8_kf_gf_boost_qlimits[Q] && (cpi->pass == 0))
+            Boost = vp8_kf_gf_boost_qlimits[Q];
+
+        // Apply lower limits to boost.
+        else if (Boost < 110)
+            Boost = 110;
+
+        // Note the boost used
+        cpi->last_boost = Boost;
+
+    }
+
+    // Estimate next interval
+    // This is updated once the real frame size/boost is known.
+    if (cpi->oxcf.fixed_q == -1)
+    {
+        if (cpi->pass == 2)         // 2 Pass
+        {
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+        }
+        else                            // 1 Pass
+        {
+            cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+            if (cpi->last_boost > 750)
+                cpi->frames_till_gf_update_due++;
+
+            if (cpi->last_boost > 1000)
+                cpi->frames_till_gf_update_due++;
+
+            if (cpi->last_boost > 1250)
+                cpi->frames_till_gf_update_due++;
+
+            if (cpi->last_boost >= 1500)
+                cpi->frames_till_gf_update_due ++;
+
+            if (vp8_gf_interval_table[gf_frame_useage] > cpi->frames_till_gf_update_due)
+                cpi->frames_till_gf_update_due = vp8_gf_interval_table[gf_frame_useage];
+
+            if (cpi->frames_till_gf_update_due > cpi->max_gf_interval)
+                cpi->frames_till_gf_update_due = cpi->max_gf_interval;
+        }
+    }
+    else
+        cpi->frames_till_gf_update_due = cpi->baseline_gf_interval;
+
+    // ARF on or off
+    if (cpi->pass != 2)
+    {
+        // For now Alt ref is not allowed except in 2 pass modes.
+        cpi->source_alt_ref_pending = FALSE;
+
+        /*if ( cpi->oxcf.fixed_q == -1)
+        {
+            if ( cpi->oxcf.play_alternate && (cpi->last_boost > (100 + (AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+                cpi->source_alt_ref_pending = TRUE;
+            else
+                cpi->source_alt_ref_pending = FALSE;
+        }*/
+    }
+}
+/* This is equivalent to estimate_bits_at_q without the rate_correction_factor. */
+static int baseline_bits_at_q(int frame_kind, int Q, int MBs)
+{
+    int Bpm = vp8_bits_per_mb[frame_kind][Q];
+
+    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+     * largest Bpm takes 20 bits.
+     */
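+    /* e.g. with Bpm at its 20 bit maximum, an MBs count above 1 << 11 could
+     * push the product past 31 bits, so in that case Bpm is shifted down
+     * first at a small cost in precision.
+     */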
+    if (MBs > (1 << 11))
+        return (Bpm >> BPER_MB_NORMBITS) * MBs;
+    else
+        return (Bpm * MBs) >> BPER_MB_NORMBITS;
+}
+
+void vp8_calc_iframe_target_size(VP8_COMP *cpi)
+{
+    int Q;
+    int Boost = 100;
+
+    Q = (cpi->oxcf.fixed_q >= 0) ? cpi->oxcf.fixed_q : cpi->avg_frame_qindex;
+
+    if (cpi->auto_adjust_key_quantizer == 1)
+    {
+        // If (auto_adjust_key_quantizer==1) then a lower Q is selected for key frames.
+        // The enhanced Q is calculated so as to boost the key frame size by a factor
+        // specified in kf_boost_qadjustment. It can also be adjusted based on the
+        // distance between key frames.
+
+        // Adjust boost based upon ambient Q
+        Boost = vp8_kf_boost_qadjustment[Q];
+
+        // Make the key frame boost less if the separation from the previous key frame is small
+        if (cpi->frames_since_key < 16)
+            Boost = Boost * vp8_kf_boost_seperationt_adjustment[cpi->frames_since_key] / 100;
+        else
+            Boost = Boost * vp8_kf_boost_seperationt_adjustment[15] / 100;
+
+        // Apply limits on boost
+        if (Boost > vp8_kf_gf_boost_qlimits[Q])
+            Boost = vp8_kf_gf_boost_qlimits[Q];
+        else if (Boost < 120)
+            Boost = 120;
+    }
+
+    // Keep a record of the boost that was used
+    cpi->last_boost = Boost;
+
+    // Should the next frame be an altref frame
+    if (cpi->pass != 2)
+    {
+        // For now Alt ref is not allowed except in 2 pass modes.
+        cpi->source_alt_ref_pending = FALSE;
+
+        /*if ( cpi->oxcf.fixed_q == -1)
+        {
+            if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) )
+                cpi->source_alt_ref_pending = TRUE;
+            else
+                cpi->source_alt_ref_pending = FALSE;
+        }*/
+    }
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        cpi->this_frame_target = (baseline_bits_at_q(0, Q, cpi->common.MBs) * Boost) / 100;
+    }
+    else
+    {
+
+        int bits_per_mb_at_this_q;
+
+        if (cpi->oxcf.error_resilient_mode == 1)
+        {
+            cpi->this_frame_target = 2 * cpi->av_per_frame_bandwidth;
+            return;
+        }
+
+        // Rate targeted scenario:
+        // Be careful of 32-bit OVERFLOW if restructuring the calculation of cpi->this_frame_target
+        bits_per_mb_at_this_q = (int)(.5 +
+                                      cpi->key_frame_rate_correction_factor * vp8_bits_per_mb[0][Q]);
+
+        cpi->this_frame_target = (((bits_per_mb_at_this_q * cpi->common.MBs) >> BPER_MB_NORMBITS) * Boost) / 100;
+
+        // Reset the active worst quality to the baseline value for key frames.
+        if (cpi->pass < 2)
+            cpi->active_worst_quality = cpi->worst_quality;
+    }
+}
+
+
+
+void vp8_calc_pframe_target_size(VP8_COMP *cpi)
+{
+    int min_frame_target;
+    int Adjustment;
+
+    // Set the min frame bandwidth.
+    //min_frame_target = estimate_min_frame_size( cpi );
+    min_frame_target = 0;
+
+    if (cpi->pass == 2)
+    {
+        min_frame_target = cpi->min_frame_bandwidth;
+
+        if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
+            min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+    }
+    else if (min_frame_target < cpi->per_frame_bandwidth / 4)
+        min_frame_target = cpi->per_frame_bandwidth / 4;
+
+
+    // Special alt reference frame case
+    if (cpi->common.refresh_alt_ref_frame)
+    {
+        if (cpi->pass == 2)
+        {
+            cpi->per_frame_bandwidth = cpi->gf_bits;                       // Per frame bit target for the alt ref frame
+            cpi->this_frame_target = cpi->per_frame_bandwidth;
+        }
+
+        /* One Pass ??? TBD */
+        /*else
+        {
+            int frames_in_section;
+            int allocation_chunks;
+            int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+            int alt_boost;
+            int max_arf_rate;
+
+            alt_boost = (cpi->gfu_boost * 3 * GFQ_ADJUSTMENT) / (2 * 100);
+            alt_boost += (cpi->frames_till_gf_update_due * 50);
+
+            // If alt ref is not currently active then we have a potential double hit with GF and ARF so reduce the boost a bit.
+            // A similar thing is done on GFs that precede an arf update.
+            if ( !cpi->source_alt_ref_active )
+                alt_boost = alt_boost * 3 / 4;
+
+            frames_in_section = cpi->frames_till_gf_update_due+1;                                   // Standard frames + GF
+            allocation_chunks = (frames_in_section * 100) + alt_boost;
+
+            // Normalize alt_boost and allocation chunks down to prevent overflow
+            while ( alt_boost > 1000 )
+            {
+                alt_boost /= 2;
+                allocation_chunks /= 2;
+            }
+
+            else
+            {
+                int bits_in_section;
+
+                if ( cpi->kf_overspend_bits > 0 )
+                {
+                    Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+                    if ( Adjustment > (cpi->per_frame_bandwidth - min_frame_target) )
+                        Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+                    cpi->kf_overspend_bits -= Adjustment;
+
+                    // Calculate an inter frame bandwidth target for the next few frames designed to recover
+                    // any extra bits spent on the key frame.
+                    cpi->inter_frame_target = cpi->per_frame_bandwidth - Adjustment;
+                    if ( cpi->inter_frame_target < min_frame_target )
+                        cpi->inter_frame_target = min_frame_target;
+                }
+                else
+                    cpi->inter_frame_target = cpi->per_frame_bandwidth;
+
+                bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+                // Avoid loss of precision while avoiding overflow
+                if ( (bits_in_section>>7) > allocation_chunks )
+                    cpi->this_frame_target = alt_boost * (bits_in_section / allocation_chunks);
+                else
+                    cpi->this_frame_target = (alt_boost * bits_in_section) / allocation_chunks;
+            }
+        }
+        */
+    }
+
+    // Normal frames (gf,and inter)
+    else
+    {
+        // 2 pass
+        if (cpi->pass == 2)
+        {
+            cpi->this_frame_target = cpi->per_frame_bandwidth;
+        }
+        // 1 pass
+        else
+        {
+            // Make rate adjustment to recover bits spent in key frame
+            // Test to see if the key frame inter data rate correction should still be in force
+            if (cpi->kf_overspend_bits > 0)
+            {
+                Adjustment = (cpi->kf_bitrate_adjustment <= cpi->kf_overspend_bits) ? cpi->kf_bitrate_adjustment : cpi->kf_overspend_bits;
+
+                if (Adjustment > (cpi->per_frame_bandwidth - min_frame_target))
+                    Adjustment = (cpi->per_frame_bandwidth - min_frame_target);
+
+                cpi->kf_overspend_bits -= Adjustment;
+
+                // Calculate an inter frame bandwidth target for the next few frames designed to recover
+                // any extra bits spent on the key frame.
+                cpi->this_frame_target = cpi->per_frame_bandwidth - Adjustment;
+
+                if (cpi->this_frame_target < min_frame_target)
+                    cpi->this_frame_target = min_frame_target;
+            }
+            else
+                cpi->this_frame_target = cpi->per_frame_bandwidth;
+
+            // If appropriate make an adjustment to recover bits spent on a recent GF
+            if ((cpi->gf_overspend_bits > 0) && (cpi->this_frame_target > min_frame_target))
+            {
+                int Adjustment = (cpi->non_gf_bitrate_adjustment <= cpi->gf_overspend_bits) ? cpi->non_gf_bitrate_adjustment : cpi->gf_overspend_bits;
+
+                if (Adjustment > (cpi->this_frame_target - min_frame_target))
+                    Adjustment = (cpi->this_frame_target - min_frame_target);
+
+                cpi->gf_overspend_bits -= Adjustment;
+                cpi->this_frame_target -= Adjustment;
+            }
+
+            // Apply small + and - boosts for non gf frames
+            if ((cpi->last_boost > 150) && (cpi->frames_till_gf_update_due > 0) &&
+                (cpi->current_gf_interval >= (MIN_GF_INTERVAL << 1)))
+            {
+                // % Adjustment limited to the range 1% to 10%
+                Adjustment = (cpi->last_boost - 100) >> 5;
+
+                if (Adjustment < 1)
+                    Adjustment = 1;
+                else if (Adjustment > 10)
+                    Adjustment = 10;
+
+                // Convert to bits
+                Adjustment = (cpi->this_frame_target * Adjustment) / 100;
+
+                if (Adjustment > (cpi->this_frame_target - min_frame_target))
+                    Adjustment = (cpi->this_frame_target - min_frame_target);
+
+                if (cpi->common.frames_since_golden == (cpi->current_gf_interval >> 1))
+                    cpi->this_frame_target += ((cpi->current_gf_interval - 1) * Adjustment);
+                else
+                    cpi->this_frame_target -= Adjustment;
+            }
+        }
+    }
+
+    // Set a reduced data rate target for our initial Q calculation.
+    // This should help to save bits during earlier sections.
+    if ((cpi->oxcf.under_shoot_pct > 0) && (cpi->oxcf.under_shoot_pct <= 100))
+        cpi->this_frame_target = (cpi->this_frame_target * cpi->oxcf.under_shoot_pct) / 100;
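+    // e.g. an under_shoot_pct of 95 trims the initial target by 5%; any bits
+    // saved remain available to the buffer / rate control mechanisms.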
+
+    // Sanity check that the total sum of adjustments is not above the maximum allowed.
+    // That is, having allowed for the KF and GF penalties, we have not pushed the
+    // current interframe target too low. If the adjustment applied here cannot recover
+    // all the extra bits spent in the KF or GF then the remainder will have to be
+    // recovered over a longer time span via other buffer / rate control mechanisms.
+    if (cpi->this_frame_target < min_frame_target)
+        cpi->this_frame_target = min_frame_target;
+
+    if (!cpi->common.refresh_alt_ref_frame)
+        // Note the baseline target data rate for this inter frame.
+        cpi->inter_frame_target = cpi->this_frame_target;
+
+    // One Pass specific code
+    if (cpi->pass == 0)
+    {
+        // Adapt target frame size with respect to any buffering constraints:
+        if (cpi->buffered_mode)
+        {
+            int one_percent_bits = 1 + cpi->oxcf.optimal_buffer_level / 100;
+
+            if ((cpi->buffer_level < cpi->oxcf.optimal_buffer_level) || (cpi->bits_off_target < cpi->oxcf.optimal_buffer_level))
+            {
+                int percent_low = 0;
+
+                // Decide whether or not we need to adjust the frame data rate target.
+                //
+                // If we are below the optimal buffer fullness level and adherence
+                // to buffering constraints is important to the end usage then adjust
+                // the per frame target.
+                if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) && (cpi->buffer_level < cpi->oxcf.optimal_buffer_level))
+                {
+                    percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits;
+
+                    if (percent_low > 100)
+                        percent_low = 100;
+                    else if (percent_low < 0)
+                        percent_low = 0;
+                }
+                // Are we overshooting the long term clip data rate...
+                else if (cpi->bits_off_target < 0)
+                {
+                    // Adjust per frame data target downwards to compensate.
+                    percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8));
+
+                    if (percent_low > 100)
+                        percent_low = 100;
+                    else if (percent_low < 0)
+                        percent_low = 0;
+                }
+
+                // lower the target bandwidth for this frame.
+                cpi->this_frame_target = (cpi->this_frame_target * (100 - (percent_low / 2))) / 100;
+
+                // Are we allowing control of active_worst_allowed_q according to buffer level?
+                if (cpi->auto_worst_q)
+                {
+                    int critical_buffer_level;
+
+                    // For streaming applications the most important factor is cpi->buffer_level as this takes
+                    // into account the specified short term buffering constraints. However, hitting the long
+                    // term clip data rate target is also important.
+                    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                    {
+                        // Take the smaller of cpi->buffer_level and cpi->bits_off_target
+                        critical_buffer_level = (cpi->buffer_level < cpi->bits_off_target) ? cpi->buffer_level : cpi->bits_off_target;
+                    }
+                    // For local file playback short term buffering constraints are less of an issue
+                    else
+                    {
+                        // Consider only how we are doing for the clip as a whole
+                        critical_buffer_level = cpi->bits_off_target;
+                    }
+
+                    // Set the active worst quality based upon the selected buffer fullness number.
+                    if (critical_buffer_level < cpi->oxcf.optimal_buffer_level)
+                    {
+                        if (critical_buffer_level > (cpi->oxcf.optimal_buffer_level / 4))
+                        {
+                            int qadjustment_range = cpi->worst_quality - cpi->ni_av_qi;
+                            int above_base = (critical_buffer_level - (cpi->oxcf.optimal_buffer_level / 4));
+
+                            // Scale active worst quality between cpi->ni_av_qi when
+                            // (critical_buffer_level == cpi->oxcf.optimal_buffer_level) and
+                            // cpi->worst_quality when (critical_buffer_level == cpi->oxcf.optimal_buffer_level / 4)
+                            cpi->active_worst_quality = cpi->worst_quality - ((qadjustment_range * above_base) / (cpi->oxcf.optimal_buffer_level * 3 / 4));
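+                            // e.g. (illustrative): with critical_buffer_level halfway between
+                            // optimal / 4 and optimal, above_base is half its maximum, so
+                            // active_worst_quality lands midway between cpi->worst_quality and
+                            // cpi->ni_av_qi.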
+                        }
+                        else
+                        {
+                            cpi->active_worst_quality = cpi->worst_quality;
+                        }
+                    }
+                    else
+                    {
+                        cpi->active_worst_quality = cpi->ni_av_qi;
+                    }
+                }
+                else
+                {
+                    cpi->active_worst_quality = cpi->worst_quality;
+                }
+            }
+            else
+            {
+                int percent_high;
+
+                if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level)
+                {
+                    percent_high = (int)(100 * (cpi->bits_off_target - cpi->oxcf.optimal_buffer_level) / (cpi->total_byte_count * 8));
+
+                    if (percent_high > 100)
+                        percent_high = 100;
+                    else if (percent_high < 0)
+                        percent_high = 0;
+
+                    cpi->this_frame_target = (cpi->this_frame_target * (100 + (percent_high / 2))) / 100;
+
+                }
+
+                // Are we allowing control of active_worst_allowed_q according to buffer level?
+                if (cpi->auto_worst_q)
+                {
+                    // When using the relaxed buffer model stick to the user specified value
+                    cpi->active_worst_quality = cpi->ni_av_qi;
+                }
+                else
+                {
+                    cpi->active_worst_quality = cpi->worst_quality;
+                }
+            }
+
+            // Set active_best_quality to prevent quality rising too high
+            cpi->active_best_quality = cpi->best_quality;
+
+            // Worst quality obviously must not be better than best quality
+            if (cpi->active_worst_quality <= cpi->active_best_quality)
+                cpi->active_worst_quality = cpi->active_best_quality + 1;
+
+        }
+        // Unbuffered mode (eg. video conferencing)
+        else
+        {
+            // Set the active worst quality
+            cpi->active_worst_quality = cpi->worst_quality;
+        }
+    }
+
+    // Test to see if we have to drop a frame
+    // The auto-drop frame code is only used in buffered mode.
+    // In unbuffered mode (eg video conferencing) the decision to
+    // code or drop a frame is made outside the codec in response to real
+    // world comms or buffer considerations.
+    if (cpi->drop_frames_allowed && cpi->buffered_mode &&
+        (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+        ((cpi->common.frame_type != KEY_FRAME))) //|| !cpi->oxcf.allow_spatial_resampling) )
+    {
+        // Check for a buffer underrun crisis in which case we have to drop a frame
+        if ((cpi->buffer_level < 0))
+        {
+#if 0
+            FILE *f = fopen("dec.stt", "a");
+            fprintf(f, "%10d %10d %10d %10d ***** BUFFER EMPTY\n",
+                    (int) cpi->common.current_video_frame,
+                    cpi->decimation_factor, cpi->common.horiz_scale,
+                    (cpi->buffer_level * 100) / cpi->oxcf.optimal_buffer_level);
+            fclose(f);
+#endif
+            //vpx_log("Decoder: Drop frame due to bandwidth: %d \n",cpi->buffer_level, cpi->av_per_frame_bandwidth);
+
+            cpi->drop_frame = TRUE;
+        }
+
+#if 0
+        // Check for other drop frame criteria (Note 2 pass cbr uses decimation on whole KF sections)
+        else if ((cpi->buffer_level < cpi->oxcf.drop_frames_water_mark * cpi->oxcf.optimal_buffer_level / 100) &&
+                 (cpi->drop_count < cpi->max_drop_count) && (cpi->pass == 0))
+        {
+            cpi->drop_frame = TRUE;
+        }
+
+#endif
+
+        if (cpi->drop_frame)
+        {
+            // Update the buffer level variable.
+            cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+            cpi->buffer_level = cpi->bits_off_target;
+        }
+        else
+            cpi->drop_count = 0;
+    }
+
+    // Adjust target frame size for Golden Frames:
+    if (cpi->oxcf.error_resilient_mode == 0 &&
+        (cpi->frames_till_gf_update_due == 0) && !cpi->drop_frame)
+    {
+        //int Boost = 0;
+        int Q = (cpi->oxcf.fixed_q < 0) ? cpi->last_q[INTER_FRAME] : cpi->oxcf.fixed_q;
+
+        int gf_frame_useage = 0;      // Golden frame usage since last GF
+        int tot_mbs = cpi->recent_ref_frame_usage[INTRA_FRAME]  +
+                      cpi->recent_ref_frame_usage[LAST_FRAME]   +
+                      cpi->recent_ref_frame_usage[GOLDEN_FRAME] +
+                      cpi->recent_ref_frame_usage[ALTREF_FRAME];
+
+        int pct_gf_active = (100 * cpi->common.gf_active_count) / (cpi->common.mb_rows * cpi->common.mb_cols);
+
+        // Reset the last boost indicator
+        //cpi->last_boost = 100;
+
+        if (tot_mbs)
+            gf_frame_useage = (cpi->recent_ref_frame_usage[GOLDEN_FRAME] + cpi->recent_ref_frame_usage[ALTREF_FRAME]) * 100 / tot_mbs;
+
+        if (pct_gf_active > gf_frame_useage)
+            gf_frame_useage = pct_gf_active;
+
+        // Is a fixed manual GF frequency being used
+        if (!cpi->auto_gold)
+            cpi->common.refresh_golden_frame = TRUE;
+        else
+        {
+            // For one pass throw a GF if recent frame intra usage is low or the GF usage is high
+            if ((cpi->pass == 0) && (cpi->this_frame_percent_intra < 15 || gf_frame_useage >= 5))
+                cpi->common.refresh_golden_frame = TRUE;
+
+            // Two pass GF decision
+            else if (cpi->pass == 2)
+                cpi->common.refresh_golden_frame = TRUE;
+        }
+
+#if 0
+
+        // Debug stats
+        if (0)
+        {
+            FILE *f;
+
+            f = fopen("gf_useaget.stt", "a");
+            fprintf(f, " %8ld %10ld %10ld %10ld %10ld\n",
+                    cpi->common.current_video_frame,  cpi->gfu_boost, GFQ_ADJUSTMENT, cpi->gfu_boost, gf_frame_useage);
+            fclose(f);
+        }
+
+#endif
+
+        if (cpi->common.refresh_golden_frame == TRUE)
+        {
+            int isize_adjustment = 0;
+#if 0
+
+            if (0)   // p_gw
+            {
+                FILE *f;
+
+                f = fopen("GFexit.stt", "a");
+                fprintf(f, "%8ld GF coded\n", cpi->common.current_video_frame);
+                fclose(f);
+            }
+
+#endif
+            cpi->initial_gf_use = 0;
+
+            if (cpi->auto_adjust_gold_quantizer)
+            {
+                calc_gf_params(cpi);
+            }
+
+            // If we are using an alternate ref instead of a gf then do not apply the
+            // boost here. It will instead be applied to the altref update.
+            // Jim's modified boost
+            if (!cpi->source_alt_ref_active)
+            {
+                if (cpi->oxcf.fixed_q < 0)
+                {
+                    if (cpi->pass == 2)
+                    {
+                        cpi->this_frame_target = cpi->per_frame_bandwidth;          // The spend on the GF is defined in the two pass code for two pass encodes
+                    }
+                    else
+                    {
+                        int Boost = cpi->last_boost;
+                        int frames_in_section = cpi->frames_till_gf_update_due + 1;
+                        int allocation_chunks = (frames_in_section * 100) + (Boost - 100);
+                        int bits_in_section = cpi->inter_frame_target * frames_in_section;
+
+                        // Normalize Boost and allocation chunks down to prevent overflow
+                        while (Boost > 1000)
+                        {
+                            Boost /= 2;
+                            allocation_chunks /= 2;
+                        }
+
+                        // Avoid loss of precision while avoiding overflow
+                        if ((bits_in_section >> 7) > allocation_chunks)
+                            cpi->this_frame_target = Boost * (bits_in_section / allocation_chunks);
+                        else
+                            cpi->this_frame_target = (Boost * bits_in_section) / allocation_chunks;
+                    }
+                }
+                else
+                    cpi->this_frame_target = (baseline_bits_at_q(1, Q, cpi->common.MBs) * cpi->last_boost) / 100;
+
+            }
+            // If there is an active ARF at this location use the minimum bits on this frame
+            else
+            {
+                cpi->this_frame_target = 0;           // Minimal spend on a gf that is replacing an arf
+            }
+
+            cpi->current_gf_interval = cpi->frames_till_gf_update_due;
+
+        }
+    }
+}
+
+
+void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var)
+{
+    int    Q = cpi->common.base_qindex;
+    int    correction_factor = 100;
+    double rate_correction_factor;
+    double adjustment_limit;
+
+    int    projected_size_based_on_q = 0;
+
+    // Clear down mmx registers to allow floating point in what follows
+    vp8_clear_system_state();  //__asm emms;
+
+    if (cpi->common.frame_type == KEY_FRAME)
+    {
+        rate_correction_factor = cpi->key_frame_rate_correction_factor;
+    }
+    else
+    {
+        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+            rate_correction_factor = cpi->gf_rate_correction_factor;
+        else
+            rate_correction_factor = cpi->rate_correction_factor;
+    }
+
+    // Work out how big we would have expected the frame to be at this Q given the current correction factor.
+    // Stay in double to avoid int overflow when values are large
+    //projected_size_based_on_q = ((int)(.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) >> BPER_MB_NORMBITS;
+    projected_size_based_on_q = (int)(((.5 + rate_correction_factor * vp8_bits_per_mb[cpi->common.frame_type][Q]) * cpi->common.MBs) / (1 << BPER_MB_NORMBITS));
+
+    // Make some allowance for cpi->zbin_over_quant
+    if (cpi->zbin_over_quant > 0)
+    {
+        int Z = cpi->zbin_over_quant;
+        double Factor = 0.99;
+        double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX;
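+        // Each pass of the loop below scales the projection down by Factor,
+        // i.e. roughly 1% per zbin step, tapering towards 0.1% as Factor
+        // approaches its 0.999 cap.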
+
+        while (Z > 0)
+        {
+            Z--;
+            projected_size_based_on_q = (int)(projected_size_based_on_q * Factor);
+            Factor += factor_adjustment;
+
+            if (Factor  >= 0.999)
+                Factor = 0.999;
+        }
+    }
+
+    // Work out a size correction factor.
+    //if ( cpi->this_frame_target > 0 )
+    //  correction_factor = (100 * cpi->projected_frame_size) / cpi->this_frame_target;
+    if (projected_size_based_on_q > 0)
+        correction_factor = (100 * cpi->projected_frame_size) / projected_size_based_on_q;
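+    // e.g. (illustrative): an actual frame of 120000 bits against a projected
+    // 100000 gives correction_factor = 120; the model under-predicted by 20%
+    // and the rate correction factor is nudged upwards below.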
+
+    // More heavily damped adjustment used if we have been oscillating either side of target
+    switch (damp_var)
+    {
+    case 0:
+        adjustment_limit = 0.75;
+        break;
+    case 1:
+        adjustment_limit = 0.375;
+        break;
+    case 2:
+    default:
+        adjustment_limit = 0.25;
+        break;
+    }
+
+    //if ( (correction_factor > 102) && (Q < cpi->active_worst_quality) )
+    if (correction_factor > 102)
+    {
+        // We are not already at the worst allowable quality
+        correction_factor = (int)(100.5 + ((correction_factor - 100) * adjustment_limit));
+        rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+        // Keep rate_correction_factor within limits
+        if (rate_correction_factor > MAX_BPB_FACTOR)
+            rate_correction_factor = MAX_BPB_FACTOR;
+    }
+    //else if ( (correction_factor < 99) && (Q > cpi->active_best_quality) )
+    else if (correction_factor < 99)
+    {
+        // We are not already at the best allowable quality
+        correction_factor = (int)(100.5 - ((100 - correction_factor) * adjustment_limit));
+        rate_correction_factor = ((rate_correction_factor * correction_factor) / 100);
+
+        // Keep rate_correction_factor within limits
+        if (rate_correction_factor < MIN_BPB_FACTOR)
+            rate_correction_factor = MIN_BPB_FACTOR;
+    }
+
+    if (cpi->common.frame_type == KEY_FRAME)
+        cpi->key_frame_rate_correction_factor = rate_correction_factor;
+    else
+    {
+        if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+            cpi->gf_rate_correction_factor = rate_correction_factor;
+        else
+            cpi->rate_correction_factor = rate_correction_factor;
+    }
+}
+
+static int estimate_bits_at_q(VP8_COMP *cpi, int Q)
+{
+    int Bpm = (int)(.5 + cpi->rate_correction_factor * vp8_bits_per_mb[INTER_FRAME][Q]);
+
+    /* Attempt to retain reasonable accuracy without overflow. The cutoff is
+     * chosen such that the maximum product of Bpm and MBs fits 31 bits. The
+     * largest Bpm takes 20 bits.
+     */
+    if (cpi->common.MBs > (1 << 11))
+        return (Bpm >> BPER_MB_NORMBITS) * cpi->common.MBs;
+    else
+        return (Bpm * cpi->common.MBs) >> BPER_MB_NORMBITS;
+
+}
+
+
+int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame)
+{
+    int Q = cpi->active_worst_quality;
+
+    // Reset Zbin OQ value
+    cpi->zbin_over_quant = 0;
+
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        Q = cpi->oxcf.fixed_q;
+
+        if (cpi->common.frame_type == KEY_FRAME)
+        {
+            Q = cpi->oxcf.key_q;
+        }
+        else if (cpi->common.refresh_alt_ref_frame)
+        {
+            Q = cpi->oxcf.alt_q;
+        }
+        else if (cpi->common.refresh_golden_frame)
+        {
+            Q = cpi->oxcf.gold_q;
+        }
+
+    }
+    else
+    {
+        int i;
+        int last_error = INT_MAX;
+        int target_bits_per_mb;
+        int bits_per_mb_at_this_q;
+        double correction_factor;
+
+        // Select the appropriate correction factor based upon type of frame.
+        if (cpi->common.frame_type == KEY_FRAME)
+            correction_factor = cpi->key_frame_rate_correction_factor;
+        else
+        {
+            if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+                correction_factor = cpi->gf_rate_correction_factor;
+            else
+                correction_factor = cpi->rate_correction_factor;
+        }
+
+        // Calculate required scaling factor based on target frame size and size of frame produced using previous Q
+        if (target_bits_per_frame >= (INT_MAX >> BPER_MB_NORMBITS))
+            target_bits_per_mb = (target_bits_per_frame / cpi->common.MBs) << BPER_MB_NORMBITS;       // Case where we would overflow int
+        else
+            target_bits_per_mb = (target_bits_per_frame << BPER_MB_NORMBITS) / cpi->common.MBs;
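+        // target_bits_per_mb is kept in the same fixed point format as the
+        // vp8_bits_per_mb table so the loop below can compare the two
+        // directly. e.g. (illustrative) a 100000 bit target over 1200 MBs
+        // gives (100000 << BPER_MB_NORMBITS) / 1200.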
+
+        i = cpi->active_best_quality;
+
+        do
+        {
+            bits_per_mb_at_this_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][i]);
+
+            if (bits_per_mb_at_this_q <= target_bits_per_mb)
+            {
+                if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error)
+                    Q = i;
+                else
+                    Q = i - 1;
+
+                break;
+            }
+            else
+                last_error = bits_per_mb_at_this_q - target_bits_per_mb;
+        }
+        while (++i <= cpi->active_worst_quality);
+
+
+        // If we are at MAXQ then enable Q over-run which seeks to claw back additional bits through things like
+        // the RD multiplier and zero bin size.
+        if (Q >= MAXQ)
+        {
+            int zbin_oqmax;
+
+            double Factor = 0.99;
+            double factor_adjustment = 0.01 / 256.0; //(double)ZBIN_OQ_MAX;
+
+            if (cpi->common.frame_type == KEY_FRAME)
+                zbin_oqmax = 0; //ZBIN_OQ_MAX/16
+            else if (cpi->common.refresh_alt_ref_frame || (cpi->common.refresh_golden_frame && !cpi->source_alt_ref_active))
+                zbin_oqmax = 16;
+            else
+                zbin_oqmax = ZBIN_OQ_MAX;
+
+            /*{
+                double Factor = (double)target_bits_per_mb/(double)bits_per_mb_at_this_q;
+                double Oq;
+
+                Factor = Factor/1.2683;
+
+                Oq = pow( Factor, (1.0/-0.165) );
+
+                if ( Oq > zbin_oqmax )
+                    Oq = zbin_oqmax;
+
+                cpi->zbin_over_quant = (int)Oq;
+            }*/
+
+            // Each increment in the zbin is assumed to have a fixed effect on bitrate. This is of course not true.
+            // The effect will be highly clip dependent and may well have sudden steps.
+            // The idea here is to achieve higher effective quantizers than the normal maximum by expanding the zero
+            // bin and hence decreasing the number of low magnitude non zero coefficients.
+            while (cpi->zbin_over_quant < zbin_oqmax)
+            {
+                cpi->zbin_over_quant ++;
+
+                if (cpi->zbin_over_quant > zbin_oqmax)
+                    cpi->zbin_over_quant = zbin_oqmax;
+
+                bits_per_mb_at_this_q = (int)(bits_per_mb_at_this_q * Factor);  // Each over-run step scales the rate estimate down by Factor (roughly 1%)
+                Factor += factor_adjustment;
+
+                if (Factor  >= 0.999)
+                    Factor = 0.999;
+
+                if (bits_per_mb_at_this_q <= target_bits_per_mb)    // Break out if we get down to the target rate
+                    break;
+            }
+
+        }
+    }
+
+    return Q;
+}
+
+static int estimate_min_frame_size(VP8_COMP *cpi)
+{
+    double correction_factor;
+    int bits_per_mb_at_max_q;
+
+    // This function returns a default value for the first few frames until the correction factor has had time to adapt.
+    if (cpi->common.current_video_frame < 10)
+    {
+        if (cpi->pass == 2)
+            return (cpi->min_frame_bandwidth);
+        else
+            return cpi->per_frame_bandwidth / 3;
+    }
+
+    /*  // Select the appropriate correction factor based upon type of frame.
+        if ( cpi->common.frame_type == KEY_FRAME )
+            correction_factor = cpi->key_frame_rate_correction_factor;
+        else
+        {
+            if ( cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame )
+                correction_factor = cpi->gf_rate_correction_factor;
+            else
+                correction_factor = cpi->rate_correction_factor;
+        }*/
+
+    // We estimate at half the value we get from vp8_bits_per_mb
+    correction_factor = cpi->rate_correction_factor / 2.0;
+
+    bits_per_mb_at_max_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][MAXQ]);
+
+    return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS;
+}
+
+void vp8_adjust_key_frame_context(VP8_COMP *cpi)
+{
+    int i;
+    int av_key_frames_per_second;
+
+    // Average key frame frequency and size
+    unsigned int total_weight = 0;
+    unsigned int av_key_frame_frequency = 0;
+    unsigned int av_key_frame_bits = 0;
+
+    unsigned int output_frame_rate = (unsigned int)(100 * cpi->output_frame_rate);
+    unsigned int target_bandwidth = (unsigned int)(100 * cpi->target_bandwidth);
+
+    // Clear down mmx registers to allow floating point in what follows
+    vp8_clear_system_state();  //__asm emms;
+
+    // Update the count of total key frame bits
+    cpi->tot_key_frame_bits += cpi->projected_frame_size;
+
+    // First key frame at start of sequence is a special case. We have no frequency data.
+    if (cpi->key_frame_count == 1)
+    {
+        av_key_frame_frequency = (int)cpi->output_frame_rate * 2;            // Assume a default of 1 kf every 2 seconds
+        av_key_frame_bits = cpi->projected_frame_size;
+        av_key_frames_per_second  = output_frame_rate / av_key_frame_frequency;  // Note output_frame_rate not cpi->output_frame_rate
+    }
+    else
+    {
+        // reset keyframe context and calculate weighted average of last KEY_FRAME_CONTEXT keyframes
+        for (i = 0; i < KEY_FRAME_CONTEXT; i++)
+        {
+            if (i < KEY_FRAME_CONTEXT - 1)
+            {
+                cpi->prior_key_frame_size[i]     = cpi->prior_key_frame_size[i+1];
+                cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i+1];
+            }
+            else
+            {
+                cpi->prior_key_frame_size[KEY_FRAME_CONTEXT - 1]     = cpi->projected_frame_size;
+                cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] = cpi->frames_since_key;
+            }
+
+            av_key_frame_bits      += prior_key_frame_weight[i] * cpi->prior_key_frame_size[i];
+            av_key_frame_frequency += prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i];
+            total_weight         += prior_key_frame_weight[i];
+        }
+
+        av_key_frame_bits       /= total_weight;
+        av_key_frame_frequency  /= total_weight;
+        av_key_frames_per_second  = output_frame_rate / av_key_frame_frequency;
+
+    }
+
+    // Do we have any key frame overspend to recover?
+    if ((cpi->pass != 2) && (cpi->projected_frame_size > cpi->per_frame_bandwidth))
+    {
+        // Update the count of key frame overspend to be recovered in subsequent frames
+        // A portion of the KF overspend is treated as gf overspend (and hence recovered more quickly)
+        // as the kf is also a gf. Otherwise the few frames following each kf tend to get more bits
+        // allocated than those following other gfs.
+        cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8;
+        cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8;
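+        // e.g. (illustrative): an 8000 bit key frame overspend books 7000
+        // bits against kf_overspend_bits and 1000 against gf_overspend_bits.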
+
+        // Work out how much to try and recover per frame.
+        // For one pass we estimate the number of frames to spread it over based upon past history.
+        // For two pass we know how many frames there will be till the next kf.
+        if (cpi->pass == 2)
+        {
+            if (cpi->frames_to_key > 16)
+                cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)cpi->frames_to_key;
+            else
+                cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / 16;
+        }
+        else
+            cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)av_key_frame_frequency;
+    }
+
+    cpi->frames_since_key = 0;
+    cpi->last_key_frame_size = cpi->projected_frame_size;
+    cpi->key_frame_count++;
+}
+
+void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit)
+{
+    // Set-up bounds on acceptable frame size:
+    if (cpi->oxcf.fixed_q >= 0)
+    {
+        // Fixed Q scenario: frame size never outranges target (there is no target!)
+        *frame_under_shoot_limit = 0;
+        *frame_over_shoot_limit  = INT_MAX;
+    }
+    else
+    {
+        if (cpi->common.frame_type == KEY_FRAME)
+        {
+            *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+            *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+        }
+        else
+        {
+            if (cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame)
+            {
+                *frame_over_shoot_limit  = cpi->this_frame_target * 9 / 8;
+                *frame_under_shoot_limit = cpi->this_frame_target * 7 / 8;
+            }
+            else
+            {
+                // For CBR take buffer fullness into account
+                if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+                {
+                    if (cpi->buffer_level >= ((cpi->oxcf.optimal_buffer_level + cpi->oxcf.maximum_buffer_size) >> 1))
+                    {
+                        // Buffer is too full so relax overshoot and tighten undershoot
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 12 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 6 / 8;
+                    }
+                    else if (cpi->buffer_level <= (cpi->oxcf.optimal_buffer_level >> 1))
+                    {
+                        // Buffer is too low so relax undershoot and tighten overshoot
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 10 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 4 / 8;
+                    }
+                    else
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    }
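+                    // So for CBR the permitted band around the target is
+                    // [6/8, 12/8] of the target when the buffer is too full,
+                    // [4/8, 10/8] when it is too low, and [5/8, 11/8] otherwise.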
+                }
+                // VBR
+                // Note that tighter restrictions here can help quality but hurt encode speed
+                else
+                {
+                    *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                    *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                }
+            }
+        }
+    }
+}
diff --git a/vp8/encoder/ratectrl.h b/vp8/encoder/ratectrl.h
new file mode 100644
index 0000000..588c7a8
--- /dev/null
+++ b/vp8/encoder/ratectrl.h
@@ -0,0 +1,26 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#if !defined __INC_RATECTRL_H
+#define __INC_RATECTRL_H
+
+#include "onyx_int.h"
+
+extern void vp8_save_coding_context(VP8_COMP *cpi);
+extern void vp8_restore_coding_context(VP8_COMP *cpi);
+
+extern void vp8_setup_key_frame(VP8_COMP *cpi);
+extern void vp8_calc_iframe_target_size(VP8_COMP *cpi);
+extern void vp8_calc_pframe_target_size(VP8_COMP *cpi);
+extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var);
+extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame);
+extern void vp8_adjust_key_frame_context(VP8_COMP *cpi);
+extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit);
+
+#endif
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
new file mode 100644
index 0000000..0846996
--- /dev/null
+++ b/vp8/encoder/rdopt.c
@@ -0,0 +1,2212 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <stdio.h>
+#include <math.h>
+#include <limits.h>
+#include <assert.h>
+#include "pragmas.h"
+
+#include "tokenize.h"
+#include "treewriter.h"
+#include "onyx_int.h"
+#include "modecosts.h"
+#include "encodeintra.h"
+#include "entropymode.h"
+#include "reconinter.h"
+#include "reconintra.h"
+#include "reconintra4x4.h"
+#include "findnearmv.h"
+#include "encodemb.h"
+#include "quantize.h"
+#include "idct.h"
+#include "g_common.h"
+#include "variance.h"
+#include "mcomp.h"
+
+#include "vpx_mem/vpx_mem.h"
+#include "dct.h"
+#include "systemdependent.h"
+
+#define DIAMONDSEARCH 1
+#if CONFIG_RUNTIME_CPU_DETECT
+#define IF_RTCD(x)  (x)
+#else
+#define IF_RTCD(x)  NULL
+#endif
+
+
+void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+
+
+#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
+/*int  RDFUNC( int RM,int DM, int R, int D, int target_r )
+{
+    int rd_value;
+
+    rd_value =  ( ((128+(R)*(RM)) >> 8) + (DM)*(D) );
+
+    return rd_value;
+}*/
+
+#define UVRDFUNC(RM,DM,R,D,target_r)  RDFUNC(RM,DM,R,D,target_r)
+
+#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
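+// Note: RDFUNC and RDCOST above combine rate and distortion on one scale. The
+// rate R is weighted by the rate multiplier RM in 1/256 units (the +128 rounds
+// the shift) and added to the distortion D scaled by DM; lower is better. The
+// target_rd / target_r parameters are currently unused.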
+
+#define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
+
+
+extern const TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+extern const TOKENEXTRA *vp8_dct_value_tokens_ptr;
+extern int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+extern int *vp8_dct_value_cost_ptr;
+
+
+const int vp8_auto_speed_thresh[17] =
+{
+    1000,
+    200,
+    150,
+    130,
+    150,
+    125,
+    120,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    115,
+    105
+};
+
+const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES] =
+{
+    ZEROMV,
+    DC_PRED,
+
+    NEARESTMV,
+    NEARMV,
+
+    ZEROMV,
+    NEARESTMV,
+
+    ZEROMV,
+    NEARESTMV,
+
+    NEARMV,
+    NEARMV,
+
+    V_PRED,
+    H_PRED,
+    TM_PRED,
+
+    NEWMV,
+    NEWMV,
+    NEWMV,
+
+    SPLITMV,
+    SPLITMV,
+    SPLITMV,
+
+    B_PRED,
+};
+
+const MV_REFERENCE_FRAME vp8_ref_frame_order[MAX_MODES] =
+{
+    LAST_FRAME,
+    INTRA_FRAME,
+
+    LAST_FRAME,
+    LAST_FRAME,
+
+    GOLDEN_FRAME,
+    GOLDEN_FRAME,
+
+    ALTREF_FRAME,
+    ALTREF_FRAME,
+
+    GOLDEN_FRAME,
+    ALTREF_FRAME,
+
+    INTRA_FRAME,
+    INTRA_FRAME,
+    INTRA_FRAME,
+
+    LAST_FRAME,
+    GOLDEN_FRAME,
+    ALTREF_FRAME,
+
+    LAST_FRAME,
+    GOLDEN_FRAME,
+    ALTREF_FRAME,
+
+    INTRA_FRAME,
+};
+
+static void fill_token_costs(
+    unsigned int c      [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens],
+    const vp8_prob p    [BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens-1]
+)
+{
+    int i, j, k;
+
+
+    for (i = 0; i < BLOCK_TYPES; i++)
+        for (j = 0; j < COEF_BANDS; j++)
+            for (k = 0; k < PREV_COEF_CONTEXTS; k++)
+
+                vp8_cost_tokens((int *)(c [i][j][k]), p [i][j][k], vp8_coef_tree);
+
+}
+
+static int rd_iifactor [ 32 ] =  {    16,  16,  16,  12,   8,   4,   2,   0,
+                                      0,   0,   0,   0,   0,   0,   0,   0,
+                                      0,   0,   0,   0,   0,   0,   0,   0,
+                                      0,   0,   0,   0,   0,   0,   0,   0,
+                                 };
+
+
+
+
+// The values in this table should be reviewed
+static int sad_per_bit16lut[128] =
+{
+    4,  4, 4, 4,  4, 4, 4, 4,   // 0
+    4,  4, 4, 4,  4, 4, 4, 4,   // 1
+    4,  4, 4, 4,  4, 4, 4, 4,   // 2
+    4,  4, 4, 4,  4, 4, 4, 4,   // 3
+    4,  4, 4, 4,  4, 4, 4, 4,   // 4
+    4,  4, 12, 12, 13, 13, 14, 14, // 5
+    14, 14, 14, 15, 15, 15, 15, 15, // 6
+    15, 15, 15, 15, 15, 15, 15, 15, // 7
+    15, 15, 15, 15, 15, 16, 16, 16, // 8
+    16, 16, 18, 18, 18, 18, 19, 19, // 9
+    19, 19, 19, 19, 19, 19, 19, 19, // 10
+    20, 20, 22, 22, 22, 22, 21, 21, // 11
+    22, 22, 22, 22, 22, 22, 22, 22, // 12
+    22, 22, 22, 22, 22, 22, 22, 22, // 13
+    22, 22, 22, 22, 22, 22, 22, 22, // 14
+    22, 22, 22, 22, 22, 22, 22, 22, // 15
+};
+
+static int sad_per_bit4lut[128] =
+{
+    4,  4, 4, 4,  4, 4, 4, 4,   // 0
+    4,  4, 4, 4,  4, 4, 4, 4,   // 1
+    4,  4, 4, 4,  4, 4, 4, 4,   // 2
+    4,  4, 4, 4,  4, 4, 4, 4,   // 3
+    4,  4, 4, 4,  4, 4, 4, 4,   // 4
+    4,  4, 15, 15, 15, 15, 16, 16, // 5
+    16, 17, 17, 17, 17, 17, 17, 17, // 6
+    17, 17, 19, 19, 22, 22, 21, 21, // 7
+    23, 23, 23, 23, 23, 24, 24, 24, // 8
+    25, 25, 27, 27, 27, 27, 28, 28, // 9
+    28, 28, 29, 29, 29, 29, 29, 29, // 10
+    30, 30, 31, 31, 31, 31, 32, 32, // 11
+    34, 34, 34, 34, 34, 34, 34, 34, // 12
+    34, 34, 34, 34, 34, 34, 34, 34, // 13
+    34, 34, 34, 34, 34, 34, 34, 34, // 14
+    34, 34, 34, 34, 34, 34, 34, 34, // 15
+};
+
+void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
+{
+    cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
+    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
+}
+
+void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
+{
+    int q;
+    int i;
+    int *thresh;
+    int threshmult;
+
+    int capped_q = (Qvalue < 160) ? Qvalue : 160;
+
+    vp8_clear_system_state();  //__asm emms;
+
+    cpi->RDMULT = (int)((0.00007 * (capped_q * capped_q * capped_q * capped_q)) - (0.0125 * (capped_q * capped_q * capped_q)) +
+                        (2.25 * (capped_q * capped_q)) - (12.5 * capped_q) + 25.0);
+
+    if (cpi->RDMULT < 25)
+        cpi->RDMULT = 25;
+
+    if (cpi->pass == 2)
+    {
+        if (cpi->common.frame_type == KEY_FRAME)
+            cpi->RDMULT += (cpi->RDMULT * rd_iifactor[0]) / 16;
+        else if (cpi->next_iiratio > 31)
+            cpi->RDMULT += (cpi->RDMULT * rd_iifactor[31]) / 16;
+        else
+            cpi->RDMULT += (cpi->RDMULT * rd_iifactor[cpi->next_iiratio]) / 16;
+    }
+
+
+    // Extend rate multiplier alongside quantizer zbin increases
+    if (cpi->zbin_over_quant > 0)
+    {
+        double oq_factor = pow(1.006, cpi->zbin_over_quant);
+
+        if (oq_factor > (1.0 + ((double)cpi->zbin_over_quant / 64.0)))
+            oq_factor = (1.0 + (double)cpi->zbin_over_quant / 64.0);
+
+        cpi->RDMULT = (int)(cpi->RDMULT * oq_factor);
+    }
+
+    cpi->mb.errorperbit = (cpi->RDMULT / 100);
+
+    if (cpi->mb.errorperbit < 1)
+        cpi->mb.errorperbit = 1;
+
+    vp8_set_speed_features(cpi);
+
+    if (cpi->common.simpler_lpf)
+        cpi->common.filter_type = SIMPLE_LOOPFILTER;
+
+    q = (int)pow(Qvalue, 1.25);
+
+    if (q < 8)
+        q = 8;
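+    // The rd thresholds set below scale with q = Qvalue^1.25, e.g. Qvalue 64
+    // gives q = 181 (illustrative), so mode pruning becomes more aggressive at
+    // higher quantizers.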
+
+    if (cpi->ref_frame_flags == VP8_ALT_FLAG)
+    {
+        thresh      = &cpi->rd_threshes[THR_NEWA];
+        threshmult  = cpi->sf.thresh_mult[THR_NEWA];
+    }
+    else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
+    {
+        thresh      = &cpi->rd_threshes[THR_NEWG];
+        threshmult  = cpi->sf.thresh_mult[THR_NEWG];
+    }
+    else
+    {
+        thresh      = &cpi->rd_threshes[THR_NEWMV];
+        threshmult  = cpi->sf.thresh_mult[THR_NEWMV];
+    }
+
+    if (cpi->RDMULT > 1000)
+    {
+        cpi->RDDIV = 1;
+        cpi->RDMULT /= 100;
+
+        for (i = 0; i < MAX_MODES; i++)
+        {
+            if (cpi->sf.thresh_mult[i] < INT_MAX)
+            {
+                cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q / 100;
+            }
+            else
+            {
+                cpi->rd_threshes[i] = INT_MAX;
+            }
+
+            cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+        }
+    }
+    else
+    {
+        cpi->RDDIV = 100;
+
+        for (i = 0; i < MAX_MODES; i++)
+        {
+            if (cpi->sf.thresh_mult[i] < (INT_MAX / q))
+            {
+                cpi->rd_threshes[i] = cpi->sf.thresh_mult[i] * q;
+            }
+            else
+            {
+                cpi->rd_threshes[i] = INT_MAX;
+            }
+
+            cpi->rd_baseline_thresh[i] = cpi->rd_threshes[i];
+        }
+    }
+
+    fill_token_costs(
+        cpi->mb.token_costs,
+        (const vp8_prob( *)[8][3][11]) cpi->common.fc.coef_probs
+    );
+
+    vp8_init_mode_costs(cpi);
+
+}
+
+void vp8_auto_select_speed(VP8_COMP *cpi)
+{
+    int used = cpi->oxcf.cpu_used;
+
+    int milliseconds_for_compress = (int)(1000000 / cpi->oxcf.frame_rate);
+
+    milliseconds_for_compress = milliseconds_for_compress * (16 - cpi->oxcf.cpu_used) / 16;
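+    // e.g. cpu_used = 8 halves the per frame time budget ((16 - 8) / 16 of the
+    // frame interval) before it is compared against the running average encode
+    // and pick-mode times below.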
+
+#if 0
+
+    if (0)
+    {
+        FILE *f;
+
+        f = fopen("speed.stt", "a");
+        fprintf(f, " %8ld %10ld %10ld %10ld\n",
+                cpi->common.current_video_frame, cpi->Speed, milliseconds_for_compress, cpi->avg_pick_mode_time);
+        fclose(f);
+    }
+
+#endif
+
+    /*
+    // this is done during parameter valid check
+    if( used > 16)
+        used = 16;
+    if( used < -16)
+        used = -16;
+    */
+
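+    // Adapt cpi->Speed so the measured encode time tracks the per-frame
+    // budget: step the speed up when over budget and back it off when
+    // comfortably under, resetting the timing averages after each change.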
+    if (cpi->avg_pick_mode_time < milliseconds_for_compress && (cpi->avg_encode_time - cpi->avg_pick_mode_time) < milliseconds_for_compress)
+    {
+        if (cpi->avg_pick_mode_time == 0)
+        {
+            cpi->Speed = 4;
+        }
+        else
+        {
+            if (milliseconds_for_compress * 100 < cpi->avg_encode_time * 95)
+            {
+                cpi->Speed          += 2;
+                cpi->avg_pick_mode_time = 0;
+                cpi->avg_encode_time = 0;
+
+                if (cpi->Speed > 16)
+                {
+                    cpi->Speed = 16;
+                }
+            }
+
+            if (milliseconds_for_compress * 100 > cpi->avg_encode_time * vp8_auto_speed_thresh[cpi->Speed])
+            {
+                cpi->Speed          -= 1;
+                cpi->avg_pick_mode_time = 0;
+                cpi->avg_encode_time = 0;
+
+                // In real-time mode, cpi->speed is in [4, 16].
+                if (cpi->Speed < 4)        //if ( cpi->Speed < 0 )
+                {
+                    cpi->Speed = 4;        //cpi->Speed = 0;
+                }
+            }
+        }
+    }
+    else
+    {
+        cpi->Speed += 4;
+
+        if (cpi->Speed > 16)
+            cpi->Speed = 16;
+
+
+        cpi->avg_pick_mode_time = 0;
+        cpi->avg_encode_time = 0;
+    }
+}
+
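+// Coefficient-domain distortion for one 4x4 block: the sum of squared
+// differences between the original transform coefficients and their
+// dequantized reconstruction.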
+int vp8_block_error_c(short *coeff, short *dqcoeff)
+{
+    int i;
+    int error = 0;
+
+    for (i = 0; i < 16; i++)
+    {
+        int this_diff = coeff[i] - dqcoeff[i];
+        error += this_diff * this_diff;
+    }
+
+    return error;
+}
+
+int vp8_mbblock_error_c(MACROBLOCK *mb, int dc)
+{
+    BLOCK  *be;
+    BLOCKD *bd;
+    int i, j;
+    int berror, error = 0;
+
+    for (i = 0; i < 16; i++)
+    {
+        be = &mb->block[i];
+        bd = &mb->e_mbd.block[i];
+
+        berror = 0;
+
+        for (j = dc; j < 16; j++)
+        {
+            int this_diff = be->coeff[j] - bd->dqcoeff[j];
+            berror += this_diff * this_diff;
+        }
+
+        error += berror;
+    }
+
+    return error;
+}
+
+int vp8_mbuverror_c(MACROBLOCK *mb)
+{
+
+    BLOCK  *be;
+    BLOCKD *bd;
+
+
+    int i;
+    int error = 0;
+
+    for (i = 16; i < 24; i++)
+    {
+        be = &mb->block[i];
+        bd = &mb->e_mbd.block[i];
+
+        error += vp8_block_error_c(be->coeff, bd->dqcoeff);
+    }
+
+    return error;
+}
+
+#if !(CONFIG_REALTIME_ONLY)
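+// Distortion the macroblock would incur if every coefficient were coded as
+// zero (the skip case), scaled to match the distortion computed in
+// macro_block_yrd.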
+static int macro_block_max_error(MACROBLOCK *mb)
+{
+    int error = 0;
+    int dc = 0;
+    BLOCK  *be;
+    int i, j;
+    int berror;
+
+    dc = !(mb->e_mbd.mbmi.mode == B_PRED || mb->e_mbd.mbmi.mode == SPLITMV);
+
+    for (i = 0; i < 16; i++)
+    {
+        be = &mb->block[i];
+
+        berror = 0;
+
+        for (j = dc; j < 16; j++)
+        {
+            int this_diff = be->coeff[j];
+            berror += this_diff * this_diff;
+        }
+
+        error += berror;
+    }
+
+    for (i = 16; i < 24; i++)
+    {
+        be = &mb->block[i];
+        berror = 0;
+
+        for (j = 0; j < 16; j++)
+        {
+            int this_diff = be->coeff[j];
+            berror += this_diff * this_diff;
+        }
+
+        error += berror;
+    }
+
+    error <<= 2;
+
+    if (dc)
+    {
+        be = &mb->block[24];
+        berror = 0;
+
+        for (j = 0; j < 16; j++)
+        {
+            int this_diff = be->coeff[j];
+            berror += this_diff * this_diff;
+        }
+
+        error += berror;
+    }
+
+    error >>= 4;
+    return error;
+}
+#endif
+
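+// Estimate the chroma (U plus V) prediction SSE for the current motion
+// vector; used by the encode-breakout test to confirm that a macroblock can
+// be skipped.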
+int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd)
+{
+    unsigned char *uptr, *vptr;
+    unsigned char *upred_ptr = (*(x->block[16].base_src) + x->block[16].src);
+    unsigned char *vpred_ptr = (*(x->block[20].base_src) + x->block[20].src);
+    int uv_stride = x->block[16].src_stride;
+
+    unsigned int sse1 = 0;
+    unsigned int sse2 = 0;
+    int mv_row;
+    int mv_col;
+    int offset;
+    int pre_stride = x->e_mbd.block[16].pre_stride;
+
+    vp8_build_uvmvs(&x->e_mbd, 0);
+    mv_row = x->e_mbd.block[16].bmi.mv.as_mv.row;
+    mv_col = x->e_mbd.block[16].bmi.mv.as_mv.col;
+
+    offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    uptr = x->e_mbd.pre.u_buffer + offset;
+    vptr = x->e_mbd.pre.v_buffer + offset;
+
+    if ((mv_row | mv_col) & 7)
+    {
+        VARIANCE_INVOKE(rtcd, subpixvar8x8)(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
+        VARIANCE_INVOKE(rtcd, subpixvar8x8)(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
+        sse2 += sse1;
+    }
+    else
+    {
+        // Whole-pel mv: the subpel offsets are zero, so plain 8x8 variance
+        // gives the same result without the filtering cost.
+        VARIANCE_INVOKE(rtcd, var8x8)(uptr, pre_stride, upred_ptr, uv_stride, &sse2);
+        VARIANCE_INVOKE(rtcd, var8x8)(vptr, pre_stride, vpred_ptr, uv_stride, &sse1);
+        sse2 += sse1;
+    }
+
+    return sse2;
+
+}
+
+#if !(CONFIG_REALTIME_ONLY)
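+// Token cost of one quantized 4x4 block: walk the coefficients in zig-zag
+// order, adding the entropy cost of each token given the previous-token
+// context, plus an end-of-block token if the block ends early.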
+static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l)
+{
+    int c = !type;              /* start at coef 0, unless Y with Y2 */
+    int eob = b->eob;
+    int pt ;    /* surrounding block/prev coef predictor */
+    int cost = 0;
+    short *qcoeff_ptr = b->qcoeff;
+
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+# define QC( I)  ( qcoeff_ptr [vp8_default_zig_zag1d[I]] )
+
+    for (; c < eob; c++)
+    {
+        int v = QC(c);
+        int t = vp8_dct_value_tokens_ptr[v].Token;
+        cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [t];
+        cost += vp8_dct_value_cost_ptr[v];
+        pt = vp8_prev_token_class[t];
+    }
+
+# undef QC
+
+    if (c < 16)
+        cost += mb->token_costs [type] [vp8_coef_bands[c]] [pt] [DCT_EOB_TOKEN];
+
+    pt = (c != !type); // 0 if the EOB token came first (no coefficients coded)
+    *a = *l = pt;
+
+    return cost;
+}
+
+int vp8_rdcost_mby(MACROBLOCK *mb)
+{
+    int cost = 0;
+    int b;
+    TEMP_CONTEXT t, t2;
+    int type = 0;
+
+    MACROBLOCKD *x = &mb->e_mbd;
+
+    vp8_setup_temp_context(&t, x->above_context[Y1CONTEXT], x->left_context[Y1CONTEXT], 4);
+    vp8_setup_temp_context(&t2, x->above_context[Y2CONTEXT], x->left_context[Y2CONTEXT], 1);
+
+    if (x->mbmi.mode == SPLITMV)
+        type = 3;
+
+    for (b = 0; b < 16; b++)
+        cost += cost_coeffs(mb, x->block + b, type,
+                            t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+
+    if (x->mbmi.mode != SPLITMV)
+        cost += cost_coeffs(mb, x->block + 24, 1,
+                            t2.a + vp8_block2above[24], t2.l + vp8_block2left[24]);
+
+    return cost;
+}
+
+
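+// Try each 4x4 intra prediction mode for one subblock, keep the mode with the
+// lowest rate-distortion cost, and commit its entropy-context updates.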
+static void rd_pick_intra4x4block(
+    VP8_COMP *cpi,
+    MACROBLOCK *x,
+    BLOCK *be,
+    BLOCKD *b,
+    B_PREDICTION_MODE *best_mode,
+    B_PREDICTION_MODE above,
+    B_PREDICTION_MODE left,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+
+    int *bestrate,
+    int *bestratey,
+    int *bestdistortion)
+{
+    B_PREDICTION_MODE mode;
+    int best_rd = INT_MAX;
+    int rate = 0;
+    int distortion;
+    unsigned int *mode_costs;
+
+    ENTROPY_CONTEXT ta = *a, tempa = *a;
+    ENTROPY_CONTEXT tl = *l, templ = *l;
+
+
+    if (x->e_mbd.frame_type == KEY_FRAME)
+    {
+        mode_costs  = x->bmode_costs[above][left];
+    }
+    else
+    {
+        mode_costs = x->inter_bmode_costs;
+    }
+
+    for (mode = B_DC_PRED; mode <= B_HU_PRED; mode++)
+    {
+        int this_rd;
+        int ratey;
+
+        rate = mode_costs[mode];
+        vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, mode);
+
+        tempa = ta;
+        templ = tl;
+
+        ratey = cost_coeffs(x, b, 3, &tempa, &templ);
+        rate += ratey;
+        distortion = ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), berr)(be->coeff, b->dqcoeff) >> 2;
+
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+        if (this_rd < best_rd)
+        {
+            *bestrate = rate;
+            *bestratey = ratey;
+            *bestdistortion = distortion;
+            best_rd = this_rd;
+            *best_mode = mode;
+            *a = tempa;
+            *l = templ;
+        }
+    }
+
+    b->bmi.mode = (B_PREDICTION_MODE)(*best_mode);
+    vp8_encode_intra4x4block_rd(IF_RTCD(&cpi->rtcd), x, be, b, b->bmi.mode);
+
+}
+
+
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, int *Distortion)
+{
+    MACROBLOCKD *const xd = &mb->e_mbd;
+    int i;
+    TEMP_CONTEXT t;
+    int cost = mb->mbmode_cost [xd->frame_type] [B_PRED];
+    int distortion = 0;
+    int tot_rate_y = 0;
+
+    vp8_intra_prediction_down_copy(xd);
+    vp8_setup_temp_context(&t, xd->above_context[Y1CONTEXT], xd->left_context[Y1CONTEXT], 4);
+
+    for (i = 0; i < 16; i++)
+    {
+        MODE_INFO *const mic = xd->mode_info_context;
+        const int mis = xd->mode_info_stride;
+        const B_PREDICTION_MODE A = vp8_above_bmi(mic, i, mis)->mode;
+        const B_PREDICTION_MODE L = vp8_left_bmi(mic, i)->mode;
+        B_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
+        int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry), UNINITIALIZED_IS_SAFE(d);
+
+        rd_pick_intra4x4block(
+            cpi, mb, mb->block + i, xd->block + i, &best_mode, A, L,
+            t.a + vp8_block2above[i],
+            t.l + vp8_block2left[i], &r, &ry, &d);
+
+        cost += r;
+        distortion += d;
+        tot_rate_y += ry;
+        mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode;
+    }
+
+    *Rate = cost;
+    *rate_y += tot_rate_y;
+    *Distortion = distortion;
+
+    return RDCOST(mb->rdmult, mb->rddiv, cost, distortion);
+}
+
+int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *Rate, int *rate_y, int *Distortion)
+{
+
+    MB_PREDICTION_MODE mode;
+    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+    int rate, ratey;
+    unsigned int distortion;
+    int best_rd = INT_MAX;
+
+    // Search for the best 16x16 intra (Y) prediction mode
+    for (mode = DC_PRED; mode <= TM_PRED; mode++)
+    {
+        int this_rd;
+        int dummy;
+        rate = 0;
+
+        x->e_mbd.mbmi.mode = mode;
+
+        rate += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode];
+
+        vp8_encode_intra16x16mbyrd(IF_RTCD(&cpi->rtcd), x);
+
+        ratey = vp8_rdcost_mby(x);
+
+        rate += ratey;
+
+        VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.dst.y_buffer, x->e_mbd.dst.y_stride, &distortion, &dummy);
+
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+        if (this_rd < best_rd)
+        {
+            mode_selected = mode;
+            best_rd = this_rd;
+            *Rate = rate;
+            *rate_y = ratey;
+            *Distortion = (int)distortion;
+        }
+    }
+
+    x->e_mbd.mbmi.mode = mode_selected;
+    return best_rd;
+}
+
+
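+// Token cost of the eight 4x4 chroma blocks, using separate entropy contexts
+// for the U and V planes.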
+static int rd_cost_mbuv(MACROBLOCK *mb)
+{
+    TEMP_CONTEXT t, t2;
+    int b;
+    int cost = 0;
+    MACROBLOCKD *x = &mb->e_mbd;
+
+    vp8_setup_temp_context(&t, x->above_context[UCONTEXT], x->left_context[UCONTEXT], 2);
+    vp8_setup_temp_context(&t2, x->above_context[VCONTEXT], x->left_context[VCONTEXT], 2);
+
+    for (b = 16; b < 20; b++)
+        cost += cost_coeffs(mb, x->block + b, vp8_block2type[b],
+                            t.a + vp8_block2above[b], t.l + vp8_block2left[b]);
+
+    for (b = 20; b < 24; b++)
+        cost += cost_coeffs(mb, x->block + b, vp8_block2type[b],
+                            t2.a + vp8_block2above[b], t2.l + vp8_block2left[b]);
+
+    return cost;
+}
+
+
+unsigned int vp8_get_mbuvrecon_error(const vp8_variance_rtcd_vtable_t *rtcd, const MACROBLOCK *x) // sum of squares
+{
+    unsigned int sse0, sse1;
+    int sum0, sum1;
+    VARIANCE_INVOKE(rtcd, get8x8var)(x->src.u_buffer, x->src.uv_stride, x->e_mbd.dst.u_buffer, x->e_mbd.dst.uv_stride, &sse0, &sum0);
+    VARIANCE_INVOKE(rtcd, get8x8var)(x->src.v_buffer, x->src.uv_stride, x->e_mbd.dst.v_buffer, x->e_mbd.dst.uv_stride, &sse1, &sum1);
+    return (sse0 + sse1);
+}
+
+static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel)
+{
+    vp8_build_uvmvs(&x->e_mbd, fullpixel);
+    vp8_encode_inter16x16uvrd(IF_RTCD(&cpi->rtcd), x);
+
+
+    *rate       = rd_cost_mbuv(x);
+    *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+
+    return UVRDFUNC(x->rdmult, x->rddiv, *rate, *distortion, cpi->target_bits_per_mb);
+}
+
+int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
+{
+    MB_PREDICTION_MODE mode;
+    MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
+    int best_rd = INT_MAX;
+    int UNINITIALIZED_IS_SAFE(d), UNINITIALIZED_IS_SAFE(r);
+    int rate_to;
+
+    for (mode = DC_PRED; mode <= TM_PRED; mode++)
+    {
+        int rate;
+        int distortion;
+        int this_rd;
+
+        x->e_mbd.mbmi.uv_mode = mode;
+        vp8_encode_intra16x16mbuvrd(IF_RTCD(&cpi->rtcd), x);
+
+        rate_to = rd_cost_mbuv(x);
+        rate = rate_to + x->intra_uv_mode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.uv_mode];
+
+        distortion = vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x);
+
+        this_rd = UVRDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+
+        if (this_rd < best_rd)
+        {
+            best_rd = this_rd;
+            d = distortion;
+            r = rate;
+            *rate_tokenonly = rate_to;
+            mode_selected = mode;
+        }
+    }
+
+    *rate = r;
+    *distortion = d;
+
+    x->e_mbd.mbmi.uv_mode = mode_selected;
+    return best_rd;
+}
+#endif
+
+int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4])
+{
+    vp8_prob p [VP8_MVREFS-1];
+    assert(NEARESTMV <= m  &&  m <= SPLITMV);
+    vp8_mv_ref_probs(p, near_mv_ref_ct);
+    return vp8_cost_token(vp8_mv_ref_tree, p, VP8_MVREFENCODINGS + m);
+}
+
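+// Broadcast a whole-macroblock mode and motion vector into every 4x4 block's
+// bmi so later per-block code sees a consistent state.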
+void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv)
+{
+    int i;
+
+    x->e_mbd.mbmi.mode = mb;
+    x->e_mbd.mbmi.mv.as_mv.row = mv->row;
+    x->e_mbd.mbmi.mv.as_mv.col = mv->col;
+
+    for (i = 0; i < 16; i++)
+    {
+        B_MODE_INFO *bmi = &x->e_mbd.block[i].bmi;
+        bmi->mode = (B_PREDICTION_MODE) mb;
+        bmi->mv.as_mv.row = mv->row;
+        bmi->mv.as_mv.col = mv->col;
+    }
+}
+
+#if !(CONFIG_REALTIME_ONLY)
+int vp8_count_labels(int const *labelings)
+{
+    int i;
+    int count = 0;
+
+    for (i = 0; i < 16; i++)
+    {
+        if (labelings[i] > count)
+            count = labelings[i];
+    }
+
+    return count + 1;
+}
+
+
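+// Assign a prediction mode and motion vector to every 4x4 block carrying the
+// given label and return the signalling cost. LEFT4X4/ABOVE4X4 reuse a
+// neighbouring vector, so only a genuinely new choice on a new label pays
+// mode/mv cost.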
+static int labels2mode(
+    MACROBLOCK *x,
+    int const *labelings, int which_label,
+    B_PREDICTION_MODE this_mode,
+    MV *this_mv, MV *best_ref_mv,
+    int *mvcost[2]
+)
+{
+    MACROBLOCKD *const xd = & x->e_mbd;
+    MODE_INFO *const mic = xd->mode_info_context;
+    const int mis = xd->mode_info_stride;
+
+    int cost = 0;
+    int thismvcost = 0;
+
+    /* We have to be careful retrieving previously-encoded motion vectors.
+       Ones from this macroblock have to be pulled from the BLOCKD array
+       as they have not yet made it to the bmi array in our MB_MODE_INFO. */
+
+    int i = 0;
+
+    do
+    {
+        BLOCKD *const d = xd->block + i;
+        const int row = i >> 2,  col = i & 3;
+
+        B_PREDICTION_MODE m;
+
+        if (labelings[i] != which_label)
+            continue;
+
+        if (col  &&  labelings[i] == labelings[i-1])
+            m = LEFT4X4;
+        else if (row  &&  labelings[i] == labelings[i-4])
+            m = ABOVE4X4;
+        else
+        {
+            // the only time we should do costing for new motion vector or mode
+            // is when we are on a new label  (jbb May 08, 2007)
+            switch (m = this_mode)
+            {
+            case NEW4X4 :
+                thismvcost  = vp8_mv_bit_cost(this_mv, best_ref_mv, mvcost, 102);
+                break;
+            case LEFT4X4:
+                *this_mv = col ? d[-1].bmi.mv.as_mv : vp8_left_bmi(mic, i)->mv.as_mv;
+                break;
+            case ABOVE4X4:
+                *this_mv = row ? d[-4].bmi.mv.as_mv : vp8_above_bmi(mic, i, mis)->mv.as_mv;
+                break;
+            case ZERO4X4:
+                this_mv->row = this_mv->col = 0;
+                break;
+            default:
+                break;
+            }
+
+            if (m == ABOVE4X4)  // replace above with left if same
+            {
+                const MV mv = col ? d[-1].bmi.mv.as_mv : vp8_left_bmi(mic, i)->mv.as_mv;
+
+                if (mv.row == this_mv->row  &&  mv.col == this_mv->col)
+                    m = LEFT4X4;
+            }
+
+            cost = x->inter_bmode_costs[ m];
+        }
+
+        d->bmi.mode = m;
+        d->bmi.mv.as_mv = *this_mv;
+
+    }
+    while (++i < 16);
+
+    cost += thismvcost ;
+    return cost;
+}
+
+static int rdcost_mbsegment_y(MACROBLOCK *mb, const int *labels, int which_label, TEMP_CONTEXT *t)
+{
+    int cost = 0;
+    int b;
+    MACROBLOCKD *x = &mb->e_mbd;
+
+
+    for (b = 0; b < 16; b++)
+        if (labels[ b] == which_label)
+            cost += cost_coeffs(mb, x->block + b, 3,
+                                t->a + vp8_block2above[b],
+                                t->l + vp8_block2left[b]);
+
+    return cost;
+
+}
+static unsigned int vp8_encode_inter_mb_segment(MACROBLOCK *x, int const *labels, int which_label, const vp8_encodemb_rtcd_vtable_t *rtcd)
+{
+    int i;
+    unsigned int distortion = 0;
+
+    for (i = 0; i < 16; i++)
+    {
+        if (labels[i] == which_label)
+        {
+            BLOCKD *bd = &x->e_mbd.block[i];
+            BLOCK *be = &x->block[i];
+
+
+            vp8_build_inter_predictors_b(bd, 16, x->e_mbd.subpixel_predict);
+            ENCODEMB_INVOKE(rtcd, subb)(be, bd, 16);
+            x->short_fdct4x4rd(be->src_diff, be->coeff, 32);
+
+            // There is no way to account for the 2nd-order DC term here, so
+            // it could be discounted by zeroing it (left disabled):
+            //be->coeff[0] = 0;
+            x->quantize_brd(be, bd);
+
+            distortion += ENCODEMB_INVOKE(rtcd, berr)(be->coeff, bd->dqcoeff);
+        }
+    }
+
+    return distortion;
+}
+
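+// Rate and distortion for the luma plane of the current prediction: forward
+// transform, gather the per-block DC terms into the 2nd order (Y2) block,
+// quantize, then measure coefficient-domain error and token cost.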
+static void macro_block_yrd(MACROBLOCK *mb, int *Rate, int *Distortion, const vp8_encodemb_rtcd_vtable_t *rtcd)
+{
+    int b;
+    MACROBLOCKD *const x = &mb->e_mbd;
+    BLOCK   *const mb_y2 = mb->block + 24;
+    BLOCKD *const x_y2  = x->block + 24;
+    short *Y2DCPtr = mb_y2->src_diff;
+    BLOCK *beptr;
+    int d;
+
+    ENCODEMB_INVOKE(rtcd, submby)(mb->src_diff, mb->src.y_buffer, mb->e_mbd.predictor, mb->src.y_stride);
+
+    // Fdct and building the 2nd order block
+    for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
+    {
+        mb->short_fdct8x4rd(beptr->src_diff, beptr->coeff, 32);
+        *Y2DCPtr++ = beptr->coeff[0];
+        *Y2DCPtr++ = beptr->coeff[16];
+    }
+
+    // 2nd order fdct
+    if (x->mbmi.mode != SPLITMV)
+    {
+        mb->short_walsh4x4(mb_y2->src_diff, mb_y2->coeff, 8);
+    }
+
+    // Quantization
+    for (b = 0; b < 16; b++)
+    {
+        mb->quantize_brd(&mb->block[b], &mb->e_mbd.block[b]);
+    }
+
+    // Quantization of the 2nd order (DC) block
+    if (x->mbmi.mode != SPLITMV)
+    {
+        mb->quantize_brd(mb_y2, x_y2);
+    }
+
+    // Distortion
+    if (x->mbmi.mode == SPLITMV)
+        d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 0) << 2;
+    else
+    {
+        d = ENCODEMB_INVOKE(rtcd, mberr)(mb, 1) << 2;
+        d += ENCODEMB_INVOKE(rtcd, berr)(mb_y2->coeff, x_y2->dqcoeff);
+    }
+
+    *Distortion = (d >> 4);
+
+    // rate
+    *Rate = vp8_rdcost_mby(mb);
+}
+
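+// Search the candidate SPLITMV partitionings: for each one, pick the best
+// sub-mode and motion vector per label, accumulate the segment's
+// rate-distortion cost, and keep the cheapest partitioning overall.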
+static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, MV *best_ref_mv, int best_rd, int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, int compressor_speed, int *mvcost[2], int mvthresh, int fullpixel)
+{
+    int i, segmentation;
+    B_PREDICTION_MODE this_mode;
+    MACROBLOCKD *xc = &x->e_mbd;
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    BLOCK *c = &x->block[0];
+    BLOCKD *e = &x->e_mbd.block[0];
+    int const *labels;
+    int best_segment_rd = INT_MAX;
+    int best_seg = 0;
+    int br = 0;
+    int bd = 0;
+    int bsr = 0;
+    int bsd = 0;
+    int bestsegmentyrate = 0;
+
+    // FIX TO Rd error outrange bug PGW 9 june 2004
+    B_PREDICTION_MODE bmodes[16] = {ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
+                                    ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
+                                    ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
+                                    ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4
+                                   };
+
+    MV bmvs[16];
+    int beobs[16];
+
+    for (segmentation = 0; segmentation < VP8_NUMMBSPLITS; segmentation++)
+    {
+        int label_count;
+        int this_segment_rd = 0;
+        int label_mv_thresh;
+        int rate = 0;
+        int sbr = 0;
+        int sbd = 0;
+        int UNINITIALIZED_IS_SAFE(sseshift);
+        int segmentyrate = 0;
+
+        vp8_variance_fn_ptr_t v_fn_ptr;
+
+        TEMP_CONTEXT t;
+        TEMP_CONTEXT tb;
+        vp8_setup_temp_context(&t, xc->above_context[Y1CONTEXT], xc->left_context[Y1CONTEXT], 4);
+
+        br = 0;
+        bd = 0;
+
+        switch (segmentation)
+        {
+        case 0:
+            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8);
+            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8);
+            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
+            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
+            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
+            sseshift = 3;
+            break;
+        case 1:
+            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16);
+            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16);
+            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
+            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
+            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
+            sseshift = 3;
+            break;
+        case 2:
+            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8);
+            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8);
+            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
+            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
+            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
+            sseshift = 2;
+            break;
+        case 3:
+            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4);
+            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4);
+            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
+            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
+            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
+            sseshift = 0;
+            break;
+        }
+
+        labels = vp8_mbsplits[segmentation];
+        label_count = vp8_count_labels(labels);
+
+        // The multiplier scales the per-label mv-search threshold: a value of
+        // 64 would make it so large that mvs on segments are very rarely
+        // checked, while the 1 used here keeps it roughly equal to the
+        // macroblock threshold.
+        label_mv_thresh = 1 * mvthresh / label_count;
+
+        // Segmentation method overheads
+        rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
+
+        rate += vp8_cost_mv_ref(SPLITMV, mdcounts);
+
+        this_segment_rd += RDFUNC(x->rdmult, x->rddiv, rate, 0, cpi->target_bits_per_mb);
+        br += rate;
+
+        for (i = 0; i < label_count; i++)
+        {
+            MV mode_mv[B_MODE_COUNT];
+            int best_label_rd = INT_MAX;
+            B_PREDICTION_MODE mode_selected = ZERO4X4;
+            int j;
+            int bestlabelyrate = 0;
+
+            b = &x->block[0];
+            d = &x->e_mbd.block[0];
+
+
+            // find first label
+            for (j = 0; j < 16; j++)
+                if (labels[j] == i)
+                    break;
+
+            c = &x->block[j];
+            e = &x->e_mbd.block[j];
+
+            // search for the best motion vector on this segment
+            for (this_mode = LEFT4X4; this_mode <= NEW4X4 ; this_mode ++)
+            {
+                int distortion;
+                int this_rd;
+                int num00;
+                int labelyrate;
+
+                TEMP_CONTEXT ts;
+                vp8_setup_temp_context(&ts, &t.a[0], &t.l[0], 4);
+
+                if (this_mode == NEW4X4)
+                {
+                    int step_param = 0;
+                    int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param;
+                    int n;
+                    int thissme;
+                    int bestsme = INT_MAX;
+                    MV  temp_mv;
+
+                    // Is the best so far sufficiently good that we can't justify doing a new motion search?
+                    if (best_label_rd < label_mv_thresh)
+                        break;
+
+                    {
+                        int sadpb = x->sadperbit4;
+
+                        if (cpi->sf.search_method == HEX)
+                            bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr.vf, v_fn_ptr.sdf, x->mvsadcost, mvcost);
+                        else
+                        {
+                            bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+
+                            n = num00;
+                            num00 = 0;
+
+                            while (n < further_steps)
+                            {
+                                n++;
+
+                                if (num00)
+                                    num00--;
+                                else
+                                {
+                                    thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+
+                                    if (thissme < bestsme)
+                                    {
+                                        bestsme = thissme;
+                                        mode_mv[NEW4X4].row = temp_mv.row;
+                                        mode_mv[NEW4X4].col = temp_mv.col;
+                                    }
+                                }
+                            }
+                        }
+
+                        // Should we do a full search (best quality only)
+                        if ((compressor_speed == 0) && (bestsme >> sseshift) > 4000)
+                        {
+                            thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, &v_fn_ptr, x->mvcost, x->mvsadcost);
+
+                            if (thissme < bestsme)
+                            {
+                                bestsme = thissme;
+                                mode_mv[NEW4X4] = e->bmi.mv.as_mv;
+                            }
+                            else
+                            {
+                                // The full search result is actually worse so re-instate the previous best vector
+                                e->bmi.mv.as_mv = mode_mv[NEW4X4];
+                            }
+                        }
+                    }
+
+                    if (bestsme < INT_MAX)
+                    {
+                        if (!fullpixel)
+                            cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+                        else
+                            vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+                    }
+                }
+
+                rate = labels2mode(x, labels, i, this_mode, &mode_mv[this_mode], best_ref_mv, mvcost);
+
+                // Trap vectors that reach beyond the UMV borders
+                if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) ||
+                    ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max))
+                {
+                    continue;
+                }
+
+                distortion = vp8_encode_inter_mb_segment(x, labels, i, IF_RTCD(&cpi->rtcd.encodemb)) / 4;
+
+                labelyrate = rdcost_mbsegment_y(x, labels, i, &ts);
+                rate += labelyrate;
+
+                this_rd = RDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+
+                if (this_rd < best_label_rd)
+                {
+                    sbr = rate;
+                    sbd = distortion;
+                    bestlabelyrate = labelyrate;
+                    mode_selected = this_mode;
+                    best_label_rd = this_rd;
+                    vp8_setup_temp_context(&tb, &ts.a[0], &ts.l[0], 4);
+
+                }
+            }
+
+            vp8_setup_temp_context(&t, &tb.a[0], &tb.l[0], 4);
+
+            labels2mode(x, labels, i, mode_selected, &mode_mv[mode_selected], best_ref_mv, mvcost);
+
+            br += sbr;
+            bd += sbd;
+            segmentyrate += bestlabelyrate;
+            this_segment_rd += best_label_rd;
+
+            if ((this_segment_rd > best_rd) || (this_segment_rd > best_segment_rd))
+                break;
+        }
+
+        if ((this_segment_rd <= best_rd) && (this_segment_rd < best_segment_rd))
+        {
+            bsr = br;
+            bsd = bd;
+            bestsegmentyrate = segmentyrate;
+            best_segment_rd = this_segment_rd;
+            best_seg = segmentation;
+
+            // store everything needed to come back to this!!
+            for (i = 0; i < 16; i++)
+            {
+                BLOCKD *bd = &x->e_mbd.block[i];
+
+                bmvs[i] = bd->bmi.mv.as_mv;
+                bmodes[i] = bd->bmi.mode;
+                beobs[i] = bd->eob;
+            }
+        }
+    }
+
+    // set it to the best
+    for (i = 0; i < 16; i++)
+    {
+        BLOCKD *bd = &x->e_mbd.block[i];
+
+        bd->bmi.mv.as_mv = bmvs[i];
+        bd->bmi.mode = bmodes[i];
+        bd->eob = beobs[i];
+    }
+
+    // Trap cases where the best split mode has all vectors coded 0,0 (or all the same)
+    if (FALSE)
+    {
+        int allsame = 1;
+
+        for (i = 1; i < 16; i++)
+        {
+            if ((bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row))
+            {
+                allsame = 0;
+                break;
+            }
+        }
+
+        if (allsame)
+        {
+            best_segment_rd = INT_MAX;
+        }
+    }
+
+    *returntotrate = bsr;
+    *returndistortion = bsd;
+    *returnyrate = bestsegmentyrate;
+
+
+
+    // save partitions
+    labels = vp8_mbsplits[best_seg];
+    x->e_mbd.mbmi.partitioning = best_seg;
+    x->e_mbd.mbmi.partition_count = vp8_count_labels(labels);
+
+    for (i = 0; i < x->e_mbd.mbmi.partition_count; i++)
+    {
+        int j;
+
+        for (j = 0; j < 16; j++)
+        {
+            if (labels[j] == i)
+                break;
+        }
+
+        x->e_mbd.mbmi.partition_bmi[i].mode = x->e_mbd.block[j].bmi.mode;
+        x->e_mbd.mbmi.partition_bmi[i].mv.as_mv = x->e_mbd.block[j].bmi.mv.as_mv;
+    }
+
+    return best_segment_rd;
+}
+
+
+int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra)
+{
+    BLOCK *b = &x->block[0];
+    BLOCKD *d = &x->e_mbd.block[0];
+    MACROBLOCKD *xd = &x->e_mbd;
+    B_MODE_INFO best_bmodes[16];
+    MB_MODE_INFO best_mbmode;
+    MV best_ref_mv;
+    MV mode_mv[MB_MODE_COUNT];
+    MB_PREDICTION_MODE this_mode;
+    int num00;
+    int best_mode_index = 0;
+
+    int i;
+    int mode_index;
+    int mdcounts[4];
+    int rate;
+    int distortion;
+    int best_rd = INT_MAX;
+    int ref_frame_cost[MAX_REF_FRAMES];
+    int rate2, distortion2;
+    int uv_intra_rate, uv_intra_distortion, uv_intra_rate_tokenonly;
+    int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
+
+    //int all_rds[MAX_MODES];        // Experimental debug code.
+    //int all_rates[MAX_MODES];
+    //int all_dist[MAX_MODES];
+    //int intermodecost[MAX_MODES];
+
+    MB_PREDICTION_MODE uv_intra_mode;
+    int sse;
+    int sum;
+    int uvintra_eob = 0;
+    int tteob = 0;
+    int force_no_skip = 0;
+
+    *returnintra = INT_MAX;
+
+    cpi->mbs_tested_so_far++;          // Count of the number of MBs tested so far this frame
+
+    x->skip = 0;
+
+    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
+
+    // Experimental code
+    // Adjust the RD multiplier based on the best case distortion we saw in the most recently coded mb
+    //if ( (cpi->last_mb_distortion) > 0 && (cpi->target_bits_per_mb > 0) )
+    /*{
+        int tmprdmult;
+
+        //tmprdmult = (cpi->last_mb_distortion * 256) / ((cpi->av_per_frame_bandwidth*256)/cpi->common.MBs);
+        tmprdmult = (cpi->last_mb_distortion * 256) / cpi->target_bits_per_mb;
+        //tmprdmult = tmprdmult;
+
+        //if ( tmprdmult > cpi->RDMULT * 2 )
+        //  tmprdmult = cpi->RDMULT * 2;
+        //else if ( tmprdmult < cpi->RDMULT / 2 )
+        //  tmprdmult = cpi->RDMULT / 2;
+
+        //tmprdmult = (tmprdmult < 25) ? 25 : tmprdmult;
+
+        //x->rdmult = tmprdmult;
+
+    }*/
+
+    // Special case treatment when GF and ARF are not sensible options for reference
+    if (cpi->ref_frame_flags == VP8_LAST_FLAG)
+    {
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_zero(255);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(255)
+                                        + vp8_cost_zero(128);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(255)
+                                        + vp8_cost_one(128);
+    }
+    else
+    {
+        ref_frame_cost[LAST_FRAME]    = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_zero(cpi->prob_last_coded);
+        ref_frame_cost[GOLDEN_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(cpi->prob_last_coded)
+                                        + vp8_cost_zero(cpi->prob_gf_coded);
+        ref_frame_cost[ALTREF_FRAME]  = vp8_cost_one(cpi->prob_intra_coded)
+                                        + vp8_cost_one(cpi->prob_last_coded)
+                                        + vp8_cost_one(cpi->prob_gf_coded);
+    }
+
+    vpx_memset(mode_mv, 0, sizeof(mode_mv));
+
+    x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+    vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
+    uv_intra_mode = x->e_mbd.mbmi.uv_mode;
+    uvintra_eob = 0;
+
+    for (i = 16; i < 24; i++)
+        uvintra_eob += x->e_mbd.block[i].eob;
+
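+    // Main mode loop: evaluate each candidate (prediction mode, reference
+    // frame) pair, maintaining rolling rate2/distortion2 totals and keeping
+    // the best rate-distortion score seen so far.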
+    for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
+    {
+        int frame_cost;
+        int this_rd = INT_MAX;
+        int lf_or_gf = 0;           // Last frame (0) or gf/arf (1)
+        int disable_skip = 0;
+
+        force_no_skip = 0;
+
+        // Experimental debug code.
+        // Record of rd values recorded for this MB. -1 indicates not measured
+        //all_rds[mode_index] = -1;
+        //all_rates[mode_index] = -1;
+        //all_dist[mode_index] = -1;
+        //intermodecost[mode_index] = -1;  
+
+        // Test best rd so far against threshold for trying this mode.
+        if (best_rd <= cpi->rd_threshes[mode_index])
+            continue;
+
+
+
+        // These variables hold the rolling total cost and distortion for this mode
+        rate2 = 0;
+        distortion2 = 0;
+
+        // Where skip is allowable, add in the default per-mb cost for the no-skip case.
+        // Where we then decide to skip, we have to delete this and replace it with the
+        // cost of signalling a skip.
+        if (cpi->common.mb_no_coeff_skip)
+        {
+            rate2 += vp8_cost_bit(cpi->prob_skip_false, 0);
+        }
+
+        this_mode = vp8_mode_order[mode_index];
+
+        x->e_mbd.mbmi.mode = this_mode;
+        x->e_mbd.mbmi.uv_mode = DC_PRED;
+        x->e_mbd.mbmi.ref_frame = vp8_ref_frame_order[mode_index];
+
+        //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
+        if (cpi->is_src_frame_alt_ref)
+        {
+            if (this_mode != ZEROMV || x->e_mbd.mbmi.ref_frame != ALTREF_FRAME)
+                continue;
+        }
+
+        if (x->e_mbd.mbmi.ref_frame == LAST_FRAME)
+        {
+            if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
+                continue;
+
+            lf_or_gf = 0;  // Local last frame vs Golden frame flag
+
+            // Set up pointers for this macro block into the previous frame recon buffer
+            x->e_mbd.pre.y_buffer = cpi->common.last_frame.y_buffer + recon_yoffset;
+            x->e_mbd.pre.u_buffer = cpi->common.last_frame.u_buffer + recon_uvoffset;
+            x->e_mbd.pre.v_buffer = cpi->common.last_frame.v_buffer + recon_uvoffset;
+        }
+        else if (x->e_mbd.mbmi.ref_frame == GOLDEN_FRAME)
+        {
+
+            // not supposed to reference gold frame
+            if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
+                continue;
+
+            lf_or_gf = 1;  // Local last frame vs Golden frame flag
+
+            // Set up pointers for this macro block into the previous frame recon buffer
+            x->e_mbd.pre.y_buffer = cpi->common.golden_frame.y_buffer + recon_yoffset;
+            x->e_mbd.pre.u_buffer = cpi->common.golden_frame.u_buffer + recon_uvoffset;
+            x->e_mbd.pre.v_buffer = cpi->common.golden_frame.v_buffer + recon_uvoffset;
+        }
+        else if (x->e_mbd.mbmi.ref_frame == ALTREF_FRAME)
+        {
+            // not supposed to reference alt ref frame
+            if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
+                continue;
+
+            //if ( !cpi->source_alt_ref_active )
+            //  continue;
+
+            lf_or_gf = 1;  // Local last frame vs Golden frame flag
+
+            // Set up pointers for this macro block into the previous frame recon buffer
+            x->e_mbd.pre.y_buffer = cpi->common.alt_ref_frame.y_buffer + recon_yoffset;
+            x->e_mbd.pre.u_buffer = cpi->common.alt_ref_frame.u_buffer + recon_uvoffset;
+            x->e_mbd.pre.v_buffer = cpi->common.alt_ref_frame.v_buffer + recon_uvoffset;
+        }
+
+        vp8_find_near_mvs(&x->e_mbd,
+                          x->e_mbd.mode_info_context,
+                          &mode_mv[NEARESTMV], &mode_mv[NEARMV], &best_ref_mv,
+                          mdcounts, x->e_mbd.mbmi.ref_frame, cpi->common.ref_frame_sign_bias);
+
+
+        // Estimate the reference frame signaling cost and add it to the rolling cost variable.
+        frame_cost = ref_frame_cost[x->e_mbd.mbmi.ref_frame];
+        rate2 += frame_cost;
+
+        if (this_mode <= B_PRED)
+        {
+            for (i = 0; i < 16; i++)
+            {
+                vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
+            }
+        }
+
+        // Check to see if the testing frequency for this mode is at its max
+        // If so then prevent it from being tested and increase the threshold for its testing
+        if (cpi->mode_test_hit_counts[mode_index] && (cpi->mode_check_freq[mode_index] > 1))
+        {
+            if (cpi->mbs_tested_so_far  <= cpi->mode_check_freq[mode_index] * cpi->mode_test_hit_counts[mode_index])
+            {
+                // Increase the threshold for coding this mode to make it less likely to be chosen
+                cpi->rd_thresh_mult[mode_index] += 4;
+
+                if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                    cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+                cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+
+                continue;
+            }
+        }
+
+        // We have now reached the point where we are going to test the current mode so increment the counter for the number of times it has been tested
+        cpi->mode_test_hit_counts[mode_index] ++;
+
+        // Experimental code. Special case for gf and arf zeromv modes. Increase zbin size to suppress noise
+        if (cpi->zbin_mode_boost_enabled)
+        {
+            if ((vp8_mode_order[mode_index] == ZEROMV) && (vp8_ref_frame_order[mode_index] != LAST_FRAME))
+                cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
+            else
+                cpi->zbin_mode_boost = 0;
+
+            vp8cx_mb_init_quantizer(cpi, x);
+        }
+
+        switch (this_mode)
+        {
+        case B_PRED:
+
+            // Note the rate value returned here includes the cost of coding the B_PRED mode: x->mbmode_cost[x->e_mbd.frame_type][B_PRED];
+            vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate, &rate_y, &distortion);
+            rate2 += rate;
+            //rate_y = rate;
+            distortion2 += distortion;
+            rate2 += uv_intra_rate;
+            rate_uv = uv_intra_rate_tokenonly;
+            distortion2 += uv_intra_distortion;
+            break;
+
+        case SPLITMV:
+        {
+            int frame_cost_rd = RDFUNC(x->rdmult, x->rddiv, frame_cost, 0, cpi->target_bits_per_mb);
+            int saved_rate = rate2;
+
+            // vp8_rd_pick_best_mbsegmentation looks only at Y and does not account for frame_cost.
+            // (best_rd - frame_cost_rd) is thus a conservative breakout number.
+            int breakout_rd = best_rd - frame_cost_rd;
+            int tmp_rd;
+
+            if (x->e_mbd.mbmi.ref_frame == LAST_FRAME)
+                tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWMV], cpi->common.full_pixel) ;
+            else if (x->e_mbd.mbmi.ref_frame == GOLDEN_FRAME)
+                tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWG], cpi->common.full_pixel) ;
+            else
+                tmp_rd = vp8_rd_pick_best_mbsegmentation(cpi, x, &best_ref_mv, breakout_rd, mdcounts, &rate, &rate_y, &distortion, cpi->compressor_speed, x->mvcost, cpi->rd_threshes[THR_NEWA], cpi->common.full_pixel) ;
+
+            rate2 += rate;
+            distortion2 += distortion;
+
+            // If even the 'Y' rd value of split is higher than the best so far then don't bother looking at UV
+            if (tmp_rd < breakout_rd)
+            {
+                // Now work out UV cost and add it in
+                vp8_rd_inter_uv(cpi, x, &rate, &distortion, cpi->common.full_pixel);
+                rate2 += rate;
+                rate_uv = rate;
+                distortion2 += distortion;
+
+            }
+            else
+            {
+                this_rd = INT_MAX;
+                disable_skip = 1;
+            }
+
+            // Trap cases where the best split mode has all vectors coded 0,0 (or all the same)
+            if (0)
+            {
+                int allsame = 1;
+
+                for (i = 1; i < 16; i++)
+                {
+                    BLOCKD *bd = &x->e_mbd.block[i];
+
+                    if (bd->bmi.mv.as_int != x->e_mbd.block[0].bmi.mv.as_int)   //(bmvs[i].col != bmvs[i-1].col) || (bmvs[i].row != bmvs[i-1].row ) )
+                    {
+                        allsame = 0;
+                        break;
+                    }
+                }
+
+                if (allsame)
+                {
+                    // reset mode and mv and jump to newmv
+                    this_mode = NEWMV;
+                    distortion2 = 0;
+                    rate2 = saved_rate;
+                    mode_mv[NEWMV].row = x->e_mbd.block[0].bmi.mv.as_mv.row;
+                    mode_mv[NEWMV].col = x->e_mbd.block[0].bmi.mv.as_mv.col;
+                    rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+                    goto mv_selected;
+                }
+            }
+
+            // trap cases where the 8x8s can be promoted to 8x16s or 16x8s
+            if (0)//x->e_mbd.mbmi.partition_count == 4)
+            {
+
+                if (x->e_mbd.mbmi.partition_bmi[0].mv.as_int == x->e_mbd.mbmi.partition_bmi[1].mv.as_int
+                    && x->e_mbd.mbmi.partition_bmi[2].mv.as_int == x->e_mbd.mbmi.partition_bmi[3].mv.as_int)
+                {
+                    const int *labels = vp8_mbsplits[2];
+                    x->e_mbd.mbmi.partitioning = 0;
+                    rate -= vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + 2);
+                    rate += vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings);
+                    //rate -=  x->inter_bmode_costs[  x->e_mbd.mbmi.partition_bmi[1]];
+                    //rate -=  x->inter_bmode_costs[  x->e_mbd.mbmi.partition_bmi[3]];
+                    x->e_mbd.mbmi.partition_bmi[1] = x->e_mbd.mbmi.partition_bmi[2];
+                }
+            }
+
+        }
+        break;
+        case DC_PRED:
+        case V_PRED:
+        case H_PRED:
+        case TM_PRED:
+            x->e_mbd.mbmi.ref_frame = INTRA_FRAME;
+            vp8_build_intra_predictors_mby_ptr(&x->e_mbd);
+            {
+                macro_block_yrd(x, &rate, &distortion, IF_RTCD(&cpi->rtcd.encodemb)) ;
+                rate2 += rate;
+                rate_y = rate;
+                distortion2 += distortion;
+                rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mbmi.mode];
+                rate2 += uv_intra_rate;
+                rate_uv = uv_intra_rate_tokenonly;
+                distortion2 += uv_intra_distortion;
+            }
+            break;
+
+        case NEWMV:
+
+            // Decrement full search counter
+            if (cpi->check_freq[lf_or_gf] > 0)
+                cpi->check_freq[lf_or_gf] --;
+
+            {
+                int thissme;
+                int bestsme = INT_MAX;
+                int step_param = cpi->sf.first_step;
+                int search_range;
+                int further_steps;
+                int n;
+
+                // Work out how long a search we should do
+                search_range = MAXF(abs(best_ref_mv.col), abs(best_ref_mv.row)) >> 3;
+
+                if (search_range >= x->vector_range)
+                    x->vector_range = search_range;
+                else if (x->vector_range > cpi->sf.min_fs_radius)
+                    x->vector_range--;
+
+                // Initial step/diamond search
+                {
+                    int sadpb = x->sadperbit16;
+
+                    if (cpi->sf.search_method == HEX)
+                    {
+                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+                    }
+                    else
+                    {
+                        bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+                        mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                        mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+                        // Further step/diamond searches as necessary
+                        n = 0;
+                        further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
+
+                        n = num00;
+                        num00 = 0;
+
+                        while (n < further_steps)
+                        {
+                            n++;
+
+                            if (num00)
+                                num00--;
+                            else
+                            {
+                                thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+
+                                if (thissme < bestsme)
+                                {
+                                    bestsme = thissme;
+                                    mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                                    mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+                                }
+                                else
+                                {
+                                    d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+                                    d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+                                }
+                            }
+                        }
+                    }
+
+                }
+
+                // Should we do a full search
+                if (!cpi->check_freq[lf_or_gf] || cpi->do_full[lf_or_gf])
+                {
+                    int thissme;
+                    int full_flag_thresh = 0;
+
+                    // Update x->vector_range based on best vector found in step search
+                    search_range = MAXF(abs(d->bmi.mv.as_mv.row), abs(d->bmi.mv.as_mv.col));
+
+                    if (search_range > x->vector_range)
+                        x->vector_range = search_range;
+                    else
+                        search_range = x->vector_range;
+
+                    // Apply limits
+                    search_range = (search_range > cpi->sf.max_fs_radius) ? cpi->sf.max_fs_radius : search_range;
+                    {
+                        int sadpb = x->sadperbit16 >> 2;
+                        thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr, x->mvcost, x->mvsadcost);
+                    }
+
+                    // Barrier threshold to initiating full search
+                    // full_flag_thresh = 10 + (thissme >> 7);
+                    if ((thissme + full_flag_thresh) < bestsme)
+                    {
+                        cpi->do_full[lf_or_gf] ++;
+                        bestsme = thissme;
+                    }
+                    else if (thissme < bestsme)
+                        bestsme = thissme;
+                    else
+                    {
+                        cpi->do_full[lf_or_gf] = cpi->do_full[lf_or_gf] >> 1;
+                        cpi->check_freq[lf_or_gf] = cpi->sf.full_freq[lf_or_gf];
+
+                        // The full search result is actually worse so re-instate the previous best vector
+                        d->bmi.mv.as_mv.row = mode_mv[NEWMV].row;
+                        d->bmi.mv.as_mv.col = mode_mv[NEWMV].col;
+                    }
+                }
+
+                if (bestsme < INT_MAX)
+                    // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost);  // normal mvc=11
+                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, cpi->fn_ptr.svf, cpi->fn_ptr.vf, x->mvcost);
+
+                mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
+                mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
+
+                // Add the new motion vector cost to our rolling cost variable
+                rate2 += vp8_mv_bit_cost(&mode_mv[NEWMV], &best_ref_mv, x->mvcost, 96);
+
+            }
+
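+        // no break: NEWMV deliberately falls through to the mv clipping and
+        // validation shared with NEARESTMV/NEARMV below.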
+        case NEARESTMV:
+        case NEARMV:
+
+            // Clip "next_nearest" so that it does not extend to far out of image
+            if (mode_mv[this_mode].col < (xd->mb_to_left_edge - LEFT_TOP_MARGIN))
+                mode_mv[this_mode].col = xd->mb_to_left_edge - LEFT_TOP_MARGIN;
+            else if (mode_mv[this_mode].col > xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN)
+                mode_mv[this_mode].col = xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN;
+
+            if (mode_mv[this_mode].row < (xd->mb_to_top_edge - LEFT_TOP_MARGIN))
+                mode_mv[this_mode].row = xd->mb_to_top_edge - LEFT_TOP_MARGIN;
+            else if (mode_mv[this_mode].row > xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN)
+                mode_mv[this_mode].row = xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN;
+
+            // Do not bother proceeding if the vector (from newmv, nearest or near) is 0,0 as this should then be coded using the zeromv mode.
+            if (((this_mode == NEARMV) || (this_mode == NEARESTMV)) &&
+                ((mode_mv[this_mode].row == 0) && (mode_mv[this_mode].col == 0)))
+                continue;
+
+        case ZEROMV:
+
+        mv_selected:
+
+            // Trap vectors that reach beyond the UMV borders
+            // Note that ALL NEWMV, NEARESTMV, NEARMV and ZEROMV code drops through to this point
+            // because of the lack of break statements in the previous two cases.
+            if (((mode_mv[this_mode].row >> 3) < x->mv_row_min) || ((mode_mv[this_mode].row >> 3) > x->mv_row_max) ||
+                ((mode_mv[this_mode].col >> 3) < x->mv_col_min) || ((mode_mv[this_mode].col >> 3) > x->mv_col_max))
+                continue;
+
+            vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]);
+            vp8_build_inter_predictors_mby(&x->e_mbd);
+            VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);
+
+            if (cpi->active_map_enabled && x->active_ptr[0] == 0)
+            {
+                x->skip = 1;
+            }
+            else if (sse < x->encode_breakout)
+            {
+                // Check u and v to make sure skip is ok
+                int sse2 = 0;
+
+                sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
+
+                if (sse2 * 2 < x->encode_breakout)
+                {
+                    x->skip = 1;
+                    distortion2 = sse;
+                    rate2 = 500;
+
+                    disable_skip = 1;    // We have no real rate data so trying to adjust for rate_y and rate_uv below will cause problems.
+                    this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+
+                    break;              // (PGW) Move break here from below - for now at least
+                }
+                else
+                    x->skip = 0;
+            }
+
+            //intermodecost[mode_index] = vp8_cost_mv_ref(this_mode, mdcounts);   // Experimental debug code
+
+            // Add in the Mv/mode cost
+            rate2 += vp8_cost_mv_ref(this_mode, mdcounts);
+
+            // Y cost and distortion
+            macro_block_yrd(x, &rate, &distortion, IF_RTCD(&cpi->rtcd.encodemb));
+            rate2 += rate;
+            rate_y = rate;
+            distortion2 += distortion;
+
+            // UV cost and distortion
+            vp8_rd_inter_uv(cpi, x, &rate, &distortion, cpi->common.full_pixel);
+            rate2 += rate;
+            rate_uv = rate;
+            distortion2 += distortion;
+            break;
+
+        default:
+            break;
+        }
+
+        if (!disable_skip)
+        {
+            // Test for the condition where the skip block will be activated because there are no non-zero coefficients, and make any necessary rate adjustment
+            if (cpi->common.mb_no_coeff_skip)
+            {
+                tteob = 0;
+
+                for (i = 0; i <= 24; i++)
+                {
+                    tteob += x->e_mbd.block[i].eob;
+                }
+
+                if (tteob == 0)
+                {
+#if 1
+                    rate2 -= (rate_y + rate_uv);
+
+                    // Back out no skip flag costing and add in skip flag costing
+                    if (cpi->prob_skip_false)
+                    {
+                        rate2 += vp8_cost_bit(cpi->prob_skip_false, 1);
+                        rate2 -= vp8_cost_bit(cpi->prob_skip_false, 0);
+                    }
+
+#else
+                    int rateuseskip;
+                    int ratenotuseskip;
+
+
+
+                    ratenotuseskip = rate_y + rate_uv + vp8_cost_bit(cpi->prob_skip_false, 0);
+                    rateuseskip    = vp8_cost_bit(cpi->prob_skip_false, 1);
+
+                    if (1) // rateuseskip<ratenotuseskip)
+                    {
+                        rate2 -= ratenotuseskip;
+                        rate2 += rateuseskip;
+                        force_no_skip = 0;
+                    }
+                    else
+                    {
+                        force_no_skip = 1;
+                    }
+
+#endif
+                }
+
+#if             0
+                else
+                {
+                    int rateuseskip;
+                    int ratenotuseskip;
+                    int maxdistortion;
+                    int minrate;
+                    int skip_rd;
+
+                    // distortion when no coeff is encoded
+                    maxdistortion = macro_block_max_error(x);
+
+                    ratenotuseskip = rate_y + rate_uv + vp8_cost_bit(cpi->prob_skip_false, 0);
+                    rateuseskip    = vp8_cost_bit(cpi->prob_skip_false, 1);
+
+                    minrate         = rateuseskip - ratenotuseskip;
+
+                    skip_rd = RDFUNC(x->rdmult, x->rddiv, minrate, maxdistortion - distortion2, cpi->target_bits_per_mb);
+
+                    if (skip_rd + 50 < 0 && x->e_mbd.mbmi.ref_frame != INTRA_FRAME && rate_y + rate_uv < 4000)
+                    {
+                        force_no_skip = 1;
+                        rate2       = rate2 + rateuseskip - ratenotuseskip;
+                        distortion2 =  maxdistortion;
+                    }
+                    else
+                    {
+                        force_no_skip = 0;
+                    }
+
+                }
+
+#endif
+
+            }
+
+            // Calculate the final RD estimate for this mode
+            this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+        }
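+
+        // Conceptually (assuming RDFUNC folds rate and distortion into one
+        // Lagrangian-style cost, roughly distortion + lambda * rate), a mode
+        // with higher distortion can still win below if it saves enough
+        // bits, and vice versa.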
+
+        // Experimental debug code.
+        //all_rds[mode_index] = this_rd;
+        //all_rates[mode_index] = rate2;
+        //all_dist[mode_index] = distortion2;
+
+        if ((x->e_mbd.mbmi.ref_frame == INTRA_FRAME)  && (this_rd < *returnintra))
+        {
+            *returnintra = this_rd ;
+        }
+
+        // Did this mode help, i.e. is it the new best mode?
+        if (this_rd < best_rd || x->skip)
+        {
+            // Note index of best mode so far
+            best_mode_index = mode_index;
+            x->e_mbd.mbmi.force_no_skip = force_no_skip;
+
+            if (this_mode <= B_PRED)
+            {
+                x->e_mbd.mbmi.uv_mode = uv_intra_mode;
+            }
+
+            *returnrate = rate2;
+            *returndistortion = distortion2;
+            best_rd = this_rd;
+            vpx_memcpy(&best_mbmode, &x->e_mbd.mbmi, sizeof(MB_MODE_INFO));
+
+            for (i = 0; i < 16; i++)
+            {
+                vpx_memcpy(&best_bmodes[i], &x->e_mbd.block[i].bmi, sizeof(B_MODE_INFO));
+            }
+
+            // Testing this mode gave rise to an improvement in best error score. Lower threshold a bit for next time
+            cpi->rd_thresh_mult[mode_index] = (cpi->rd_thresh_mult[mode_index] >= (MIN_THRESHMULT + 2)) ? cpi->rd_thresh_mult[mode_index] - 2 : MIN_THRESHMULT;
+            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+        }
+
+        // If the mode did not help improve the best error case then raise the threshold for testing that mode next time around.
+        else
+        {
+            cpi->rd_thresh_mult[mode_index] += 4;
+
+            if (cpi->rd_thresh_mult[mode_index] > MAX_THRESHMULT)
+                cpi->rd_thresh_mult[mode_index] = MAX_THRESHMULT;
+
+            cpi->rd_threshes[mode_index] = (cpi->rd_baseline_thresh[mode_index] >> 7) * cpi->rd_thresh_mult[mode_index];
+        }
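+
+        // Net effect of the two branches above: each mode's activation
+        // threshold is a baseline scaled by a multiplier that is nudged down
+        // by 2 (to a floor of MIN_THRESHMULT) when the mode is picked and up
+        // by 4 (to a ceiling of MAX_THRESHMULT) when it is not, so modes
+        // that rarely win are tried less and less often.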
+
+        if (x->skip)
+            break;
+    }
+
+    // Reduce the activation RD thresholds for the best choice mode
+    if ((cpi->rd_baseline_thresh[best_mode_index] > 0) && (cpi->rd_baseline_thresh[best_mode_index] < (INT_MAX >> 2)))
+    {
+        int best_adjustment = (cpi->rd_thresh_mult[best_mode_index] >> 2);
+
+        cpi->rd_thresh_mult[best_mode_index] = (cpi->rd_thresh_mult[best_mode_index] >= (MIN_THRESHMULT + best_adjustment)) ? cpi->rd_thresh_mult[best_mode_index] - best_adjustment : MIN_THRESHMULT;
+        cpi->rd_threshes[best_mode_index] = (cpi->rd_baseline_thresh[best_mode_index] >> 7) * cpi->rd_thresh_mult[best_mode_index];
+
+        // If we chose a split mode then reset the new MV thresholds as well
+        /*if ( vp8_mode_order[best_mode_index] == SPLITMV )
+        {
+            best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWMV] >> 4);
+            cpi->rd_thresh_mult[THR_NEWMV] = (cpi->rd_thresh_mult[THR_NEWMV] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWMV]-best_adjustment: MIN_THRESHMULT;
+            cpi->rd_threshes[THR_NEWMV] = (cpi->rd_baseline_thresh[THR_NEWMV] >> 7) * cpi->rd_thresh_mult[THR_NEWMV];
+
+            best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWG] >> 4);
+            cpi->rd_thresh_mult[THR_NEWG] = (cpi->rd_thresh_mult[THR_NEWG] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWG]-best_adjustment: MIN_THRESHMULT;
+            cpi->rd_threshes[THR_NEWG] = (cpi->rd_baseline_thresh[THR_NEWG] >> 7) * cpi->rd_thresh_mult[THR_NEWG];
+
+            best_adjustment = 4; //(cpi->rd_thresh_mult[THR_NEWA] >> 4);
+            cpi->rd_thresh_mult[THR_NEWA] = (cpi->rd_thresh_mult[THR_NEWA] >= (MIN_THRESHMULT+best_adjustment)) ? cpi->rd_thresh_mult[THR_NEWA]-best_adjustment: MIN_THRESHMULT;
+            cpi->rd_threshes[THR_NEWA] = (cpi->rd_baseline_thresh[THR_NEWA] >> 7) * cpi->rd_thresh_mult[THR_NEWA];
+        }*/
+
+    }
+
+    // If we have chosen new mv or split then decay the full search check count more quickly.
+    if ((vp8_mode_order[best_mode_index] == NEWMV) || (vp8_mode_order[best_mode_index] == SPLITMV))
+    {
+        int lf_or_gf = (vp8_ref_frame_order[best_mode_index] == LAST_FRAME) ? 0 : 1;
+
+        if (cpi->check_freq[lf_or_gf] && !cpi->do_full[lf_or_gf])
+        {
+            cpi->check_freq[lf_or_gf] --;
+        }
+    }
+
+    // Keep a record of the best mode index that we chose
+    cpi->last_best_mode_index = best_mode_index;
+
+    // Note how often each mode is chosen as best
+    cpi->mode_chosen_counts[best_mode_index] ++;
+
+
+    if (cpi->is_src_frame_alt_ref && (best_mbmode.mode != ZEROMV || best_mbmode.ref_frame != ALTREF_FRAME))
+    {
+        best_mbmode.mode = ZEROMV;
+        best_mbmode.ref_frame = ALTREF_FRAME;
+        best_mbmode.mv.as_int = 0;
+        best_mbmode.uv_mode = 0;
+        best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0;
+        best_mbmode.partitioning = 0;
+        best_mbmode.dc_diff = 0;
+
+        vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+        for (i = 0; i < 16; i++)
+        {
+            vpx_memset(&x->e_mbd.block[i].bmi, 0, sizeof(B_MODE_INFO));
+        }
+
+        x->e_mbd.mbmi.mv.as_int = 0;
+
+        return best_rd;
+    }
+
+
+    // macroblock modes
+    vpx_memcpy(&x->e_mbd.mbmi, &best_mbmode, sizeof(MB_MODE_INFO));
+
+    for (i = 0; i < 16; i++)
+    {
+        vpx_memcpy(&x->e_mbd.block[i].bmi, &best_bmodes[i], sizeof(B_MODE_INFO));
+    }
+
+    x->e_mbd.mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv;
+
+    return best_rd;
+}
+#endif
+
diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h
new file mode 100644
index 0000000..c6eae4b
--- /dev/null
+++ b/vp8/encoder/rdopt.h
@@ -0,0 +1,20 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_RDOPT_H
+#define __INC_RDOPT_H
+void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue);
+int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion);
+int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion);
+int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_to, int *distortion);
+extern int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+
+
+#endif
diff --git a/vp8/encoder/sad_c.c b/vp8/encoder/sad_c.c
new file mode 100644
index 0000000..74c6bd7
--- /dev/null
+++ b/vp8/encoder/sad_c.c
@@ -0,0 +1,248 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <stdlib.h>
+
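+/* Note: these C reference versions ignore the max_sad argument; the
+   parameter presumably exists so that optimized implementations sharing
+   this signature can stop accumulating once the running SAD exceeds the
+   best candidate found so far. */
+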
+unsigned int vp8_sad16x16_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int max_sad)
+{
+
+    int r, c;
+    unsigned int sad = 0;
+
+    for (r = 0; r < 16; r++)
+    {
+        for (c = 0; c < 16; c++)
+        {
+            sad += abs(src_ptr[c] - ref_ptr[c]);
+        }
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+    }
+
+    return sad;
+}
+
+
+static __inline
+unsigned int sad_mx_n_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int m,
+    int n)
+{
+
+    int r, c;
+    unsigned int sad = 0;
+
+    for (r = 0; r < n; r++)
+    {
+        for (c = 0; c < m; c++)
+        {
+            sad += abs(src_ptr[c] - ref_ptr[c]);
+        }
+
+        src_ptr += src_stride;
+        ref_ptr += ref_stride;
+    }
+
+    return sad;
+}
+
+
+unsigned int vp8_sad8x8_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int max_sad)
+{
+
+    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 8);
+}
+
+
+unsigned int vp8_sad16x8_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int max_sad)
+{
+
+    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 16, 8);
+
+}
+
+
+unsigned int vp8_sad8x16_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int max_sad)
+{
+
+    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 8, 16);
+}
+
+
+unsigned int vp8_sad4x4_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    int max_sad)
+{
+
+    return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, 4, 4);
+}
+
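+/* The x3 variants below compute the SAD at three horizontally adjacent
+   reference positions (ref_ptr, ref_ptr + 1, ref_ptr + 2) in a single
+   call, which suits a one-pixel-step horizontal sweep during motion
+   search; the x4d variants take four independent reference pointers
+   instead.  All of them pass 0x7fffffff, i.e. no effective max_sad
+   bound. */
+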
+void vp8_sad16x16x3_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad16x8x3_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x8x3_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x16x3_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad4x4x3_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr,
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr  , ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 1, ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr + 2, ref_stride, 0x7fffffff);
+}
+
+void vp8_sad16x16x4d_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr[],
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+    sad_array[3] = vp8_sad16x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad16x8x4d_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr[],
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+    sad_array[3] = vp8_sad16x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x8x4d_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr[],
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+    sad_array[3] = vp8_sad8x8_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad8x16x4d_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr[],
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+    sad_array[3] = vp8_sad8x16_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
+
+void vp8_sad4x4x4d_c(
+    unsigned char *src_ptr,
+    int  src_stride,
+    unsigned char *ref_ptr[],
+    int  ref_stride,
+    unsigned int *sad_array
+)
+{
+    sad_array[0] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[0], ref_stride, 0x7fffffff);
+    sad_array[1] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[1], ref_stride, 0x7fffffff);
+    sad_array[2] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[2], ref_stride, 0x7fffffff);
+    sad_array[3] = vp8_sad4x4_c(src_ptr, src_stride, ref_ptr[3], ref_stride, 0x7fffffff);
+}
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
new file mode 100644
index 0000000..df214a8
--- /dev/null
+++ b/vp8/encoder/ssim.c
@@ -0,0 +1,521 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_scale/yv12config.h"
+#include "math.h"
+
+#define C1 (float)(64 * 64 * 0.01*255*0.01*255)
+#define C2 (float)(64 * 64 * 0.03*255*0.03*255)
+
+static int width_y;
+static int height_y;
+static int height_uv;
+static int width_uv;
+static int stride_uv;
+static int stride;
+static int lumimask;
+static int luminance;
+static double plane_summed_weights = 0;
+
+static short img12_sum_block[8*4096*4096*2];
+
+static short img1_sum[8*4096*2];
+static short img2_sum[8*4096*2];
+static int   img1_sq_sum[8*4096*2];
+static int   img2_sq_sum[8*4096*2];
+static int   img12_mul_sum[8*4096*2];
+
+
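+/* One 8x8 window of the standard SSIM index:
+
+     SSIM = (2*mu_x*mu_y + C1) * (2*cov_xy + C2)
+          / ((mu_x^2 + mu_y^2 + C1) * (var_x + var_y + C2))
+
+   Everything is kept in fixed point: the incoming mu_x/mu_y are sums over
+   the 64 pixels of the window, so e.g. 64*pre_mu_x2 - mu_x*mu_x is 64^2
+   times the variance -- which is why C1 and C2 above carry the 64*64
+   scale factor. */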
+double vp8_similarity
+(
+    int mu_x,
+    int mu_y,
+    int pre_mu_x2,
+    int pre_mu_y2,
+    int pre_mu_xy2
+)
+{
+    int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy;
+
+    mu_x2 = mu_x * mu_x;
+    mu_y2 = mu_y * mu_y;
+    mu_xy = mu_x * mu_y;
+
+    theta_x2 = 64 * pre_mu_x2 - mu_x2;
+    theta_y2 = 64 * pre_mu_y2 - mu_y2;
+    theta_xy = 64 * pre_mu_xy2 - mu_xy;
+
+    return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2));
+}
+
+double vp8_ssim
+(
+    const unsigned char *img1,
+    const unsigned char *img2,
+    int stride_img1,
+    int stride_img2,
+    int width,
+    int height
+)
+{
+    int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp;
+
+    double plane_quality, weight, mean;
+
+    short *img1_sum_ptr1, *img1_sum_ptr2;
+    short *img2_sum_ptr1, *img2_sum_ptr2;
+    int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2;
+    int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2;
+    int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2;
+
+    plane_quality = 0;
+
+    if (lumimask)
+        plane_summed_weights = 0.0f;
+    else
+        plane_summed_weights = (height - 7) * (width - 7);
+
+    //some prologue for the main loop
+    temp = 8 * width;
+
+    img1_sum_ptr1      = img1_sum + temp;
+    img2_sum_ptr1      = img2_sum + temp;
+    img1_sq_sum_ptr1   = img1_sq_sum + temp;
+    img2_sq_sum_ptr1   = img2_sq_sum + temp;
+    img12_mul_sum_ptr1 = img12_mul_sum + temp;
+
+    for (x = 0; x < width; x++)
+    {
+        img1_sum[x]      = img1[x];
+        img2_sum[x]      = img2[x];
+        img1_sq_sum[x]   = img1[x] * img1[x];
+        img2_sq_sum[x]   = img2[x] * img2[x];
+        img12_mul_sum[x] = img1[x] * img2[x];
+
+        img1_sum_ptr1[x]      = 0;
+        img2_sum_ptr1[x]      = 0;
+        img1_sq_sum_ptr1[x]   = 0;
+        img2_sq_sum_ptr1[x]   = 0;
+        img12_mul_sum_ptr1[x] = 0;
+    }
+
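+    // Column sums live in a ring buffer of 9 rows: row y writes its running
+    // (cumulative) column sums into slot y % 9, and once 8 rows are in hand
+    // the sum over the last 8 lines is recovered by subtracting the slot
+    // from 8 rows earlier, i.e. slot (y + 1) % 9.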
+    //the main loop
+    for (y = 1; y < height; y++)
+    {
+        img1 += stride_img1;
+        img2 += stride_img2;
+
+        temp = (y - 1) % 9 * width;
+
+        img1_sum_ptr1      = img1_sum + temp;
+        img2_sum_ptr1      = img2_sum + temp;
+        img1_sq_sum_ptr1   = img1_sq_sum + temp;
+        img2_sq_sum_ptr1   = img2_sq_sum + temp;
+        img12_mul_sum_ptr1 = img12_mul_sum + temp;
+
+        temp = y % 9 * width;
+
+        img1_sum_ptr2      = img1_sum + temp;
+        img2_sum_ptr2      = img2_sum + temp;
+        img1_sq_sum_ptr2   = img1_sq_sum + temp;
+        img2_sq_sum_ptr2   = img2_sq_sum + temp;
+        img12_mul_sum_ptr2 = img12_mul_sum + temp;
+
+        for (x = 0; x < width; x++)
+        {
+            img1_sum_ptr2[x]      = img1_sum_ptr1[x] + img1[x];
+            img2_sum_ptr2[x]      = img2_sum_ptr1[x] + img2[x];
+            img1_sq_sum_ptr2[x]   = img1_sq_sum_ptr1[x] + img1[x] * img1[x];
+            img2_sq_sum_ptr2[x]   = img2_sq_sum_ptr1[x] + img2[x] * img2[x];
+            img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x];
+        }
+
+        if (y > 6)
+        {
+            //calculate the sum over the last 8 lines by subtracting the running sum from 8 lines back from the present sum
+            temp = (y + 1) % 9 * width;
+
+            img1_sum_ptr1      = img1_sum + temp;
+            img2_sum_ptr1      = img2_sum + temp;
+            img1_sq_sum_ptr1   = img1_sq_sum + temp;
+            img2_sq_sum_ptr1   = img2_sq_sum + temp;
+            img12_mul_sum_ptr1 = img12_mul_sum + temp;
+
+            for (x = 0; x < width; x++)
+            {
+                img1_sum_ptr1[x]      = img1_sum_ptr2[x] - img1_sum_ptr1[x];
+                img2_sum_ptr1[x]      = img2_sum_ptr2[x] - img2_sum_ptr1[x];
+                img1_sq_sum_ptr1[x]   = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x];
+                img2_sq_sum_ptr1[x]   = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x];
+                img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x];
+            }
+
+            //here we calculate the sum over the 8x8 block of pixels
+            //this is done by sliding a window across the column sums for the last 8 lines
+            //each time adding the new column sum, and subtracting the one which fell out of the window
+            img1_block      = 0;
+            img2_block      = 0;
+            img1_sq_block   = 0;
+            img2_sq_block   = 0;
+            img12_mul_block = 0;
+
+            //prologue, and calculation of the similarity measure from the first 8 column sums
+            for (x = 0; x < 8; x++)
+            {
+                img1_block      += img1_sum_ptr1[x];
+                img2_block      += img2_sum_ptr1[x];
+                img1_sq_block   += img1_sq_sum_ptr1[x];
+                img2_sq_block   += img2_sq_sum_ptr1[x];
+                img12_mul_block += img12_mul_sum_ptr1[x];
+            }
+
+            if (lumimask)
+            {
+                y2 = y - 7;
+                x2 = 0;
+
+                if (luminance)
+                {
+                    mean = (img2_block + img1_block) / 128.0f;
+
+                    if (!(y2 % 2 || x2 % 2))
+                        *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
+                }
+                else
+                {
+                    mean = *(img12_sum_block + y2 * width_uv + x2);
+                    mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
+                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
+                    mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
+
+                    mean /= 512.0f;
+                }
+
+                weight = mean < 40 ? 0.0f :
+                         (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
+                plane_summed_weights += weight;
+
+                plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+            }
+            else
+                plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+
+            //and for the rest
+            for (x = 8; x < width; x++)
+            {
+                img1_block      = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8];
+                img2_block      = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8];
+                img1_sq_block   = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8];
+                img2_sq_block   = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8];
+                img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8];
+
+                if (lumimask)
+                {
+                    y2 = y - 7;
+                    x2 = x - 7;
+
+                    if (luminance)
+                    {
+                        mean = (img2_block + img1_block) / 128.0f;
+
+                        if (!(y2 % 2 || x2 % 2))
+                            *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block;
+                    }
+                    else
+                    {
+                        mean = *(img12_sum_block + y2 * width_uv + x2);
+                        mean += *(img12_sum_block + y2 * width_uv + x2 + 4);
+                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2);
+                        mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4);
+
+                        mean /= 512.0f;
+                    }
+
+                    weight = mean < 40 ? 0.0f :
+                             (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f);
+                    plane_summed_weights += weight;
+
+                    plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+                }
+                else
+                    plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block);
+            }
+        }
+    }
+
+    if (plane_summed_weights == 0)
+        return 1.0f;
+    else
+        return plane_quality / plane_summed_weights;
+}
+
+double vp8_calc_ssim
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    int lumamask,
+    double *weight
+)
+{
+    double a, b, c;
+    double frame_weight;
+    double ssimv;
+
+    width_y = source->y_width;
+    height_y = source->y_height;
+    height_uv = source->uv_height;
+    width_uv = source->uv_width;
+    stride_uv = dest->uv_stride;
+    stride = dest->y_stride;
+
+    lumimask = lumamask;
+
+    luminance = 1;
+    a = vp8_ssim(source->y_buffer, dest->y_buffer,
+                 source->y_stride, dest->y_stride, source->y_width, source->y_height);
+    luminance = 0;
+
+    frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7));
+
+    if (frame_weight == 0)
+        a = b = c = 1.0f;
+    else
+    {
+        b = vp8_ssim(source->u_buffer, dest->u_buffer,
+                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
+
+        c = vp8_ssim(source->v_buffer, dest->v_buffer,
+                     source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height);
+    }
+
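+    // Combine the plane scores with a fixed 80/10/10 luma/chroma weighting.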
+    ssimv = a * .8 + .1 * (b + c);
+
+    *weight = frame_weight;
+
+    return ssimv;
+}
+
+// Google version of SSIM
+#define KERNEL 3
+#define KERNEL_SIZE  (2 * KERNEL + 1)
+
+typedef unsigned char uint8;
+typedef unsigned int uint32;
+
+static const int K[KERNEL_SIZE] =
+{
+    1, 4, 11, 16, 11, 4, 1    // 16 * exp(-0.3 * i * i)
+};
+static const double ki_w = 1. / 2304.;  // 1 / sum(i:0..6, j:0..6) K[i]*K[j]
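+// (Arithmetic check: each row of K sums to 1 + 4 + 11 + 16 + 11 + 4 + 1 = 48,
+// so the full 2D weight sum is 48 * 48 = 2304.)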
+double get_ssimg(const uint8 *org, const uint8 *rec,
+                 int xo, int yo, int W, int H,
+                 const int stride1, const int stride2
+                )
+{
+    // TODO(skal): use summed tables
+    int y, x;
+
+    const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL;
+    const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL;
+    const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL;
+    const int xmax = (xo + KERNEL > W - 1) ? W - 1 : xo + KERNEL;
+    // The worst case of accumulation is a row weight of 48 = 16 + 2 * (11 + 4 + 1)
+    // with a diff of 255, squared. That gives a max value of 0x8ee0900,
+    // which fits into a 32-bit integer.
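+    // (Check: the full 2D weight sum is 48 * 48 = 2304, and
+    // 2304 * 255 * 255 = 149817600 = 0x8ee0900 < 2^31.)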
+    uint32 w = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+    org += ymin * stride1;
+    rec += ymin * stride2;
+
+    for (y = ymin; y <= ymax; ++y, org += stride1, rec += stride2)
+    {
+        const int Wy = K[KERNEL + y - yo];
+
+        for (x = xmin; x <= xmax; ++x)
+        {
+            const  int Wxy = Wy * K[KERNEL + x - xo];
+            // TODO(skal): inlined assembly
+            w   += Wxy;
+            xm  += Wxy * org[x];
+            ym  += Wxy * rec[x];
+            xxm += Wxy * org[x] * org[x];
+            xym += Wxy * org[x] * rec[x];
+            yym += Wxy * rec[x] * rec[x];
+        }
+    }
+
+    {
+        const double iw = 1. / w;
+        const double iwx = xm * iw;
+        const double iwy = ym * iw;
+        double sxx = xxm * iw - iwx * iwx;
+        double syy = yym * iw - iwy * iwy;
+
+        // small errors are possible, due to rounding. Clamp to zero.
+        if (sxx < 0.) sxx = 0.;
+
+        if (syy < 0.) syy = 0.;
+
+        {
+            const double sxsy = sqrt(sxx * syy);
+            const double sxy = xym * iw - iwx * iwy;
+            static const double C11 = (0.01 * 0.01) * (255 * 255);
+            static const double C22 = (0.03 * 0.03) * (255 * 255);
+            static const double C33 = (0.015 * 0.015) * (255 * 255);
+            const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+            const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
+
+            const double s = (sxy + C33) / (sxsy + C33);
+            return l * c * s;
+
+        }
+    }
+
+}
+
+double get_ssimfull_kernelg(const uint8 *org, const uint8 *rec,
+                            int xo, int yo, int W, int H,
+                            const int stride1, const int stride2)
+{
+    // TODO(skal): use summed tables
+    // The worst case of accumulation is a row weight of 48 = 16 + 2 * (11 + 4 + 1)
+    // with a diff of 255, squared. That gives a max value of 0x8ee0900,
+    // which fits into a 32-bit integer.
+    int y_, x_;
+    uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0;
+    org += (yo - KERNEL) * stride1;
+    org += (xo - KERNEL);
+    rec += (yo - KERNEL) * stride2;
+    rec += (xo - KERNEL);
+
+    for (y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride1, rec += stride2)
+    {
+        const int Wy = K[y_];
+
+        for (x_ = 0; x_ < KERNEL_SIZE; ++x_)
+        {
+            const int Wxy = Wy * K[x_];
+            // TODO(skal): inlined assembly
+            const int org_x = org[x_];
+            const int rec_x = rec[x_];
+            xm  += Wxy * org_x;
+            ym  += Wxy * rec_x;
+            xxm += Wxy * org_x * org_x;
+            xym += Wxy * org_x * rec_x;
+            yym += Wxy * rec_x * rec_x;
+        }
+    }
+
+    {
+        const double iw = ki_w;
+        const double iwx = xm * iw;
+        const double iwy = ym * iw;
+        double sxx = xxm * iw - iwx * iwx;
+        double syy = yym * iw - iwy * iwy;
+
+        // small errors are possible, due to rounding. Clamp to zero.
+        if (sxx < 0.) sxx = 0.;
+
+        if (syy < 0.) syy = 0.;
+
+        {
+            const double sxsy = sqrt(sxx * syy);
+            const double sxy = xym * iw - iwx * iwy;
+            static const double C11 = (0.01 * 0.01) * (255 * 255);
+            static const double C22 = (0.03 * 0.03) * (255 * 255);
+            static const double C33 = (0.015 * 0.015) * (255 * 255);
+            const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11);
+            const double c = (2. * sxsy      + C22) / (sxx + syy + C22);
+            const double s = (sxy + C33) / (sxsy + C33);
+            return l * c * s;
+        }
+    }
+}
+
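+/* Sum the per-pixel SSIM over a whole plane: the clipping kernel
+   (get_ssimg) handles the KERNEL-pixel border where the 7x7 window would
+   run off the image, and the fixed-weight kernel covers the interior.
+   The caller divides the total by the pixel count to get a mean score. */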
+double calc_ssimg(const uint8 *org, const uint8 *rec,
+                  const int image_width, const int image_height,
+                  const int stride1, const int stride2
+                 )
+{
+    int j, i;
+    double SSIM = 0.;
+
+    for (j = 0; j < KERNEL; ++j)
+    {
+        for (i = 0; i < image_width; ++i)
+        {
+            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+        }
+    }
+
+    for (j = KERNEL; j < image_height - KERNEL; ++j)
+    {
+        for (i = 0; i < KERNEL; ++i)
+        {
+            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+        }
+
+        for (i = KERNEL; i < image_width - KERNEL; ++i)
+        {
+            SSIM += get_ssimfull_kernelg(org, rec, i, j,
+                                         image_width, image_height, stride1, stride2);
+        }
+
+        for (i = image_width - KERNEL; i < image_width; ++i)
+        {
+            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+        }
+    }
+
+    for (j = image_height - KERNEL; j < image_height; ++j)
+    {
+        for (i = 0; i < image_width; ++i)
+        {
+            SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2);
+        }
+    }
+
+    return SSIM;
+}
+
+
+double vp8_calc_ssimg
+(
+    YV12_BUFFER_CONFIG *source,
+    YV12_BUFFER_CONFIG *dest,
+    double *ssim_y,
+    double *ssim_u,
+    double *ssim_v
+)
+{
+    double ssim_all = 0;
+    int ysize  = source->y_width * source->y_height;
+    int uvsize = ysize / 4;
+
+    *ssim_y = calc_ssimg(source->y_buffer, dest->y_buffer,
+                         source->y_width, source->y_height,
+                         source->y_stride, dest->y_stride);
+
+
+    *ssim_u = calc_ssimg(source->u_buffer, dest->u_buffer,
+                         source->uv_width, source->uv_height,
+                         source->uv_stride, dest->uv_stride);
+
+
+    *ssim_v = calc_ssimg(source->v_buffer, dest->v_buffer,
+                         source->uv_width, source->uv_height,
+                         source->uv_stride, dest->uv_stride);
+
+    ssim_all = (*ssim_y + *ssim_u + *ssim_v) / (ysize + uvsize + uvsize);
+    *ssim_y /= ysize;
+    *ssim_u /= uvsize;
+    *ssim_v /= uvsize;
+    return ssim_all;
+}
diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c
new file mode 100644
index 0000000..33ddd64
--- /dev/null
+++ b/vp8/encoder/tokenize.c
@@ -0,0 +1,636 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+#include "onyx_int.h"
+#include "tokenize.h"
+#include "vpx_mem/vpx_mem.h"
+
+/* Global event counters used for accumulating statistics across several
+   compressions, then generating context.c = initial stats. */
+
+#ifdef ENTROPY_STATS
+_int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
+#endif
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t);
+void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x);
+
+TOKENEXTRA vp8_dct_value_tokens[DCT_MAX_VALUE*2];
+TOKENEXTRA *vp8_dct_value_tokens_ptr;
+int vp8_dct_value_cost[DCT_MAX_VALUE*2];
+int *vp8_dct_value_cost_ptr;
+#if 0
+int skip_true_count = 0;
+int skip_false_count = 0;
+#endif
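+
+/* Build two lookup tables indexed by coefficient value (biased by
+   DCT_MAX_VALUE so negative values land below the midpoint): the token and
+   extra-bit payload for each value, and the cost of those extra bits.
+   Magnitudes 0..4 map directly to their own tokens; larger magnitudes take
+   the category token with the largest base_val not exceeding them, with the
+   remainder carried in Extra, shifted up one bit to leave room for the
+   sign bit. */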
+static void fill_value_tokens()
+{
+
+    TOKENEXTRA *const t = vp8_dct_value_tokens + DCT_MAX_VALUE;
+    vp8_extra_bit_struct *const e = vp8_extra_bits;
+
+    int i = -DCT_MAX_VALUE;
+    int sign = 1;
+
+    do
+    {
+        if (!i)
+            sign = 0;
+
+        {
+            const int a = sign ? -i : i;
+            int eb = sign;
+
+            if (a > 4)
+            {
+                int j = 4;
+
+                while (++j < 11  &&  e[j].base_val <= a) {}
+
+                t[i].Token = --j;
+                eb |= (a - e[j].base_val) << 1;
+            }
+            else
+                t[i].Token = a;
+
+            t[i].Extra = eb;
+        }
+
+        // initialize the cost for extra bits for all possible coefficient values.
+        {
+            int cost = 0;
+            vp8_extra_bit_struct *p = vp8_extra_bits + t[i].Token;
+
+            if (p->base_val)
+            {
+                const int extra = t[i].Extra;
+                const int Length = p->Len;
+
+                if (Length)
+                    cost += vp8_treed_cost(p->tree, p->prob, extra >> 1, Length);
+
+                cost += vp8_cost_bit(vp8_prob_half, extra & 1); /* sign */
+                vp8_dct_value_cost[i + DCT_MAX_VALUE] = cost;
+            }
+
+        }
+
+    }
+    while (++i < DCT_MAX_VALUE);
+
+    vp8_dct_value_tokens_ptr = vp8_dct_value_tokens + DCT_MAX_VALUE;
+    vp8_dct_value_cost_ptr   = vp8_dct_value_cost + DCT_MAX_VALUE;
+}
+
+static void tokenize2nd_order_b
+(
+    const BLOCKD *const b,
+    TOKENEXTRA **tp,
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    int c = 0;          /* start at DC */
+    const int eob = b->eob;     /* one beyond last nonzero coeff */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    int x;
+    const short *qcoeff_ptr = b->qcoeff;
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+    do
+    {
+        const int band = vp8_coef_bands[c];
+
+        if (c < eob)
+        {
+            int rc = vp8_default_zig_zag1d[c];
+            const int v = qcoeff_ptr[rc];
+
+            assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            x        = vp8_dct_value_tokens_ptr[v].Token;
+        }
+        else
+            x = DCT_EOB_TOKEN;
+
+        t->Token = x;
+        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+        t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
+
+        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+        ++cpi->coef_counts       [type] [band] [pt] [x];
+    }
+    while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < 16);
+
+    *tp = t;
+    pt = (c != !type); /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+
+}
+
+static void tokenize1st_order_b
+(
+    const BLOCKD *const b,
+    TOKENEXTRA **tp,
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    int c = type ? 0 : 1;       /* start at DC unless type 0 */
+    const int eob = b->eob;     /* one beyond last nonzero coeff */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    int x;
+    const short *qcoeff_ptr = b->qcoeff;
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+
+    do
+    {
+        const int band = vp8_coef_bands[c];
+
+        x = DCT_EOB_TOKEN;
+
+        if (c < eob)
+        {
+            int rc = vp8_default_zig_zag1d[c];
+            const int v = qcoeff_ptr[rc];
+
+            assert(-DCT_MAX_VALUE <= v  &&  v < (DCT_MAX_VALUE));
+
+            t->Extra = vp8_dct_value_tokens_ptr[v].Extra;
+            x        = vp8_dct_value_tokens_ptr[v].Token;
+        }
+
+        t->Token = x;
+        t->context_tree = cpi->common.fc.coef_probs [type] [band] [pt];
+
+        t->section = frametype * BLOCK_TYPES * 2 + 2 * type + (c == 0);
+        t->skip_eob_node = pt == 0 && ((band > 0 && type > 0) || (band > 1 && type == 0));
+
+        ++cpi->coef_counts       [type] [band] [pt] [x];
+    }
+    while (pt = vp8_prev_token_class[x], ++t, c < eob  &&  ++c < 16);
+
+    *tp = t;
+    pt = (c != !type); /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+
+}
+#if 0
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+{
+    //int i;
+    ENTROPY_CONTEXT **const A = x->above_context;
+    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    int plane_type;
+    int b;
+
+    TOKENEXTRA *start = *t;
+    TOKENEXTRA *tp = *t;
+
+    x->mbmi.dc_diff = 1;
+
+    vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
+
+    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+    {
+        plane_type = 3;
+    }
+    else
+    {
+        tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type,
+                            A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi);
+        plane_type = 0;
+
+    }
+
+    for (b = 0; b < 16; b++)
+        tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type,
+                            A[vp8_block2context[b]] + vp8_block2above[b],
+                            L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+    for (b = 16; b < 24; b++)
+        tokenize1st_order_b(x->block + b, t, 2, x->frame_type,
+                            A[vp8_block2context[b]] + vp8_block2above[b],
+                            L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+    if (cpi->common.mb_no_coeff_skip)
+    {
+        x->mbmi.mb_skip_coeff = 1;
+
+        while ((tp != *t) && x->mbmi.mb_skip_coeff)
+        {
+            x->mbmi.mb_skip_coeff = (x->mbmi.mb_skip_coeff && (tp->Token == DCT_EOB_TOKEN));
+            tp ++;
+        }
+
+        if (x->mbmi.mb_skip_coeff == 1)
+        {
+            x->mbmi.dc_diff = 0;
+            //redo the counts
+            vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts));
+
+            *t = start;
+            cpi->skip_true_count++;
+
+            //skip_true_count++;
+        }
+        else
+        {
+
+            cpi->skip_false_count++;
+            //skip_false_count++;
+        }
+    }
+}
+#else
+void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+{
+    //int i;
+    ENTROPY_CONTEXT **const A = x->above_context;
+    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    int plane_type;
+    int b;
+
+    TOKENEXTRA *start = *t;
+    TOKENEXTRA *tp = *t;
+
+    x->mbmi.dc_diff = 1;
+
+#if 0
+
+    if (x->mbmi.force_no_skip)
+    {
+        x->mbmi.mb_skip_coeff = 1;
+        //reset for next_mb.
+        x->mbmi.force_no_skip = 0;
+    }
+
+#endif
+
+#if 1
+
+    if (x->mbmi.mb_skip_coeff)
+    {
+
+        cpi->skip_true_count++;
+
+        if (!cpi->common.mb_no_coeff_skip)
+            vp8_stuff_mb(cpi, x, t);
+        else
+        {
+            vp8_fix_contexts(cpi, x);
+        }
+
+        if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+            x->mbmi.dc_diff = 0;
+        else
+            x->mbmi.dc_diff = 1;
+
+
+        return;
+    }
+
+    cpi->skip_false_count++;
+#endif
+#if 0
+
+    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+    {
+        int i, skip = 1;
+
+        for (i = 0; i < 24; i++)
+            skip &= (!x->block[i].eob);
+
+        if (skip != x->mbmi.mb_skip_coeff)
+            skip += 0;
+
+        x->mbmi.mb_skip_coeff = skip;
+    }
+    else
+    {
+        int i, skip = 1;
+
+        for (i = 0; i < 16; i++)
+            skip &= (x->block[i].eob < 2);
+
+        for (i = 16; i < 25; i++)
+            skip &= (!x->block[i].eob);
+
+        if (skip != x->mbmi.mb_skip_coeff)
+            skip += 0;
+
+        x->mbmi.mb_skip_coeff = skip;
+    }
+
+    vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts));
+#endif
+
+    if (x->mbmi.mode == B_PRED || x->mbmi.mode == SPLITMV)
+    {
+        plane_type = 3;
+    }
+    else
+    {
+        tokenize2nd_order_b(x->block + 24, t, 1, x->frame_type,
+                            A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi);
+        plane_type = 0;
+
+    }
+
+    for (b = 0; b < 16; b++)
+        tokenize1st_order_b(x->block + b, t, plane_type, x->frame_type,
+                            A[vp8_block2context[b]] + vp8_block2above[b],
+                            L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+    for (b = 16; b < 24; b++)
+        tokenize1st_order_b(x->block + b, t, 2, x->frame_type,
+                            A[vp8_block2context[b]] + vp8_block2above[b],
+                            L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+#if 0
+
+    if (cpi->common.mb_no_coeff_skip)
+    {
+        int skip = 1;
+
+        while ((tp != *t) && skip)
+        {
+            skip = (skip && (tp->Token == DCT_EOB_TOKEN));
+            tp ++;
+        }
+
+        if (skip != x->mbmi.mb_skip_coeff)
+            skip += 0;
+
+        x->mbmi.mb_skip_coeff = skip;
+
+        if (x->mbmi.mb_skip_coeff == 1)
+        {
+            x->mbmi.dc_diff = 0;
+            //redo the counts
+            vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts));
+
+            *t = start;
+            cpi->skip_true_count++;
+            //skip_true_count++;
+        }
+        else
+        {
+
+            cpi->skip_false_count++;
+            //skip_false_count++;
+        }
+    }
+
+#endif
+}
+#endif
+
+#ifdef ENTROPY_STATS
+
+void init_context_counters(void)
+{
+    vpx_memset(context_counters, 0, sizeof(context_counters));
+}
+
+void print_context_counters()
+{
+
+    int type, band, pt, t;
+
+    FILE *const f = fopen("context.c", "w");
+
+    fprintf(f, "#include \"entropy.h\"\n");
+
+    fprintf(f, "\n/* *** GENERATED FILE: DO NOT EDIT *** */\n\n");
+
+    fprintf(f, "int Contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];\n\n");
+
+    fprintf(f, "const int default_contexts[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens] = {");
+
+# define Comma( X) (X? ",":"")
+
+    type = 0;
+
+    do
+    {
+        fprintf(f, "%s\n  { /* block Type %d */", Comma(type), type);
+
+        band = 0;
+
+        do
+        {
+            fprintf(f, "%s\n    { /* Coeff Band %d */", Comma(band), band);
+
+            pt = 0;
+
+            do
+            {
+                fprintf(f, "%s\n      {", Comma(pt));
+
+                t = 0;
+
+                do
+                {
+                    const _int64 x = context_counters [type] [band] [pt] [t];
+                    const int y = (int) x;
+
+                    assert(x == (_int64) y);  /* no overflow handling yet */
+                    fprintf(f, "%s %d", Comma(t), y);
+
+                }
+                while (++t < vp8_coef_tokens);
+
+                fprintf(f, "}");
+            }
+            while (++pt < PREV_COEF_CONTEXTS);
+
+            fprintf(f, "\n    }");
+
+        }
+        while (++band < COEF_BANDS);
+
+        fprintf(f, "\n  }");
+    }
+    while (++type < BLOCK_TYPES);
+
+    fprintf(f, "\n};\n");
+    fclose(f);
+}
+#endif
+
+
+void vp8_tokenize_initialize()
+{
+    fill_value_tokens();
+}
+
+
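+/* The stuff_* helpers emit a lone DCT_EOB_TOKEN for a block with no
+   coefficients; vp8_stuff_mb below uses them to code an entirely skipped
+   macroblock when there is no per-MB skip flag in the bitstream, so the
+   decoder still sees a well-formed, all-EOB token stream. */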
+static __inline void stuff2nd_order_b
+(
+    const BLOCKD *const b,
+    TOKENEXTRA **tp,
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    (void) frametype;
+    (void) type;
+    (void) b;
+
+    t->Token = DCT_EOB_TOKEN;
+    t->context_tree = cpi->common.fc.coef_probs [1] [0] [pt];
+    t->section = 11;
+    t->skip_eob_node = 0;
+    ++cpi->coef_counts       [1] [0] [pt] [DCT_EOB_TOKEN];
+    ++t;
+
+    *tp = t;
+    pt = 0;
+    *a = *l = pt;
+
+}
+
+static __inline void stuff1st_order_b
+(
+    const BLOCKD *const b,
+    TOKENEXTRA **tp,
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    (void) frametype;
+    (void) type;
+    (void) b;
+
+    t->Token = DCT_EOB_TOKEN;
+    t->context_tree = cpi->common.fc.coef_probs [0] [1] [pt];
+    t->section = 8;
+    t->skip_eob_node = 0;
+    ++cpi->coef_counts       [0] [1] [pt] [DCT_EOB_TOKEN];
+    ++t;
+    *tp = t;
+    pt = 0; /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+
+}
+static __inline
+void stuff1st_order_buv
+(
+    const BLOCKD *const b,
+    TOKENEXTRA **tp,
+    const int type,     /* which plane: 0=Y no DC, 1=Y2, 2=UV, 3=Y with DC */
+    const FRAME_TYPE frametype,
+    ENTROPY_CONTEXT *a,
+    ENTROPY_CONTEXT *l,
+    VP8_COMP *cpi
+)
+{
+    int pt; /* near block/prev token context index */
+    TOKENEXTRA *t = *tp;        /* store tokens starting here */
+    VP8_COMBINEENTROPYCONTEXTS(pt, *a, *l);
+    (void) frametype;
+    (void) type;
+    (void) b;
+
+    t->Token = DCT_EOB_TOKEN;
+    t->context_tree = cpi->common.fc.coef_probs [2] [0] [pt];
+    t->section = 13;
+    t->skip_eob_node = 0;
+    ++cpi->coef_counts[2] [0] [pt] [DCT_EOB_TOKEN];
+    ++t;
+    *tp = t;
+    pt = 0; /* 0 <-> all coeff data is zero */
+    *a = *l = pt;
+
+}
+
+void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t)
+{
+    //int i;
+    ENTROPY_CONTEXT **const A = x->above_context;
+    ENTROPY_CONTEXT(* const L)[4] = x->left_context;
+    int plane_type;
+    int b;
+
+    stuff2nd_order_b(x->block + 24, t, 1, x->frame_type,
+                     A[Y2CONTEXT] + vp8_block2above[24], L[Y2CONTEXT] + vp8_block2left[24], cpi);
+    plane_type = 0;
+
+
+    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+        x->mbmi.dc_diff = 0;
+    else
+        x->mbmi.dc_diff = 1;
+
+
+    for (b = 0; b < 16; b++)
+        stuff1st_order_b(x->block + b, t, plane_type, x->frame_type,
+                         A[vp8_block2context[b]] + vp8_block2above[b],
+                         L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+    for (b = 16; b < 24; b++)
+        stuff1st_order_buv(x->block + b, t, 2, x->frame_type,
+                           A[vp8_block2context[b]] + vp8_block2above[b],
+                           L[vp8_block2context[b]] + vp8_block2left[b], cpi);
+
+}
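+
+/* For a macroblock coded as skipped when the bitstream carries a per-MB
+   skip flag, no tokens are emitted at all, so the left/above entropy
+   contexts are simply reset to "no coefficients" here.  The Y2 contexts
+   are only reset for modes that actually have a Y2 block, i.e. everything
+   except B_PRED and SPLITMV. */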
+void vp8_fix_contexts(VP8_COMP *cpi, MACROBLOCKD *x)
+{
+    x->left_context[Y1CONTEXT][0] = 0;
+    x->left_context[Y1CONTEXT][1] = 0;
+    x->left_context[Y1CONTEXT][2] = 0;
+    x->left_context[Y1CONTEXT][3] = 0;
+    x->left_context[UCONTEXT][0]  = 0;
+    x->left_context[VCONTEXT][0]  = 0;
+    x->left_context[UCONTEXT][1]  = 0;
+    x->left_context[VCONTEXT][1]  = 0;
+
+    x->above_context[Y1CONTEXT][0] = 0;
+    x->above_context[Y1CONTEXT][1] = 0;
+    x->above_context[Y1CONTEXT][2] = 0;
+    x->above_context[Y1CONTEXT][3] = 0;
+    x->above_context[UCONTEXT][0]  = 0;
+    x->above_context[VCONTEXT][0]  = 0;
+    x->above_context[UCONTEXT][1]  = 0;
+    x->above_context[VCONTEXT][1]  = 0;
+
+    if (x->mbmi.mode != B_PRED && x->mbmi.mode != SPLITMV)
+    {
+        x->left_context[Y2CONTEXT][0] = 0;
+        x->above_context[Y2CONTEXT][0] = 0;
+    }
+}
diff --git a/vp8/encoder/tokenize.h b/vp8/encoder/tokenize.h
new file mode 100644
index 0000000..02aacc2
--- /dev/null
+++ b/vp8/encoder/tokenize.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef tokenize_h
+#define tokenize_h
+
+#include "entropy.h"
+#include "block.h"
+
+void vp8_tokenize_initialize();
+
+typedef struct
+{
+    int Token;
+    int Extra;
+    const vp8_prob *context_tree;
+    int skip_eob_node;
+    int section;
+} TOKENEXTRA;
+
+int rd_cost_mby(MACROBLOCKD *);
+
+#ifdef ENTROPY_STATS
+void init_context_counters();
+void print_context_counters();
+
+extern _int64 context_counters[BLOCK_TYPES] [COEF_BANDS] [PREV_COEF_CONTEXTS] [vp8_coef_tokens];
+#endif
+
+
+#endif  /* tokenize_h */
diff --git a/vp8/encoder/treewriter.c b/vp8/encoder/treewriter.c
new file mode 100644
index 0000000..e398044
--- /dev/null
+++ b/vp8/encoder/treewriter.c
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "treewriter.h"
+
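+/* Depth-first walk of a token tree, accumulating the bit cost of each
+   branch taken; on reaching a leaf (j <= 0) the accumulated cost is stored
+   at C[-j], the token's index, so vp8_cost_tokens fills in the cost of
+   every symbol in the alphabet in a single pass. */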
+static void cost(
+    int *const C,
+    vp8_tree T,
+    const vp8_prob *const P,
+    int i,
+    int c
+)
+{
+    const vp8_prob p = P [i>>1];
+
+    do
+    {
+        const vp8_tree_index j = T[i];
+        const int d = c + vp8_cost_bit(p, i & 1);
+
+        if (j <= 0)
+            C[-j] = d;
+        else
+            cost(C, T, P, j, d);
+    }
+    while (++i & 1);
+}
+void vp8_cost_tokens(int *c, const vp8_prob *p, vp8_tree t)
+{
+    cost(c, t, p, 0, 0);
+}
diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h
new file mode 100644
index 0000000..05ac74c
--- /dev/null
+++ b/vp8/encoder/treewriter.h
@@ -0,0 +1,121 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef __INC_TREEWRITER_H
+#define __INC_TREEWRITER_H
+
+/* Trees map alphabets into Huffman-like codes suitable for an arithmetic
+   bit coder.  Timothy S Murphy  11 October 2004 */
+
+#include "treecoder.h"
+
+#include "boolhuff.h"       /* for now */
+
+typedef BOOL_CODER vp8_writer;
+
+#define vp8_write vp8_encode_bool
+#define vp8_write_literal vp8_encode_value
+#define vp8_write_bit( W, V) vp8_write( W, V, vp8_prob_half)
+
+#define vp8bc_write vp8bc_write_bool
+#define vp8bc_write_literal vp8bc_write_bits
+#define vp8bc_write_bit( W, V) vp8bc_write_bits( W, V, 1)
+
+
+/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+
+#define vp8_cost_zero( x) ( vp8_prob_cost[x])
+#define vp8_cost_one( x)  vp8_cost_zero( vp8_complement(x))
+
+#define vp8_cost_bit( x, b) vp8_cost_zero( (b)?  vp8_complement(x) : (x) )
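+
+/* For example, an even probability (p = 128) costs 256, i.e. exactly one
+   bit, while a near-certain outcome costs only a few 256ths of a bit --
+   assuming vp8_prob_cost[] holds -log2(p/256) scaled by 256, which is what
+   the "256ths of a bit" comment above implies. */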
+
+/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
+
+
+/* Both of these return bits, not scaled bits. */
+
+static __inline unsigned int vp8_cost_branch(const unsigned int ct[2], vp8_prob p)
+{
+    /* Imitate existing calculation */
+
+    return ((ct[0] * vp8_cost_zero(p))
+            + (ct[1] * vp8_cost_one(p))) >> 8;
+}
+
+/* Small functions to write explicit values and tokens, as well as
+   estimate their lengths. */
+
+static __inline void vp8_treed_write
+(
+    vp8_writer *const w,
+    vp8_tree t,
+    const vp8_prob *const p,
+    int v,
+    int n               /* number of bits in v, assumed nonzero */
+)
+{
+    vp8_tree_index i = 0;
+
+    do
+    {
+        const int b = (v >> --n) & 1;
+        vp8_write(w, b, p[i>>1]);
+        i = t[i+b];
+    }
+    while (n);
+}
+static __inline void vp8_write_token
+(
+    vp8_writer *const w,
+    vp8_tree t,
+    const vp8_prob *const p,
+    vp8_token *const x
+)
+{
+    vp8_treed_write(w, t, p, x->value, x->Len);
+}
+
+static __inline int vp8_treed_cost(
+    vp8_tree t,
+    const vp8_prob *const p,
+    int v,
+    int n               /* number of bits in v, assumed nonzero */
+)
+{
+    int c = 0;
+    vp8_tree_index i = 0;
+
+    do
+    {
+        const int b = (v >> --n) & 1;
+        c += vp8_cost_bit(p[i>>1], b);
+        i = t[i+b];
+    }
+    while (n);
+
+    return c;
+}
+static __inline int vp8_cost_token
+(
+    vp8_tree t,
+    const vp8_prob *const p,
+    vp8_token *const x
+)
+{
+    return vp8_treed_cost(t, p, x->value, x->Len);
+}
+
+/* Fill array of costs for all possible token values. */
+
+void vp8_cost_tokens(
+    int *Costs, const vp8_prob *, vp8_tree
+);
+
+#endif
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
new file mode 100644
index 0000000..b3b55c3
--- /dev/null
+++ b/vp8/encoder/variance.h
@@ -0,0 +1,327 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_H
+#define VARIANCE_H
+
+#define prototype_sad(sym)\
+    unsigned int (sym)\
+    (\
+     unsigned char *src_ptr, \
+     int source_stride, \
+     unsigned char *ref_ptr, \
+     int  ref_stride, \
+     int max_sad\
+    )
+
+#define prototype_sad_multi_same_address(sym)\
+    void (sym)\
+    (\
+     unsigned char *src_ptr, \
+     int source_stride, \
+     unsigned char *ref_ptr, \
+     int  ref_stride, \
+     unsigned int *sad_array\
+    )
+
+#define prototype_sad_multi_dif_address(sym)\
+    void (sym)\
+    (\
+     unsigned char *src_ptr, \
+     int source_stride, \
+     unsigned char *ref_ptr[4], \
+     int  ref_stride, \
+     unsigned int *sad_array\
+    )
+
+#define prototype_variance(sym) \
+    unsigned int (sym) \
+    (\
+     unsigned char *src_ptr, \
+     int source_stride, \
+     unsigned char *ref_ptr, \
+     int  ref_stride, \
+     unsigned int *sse\
+    )
+
+#define prototype_variance2(sym) \
+    unsigned int (sym) \
+    (\
+     unsigned char *src_ptr, \
+     int source_stride, \
+     unsigned char *ref_ptr, \
+     int  ref_stride, \
+     unsigned int *sse,\
+     int *sum\
+    )
+
+#define prototype_subpixvariance(sym) \
+    unsigned int (sym) \
+    ( \
+      unsigned char  *src_ptr, \
+      int  source_stride, \
+      int  xoffset, \
+      int  yoffset, \
+      unsigned char *ref_ptr, \
+      int  ref_stride, \
+      unsigned int *sse \
+    )
+
+
+#define prototype_getmbss(sym) unsigned int (sym)(short *)
+
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/variance_x86.h"
+#endif
+
+#if ARCH_ARM
+#include "arm/variance_arm.h"
+#endif
+
+#ifndef vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_c
+#endif
+extern prototype_sad(vp8_variance_sad4x4);
+
+#ifndef vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_c
+#endif
+extern prototype_sad(vp8_variance_sad8x8);
+
+#ifndef vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_c
+#endif
+extern prototype_sad(vp8_variance_sad8x16);
+
+#ifndef vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_c
+#endif
+extern prototype_sad(vp8_variance_sad16x8);
+
+#ifndef vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_c
+#endif
+extern prototype_sad(vp8_variance_sad16x16);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad16x16x3);
+
+#ifndef vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad16x8x3);
+
+#ifndef vp8_variance_sad8x8x3
+#define vp8_variance_sad8x8x3 vp8_sad8x8x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad8x8x3);
+
+#ifndef vp8_variance_sad8x16x3
+#define vp8_variance_sad8x16x3 vp8_sad8x16x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad8x16x3);
+
+#ifndef vp8_variance_sad4x4x3
+#define vp8_variance_sad4x4x3 vp8_sad4x4x3_c
+#endif
+extern prototype_sad_multi_same_address(vp8_variance_sad4x4x3);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_sad16x16x4d
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad16x16x4d);
+
+#ifndef vp8_variance_sad16x8x4d
+#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad16x8x4d);
+
+#ifndef vp8_variance_sad8x8x4d
+#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad8x8x4d);
+
+#ifndef vp8_variance_sad8x16x4d
+#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad8x16x4d);
+
+#ifndef vp8_variance_sad4x4x4d
+#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_c
+#endif
+extern prototype_sad_multi_dif_address(vp8_variance_sad4x4x4d);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_c
+#endif
+extern prototype_variance(vp8_variance_var4x4);
+
+#ifndef vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_c
+#endif
+extern prototype_variance(vp8_variance_var8x8);
+
+#ifndef vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_c
+#endif
+extern prototype_variance(vp8_variance_var8x16);
+
+#ifndef vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_c
+#endif
+extern prototype_variance(vp8_variance_var16x8);
+
+#ifndef vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_c
+#endif
+extern prototype_variance(vp8_variance_var16x16);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar4x4);
+
+#ifndef vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar8x8);
+
+#ifndef vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar8x16);
+
+#ifndef vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar16x8);
+
+#ifndef vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
+
+#ifndef vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
+#endif
+extern prototype_subpixvariance(vp8_variance_subpixmse16x16);
+
+//-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
+
+#ifndef vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_c
+#endif
+extern prototype_getmbss(vp8_variance_getmbss);
+
+#ifndef vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_c
+#endif
+extern prototype_variance(vp8_variance_mse16x16);
+
+#ifndef vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_c
+#endif
+extern prototype_sad(vp8_variance_get16x16prederror);
+
+#ifndef vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_c
+#endif
+extern prototype_variance2(vp8_variance_get8x8var);
+
+#ifndef vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_c
+#endif
+extern prototype_variance2(vp8_variance_get16x16var);
+
+#ifndef vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c
+#endif
+extern prototype_sad(vp8_variance_get4x4sse_cs);
+
+
+typedef prototype_sad(*vp8_sad_fn_t);
+typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
+typedef prototype_sad_multi_dif_address(*vp8_sad_multi_d_fn_t);
+typedef prototype_variance(*vp8_variance_fn_t);
+typedef prototype_variance2(*vp8_variance2_fn_t);
+typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t);
+typedef prototype_getmbss(*vp8_getmbss_fn_t);
+typedef struct
+{
+    vp8_sad_fn_t             sad4x4;
+    vp8_sad_fn_t             sad8x8;
+    vp8_sad_fn_t             sad8x16;
+    vp8_sad_fn_t             sad16x8;
+    vp8_sad_fn_t             sad16x16;
+
+    vp8_variance_fn_t        var4x4;
+    vp8_variance_fn_t        var8x8;
+    vp8_variance_fn_t        var8x16;
+    vp8_variance_fn_t        var16x8;
+    vp8_variance_fn_t        var16x16;
+
+    vp8_subpixvariance_fn_t  subpixvar4x4;
+    vp8_subpixvariance_fn_t  subpixvar8x8;
+    vp8_subpixvariance_fn_t  subpixvar8x16;
+    vp8_subpixvariance_fn_t  subpixvar16x8;
+    vp8_subpixvariance_fn_t  subpixvar16x16;
+    vp8_subpixvariance_fn_t  subpixmse16x16;
+
+    vp8_getmbss_fn_t         getmbss;
+    vp8_variance_fn_t        mse16x16;
+
+    vp8_sad_fn_t             get16x16prederror;
+    vp8_variance2_fn_t       get8x8var;
+    vp8_variance2_fn_t       get16x16var;
+    vp8_sad_fn_t             get4x4sse_cs;
+
+    vp8_sad_multi_fn_t       sad16x16x3;
+    vp8_sad_multi_fn_t       sad16x8x3;
+    vp8_sad_multi_fn_t       sad8x16x3;
+    vp8_sad_multi_fn_t       sad8x8x3;
+    vp8_sad_multi_fn_t       sad4x4x3;
+
+    vp8_sad_multi_d_fn_t     sad16x16x4d;
+    vp8_sad_multi_d_fn_t     sad16x8x4d;
+    vp8_sad_multi_d_fn_t     sad8x16x4d;
+    vp8_sad_multi_d_fn_t     sad8x8x4d;
+    vp8_sad_multi_d_fn_t     sad4x4x4d;
+
+} vp8_variance_rtcd_vtable_t;
+
+typedef struct
+{
+    vp8_sad_fn_t  sdf;
+    vp8_sad_multi_fn_t sdx3f;
+    vp8_sad_multi_d_fn_t sdx4df;
+    vp8_variance_fn_t vf;
+    vp8_subpixvariance_fn_t svf;
+} vp8_variance_fn_ptr_t;
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
+#else
+#define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
+#endif
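+
+/* Usage sketch (editor's illustration; the rtcd variable and its layout
+   are assumptions, not fixed by this header):
+
+       unsigned int sse;
+       unsigned int var =
+           VARIANCE_INVOKE(&rtcd->variance, var16x16)
+               (src, src_stride, ref, ref_stride, &sse);
+
+   With CONFIG_RUNTIME_CPU_DETECT this indexes a
+   vp8_variance_rtcd_vtable_t at run time; otherwise it collapses at
+   compile time to a direct call to vp8_variance_var16x16. */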
+
+/* TODO: Determine if this USEBILINEAR flag is necessary. */
+#define USEBILINEAR
+
+#endif
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
new file mode 100644
index 0000000..85269b9
--- /dev/null
+++ b/vp8/encoder/variance_c.c
@@ -0,0 +1,527 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+
+const int vp8_six_tap[8][6] =
+{
+    { 0,  0,  128,    0,   0,  0 },         // note: the 1/8-pel positions use the same taps as an alpha = -0.5 bicubic
+    { 0, -6,  123,   12,  -1,  0 },
+    { 2, -11, 108,   36,  -8,  1 },         // New 1/4 pel 6 tap filter
+    { 0, -9,   93,   50,  -6,  0 },
+    { 3, -16,  77,   77, -16,  3 },         // New 1/2 pel 6 tap filter
+    { 0, -6,   50,   93,  -9,  0 },
+    { 1, -8,   36,  108, -11,  2 },         // New 1/4 pel 6 tap filter
+    { 0, -1,   12,  123,  -6,  0 }
+};
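+
+/* Editor's note: every row of taps above sums to 128 (the filter weight
+   defined below), so filtering preserves DC; e.g. the half-pel row:
+   3 - 16 + 77 + 77 - 16 + 3 = 128. */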
+
+
+#ifdef USEBILINEAR
+const int VP8_FILTER_WEIGHT = 128;
+const int VP8_FILTER_SHIFT  =   7;
+const int vp8_bilinear_taps[8][2] =
+{
+    { 128,   0 },
+    { 112,  16 },
+    {  96,  32 },
+    {  80,  48 },
+    {  64,  64 },
+    {  48,  80 },
+    {  32,  96 },
+    {  16, 112 }
+};
+
+unsigned int vp8_get_mb_ss_c
+(
+    short *src_ptr
+)
+{
+    unsigned int i = 0, sum = 0;
+
+    do
+    {
+        sum += (src_ptr[i] * src_ptr[i]);
+        i++;
+    }
+    while (i < 256);
+
+    return sum;
+}
+
+
+void  vp8_variance(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    int  w,
+    int  h,
+    unsigned int *sse,
+    int *sum)
+{
+    int i, j;
+    int diff;
+
+    *sum = 0;
+    *sse = 0;
+
+    for (i = 0; i < h; i++)
+    {
+        for (j = 0; j < w; j++)
+        {
+            diff = src_ptr[j] - ref_ptr[j];
+            *sum += diff;
+            *sse += diff * diff;
+        }
+
+        src_ptr += source_stride;
+        ref_ptr += recon_stride;
+    }
+}
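+
+/* The wrappers below apply the identity  Variance = SSE - Sum*Sum/N  with
+   N = w*h, hence the shifts: >>4 for 4x4, >>6 for 8x8, >>7 for 8x16 and
+   16x8, >>8 for 16x16.  Worked example: if every pixel of an 8x8 block
+   differs from the reference by exactly +1, then Sum = 64 and SSE = 64,
+   so the variance is 64 - ((64*64) >> 6) = 0; a constant offset
+   contributes nothing. */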
+
+unsigned int
+vp8_get8x8var_c
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *SSE,
+    int *Sum
+)
+{
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
+    return (*SSE - (((*Sum) * (*Sum)) >> 6));
+}
+
+unsigned int
+vp8_get16x16var_c
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *SSE,
+    int *Sum
+)
+{
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
+    return (*SSE - (((*Sum) * (*Sum)) >> 8));
+
+}
+
+
+
+unsigned int vp8_variance16x16_c(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+    *sse = var;
+    return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance8x16_c(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+}
+
+unsigned int vp8_variance16x8_c(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+}
+
+
+unsigned int vp8_variance8x8_c(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
+    *sse = var;
+    return (var - ((avg * avg) >> 6));
+}
+
+unsigned int vp8_variance4x4_c(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
+    *sse = var;
+    return (var - ((avg * avg) >> 4));
+}
+
+
+unsigned int vp8_mse16x16_c(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+    vp8_variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
+    *sse = var;
+    return var;
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_first_pass
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
+ *                  UINT32 output_height     : Input block height.
+ *                  UINT32 output_width      : Input block width.
+ *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement first-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Produces UINT16 output to retain precision for the next pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+void vp8e_filter_block2d_bil_first_pass
+(
+    unsigned char *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const int *vp8_filter
+)
+{
+    unsigned int i, j;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            // Apply bilinear filter
+            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
+                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
+            src_ptr++;
+        }
+
+        // Next row...
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_width;
+    }
+}
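+
+/* Worked example (editor's sketch): with first-pass taps { 112, 16 }
+   (vp8_bilinear_taps[1]) and neighbouring samples 100 and 104, the
+   output is (100*112 + 104*16 + 64) >> 7 = 12928 >> 7 = 101, i.e. the
+   value one eighth of the way from 100 towards 104 (100.5), rounded to
+   nearest. */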
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil_second_pass
+ *
+ *  INPUTS        : UINT16 *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
+ *                  UINT32 output_height     : Input block height.
+ *                  UINT32 output_width      : Input block width.
+ *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
+ *
+ *  OUTPUTS       : UINT8  *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
+ *                  either horizontal or vertical direction to produce the
+ *                  filtered output block. Used to implement second-pass
+ *                  of 2-D separable filter.
+ *
+ *  SPECIAL NOTES : Requires UINT16 input as produced by filter_block2d_bil_first_pass.
+ *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
+ *                  pixel_step defines whether the filter is applied
+ *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
+ *                  It defines the offset required to move from one input
+ *                  to the next.
+ *
+ ****************************************************************************/
+void vp8e_filter_block2d_bil_second_pass
+(
+    unsigned short *src_ptr,
+    unsigned char  *output_ptr,
+    unsigned int  src_pixels_per_line,
+    unsigned int  pixel_step,
+    unsigned int  output_height,
+    unsigned int  output_width,
+    const int *vp8_filter
+)
+{
+    unsigned int  i, j;
+    int  Temp;
+
+    for (i = 0; i < output_height; i++)
+    {
+        for (j = 0; j < output_width; j++)
+        {
+            // Apply filter
+            Temp = ((int)src_ptr[0]         * vp8_filter[0]) +
+                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
+                   (VP8_FILTER_WEIGHT / 2);
+            output_ptr[j] = (unsigned char)(Temp >> VP8_FILTER_SHIFT);
+            src_ptr++;
+        }
+
+        // Next row...
+        src_ptr    += src_pixels_per_line - output_width;
+        output_ptr += output_width;
+    }
+}
+
+
+/****************************************************************************
+ *
+ *  ROUTINE       : filter_block2d_bil
+ *
+ *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
+ *                  UINT32 src_pixels_per_line : Stride of input block.
+ *                  INT32  *HFilter         : Array of 2 horizontal filter taps.
+ *                  INT32  *VFilter         : Array of 2 vertical filter taps.
+ *
+ *  OUTPUTS       : UINT8  *output_ptr       : Pointer to filtered block.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : 2-D filters an 8x8 input block by applying a 2-tap
+ *                  bi-linear filter horizontally followed by a 2-tap
+ *                  bi-linear filter vertically on the result.
+ *
+ *  SPECIAL NOTES : The intermediate horizontally filtered block must produce
+ *                  1 more point than the input block in each column. This
+ *                  is to ensure that the 2-tap filter has one extra data-point
+ *                  at the top of each column so filter taps do not extend
+ *                  beyond data. Thus the output of the first stage filter
+ *                  is an 8x9 (horizontal x vertical) block.
+ *
+ ****************************************************************************/
+void vp8e_filter_block2d_bil
+(
+    unsigned char  *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int src_pixels_per_line,
+    int  *HFilter,
+    int  *VFilter
+)
+{
+
+    unsigned short FData[20*16];    // Temp data buffer used in filtering
+
+    // First filter 1-D horizontally...
+    vp8e_filter_block2d_bil_first_pass(src_ptr, FData, src_pixels_per_line, 1, 9, 8, HFilter);
+
+    // then 1-D vertically...
+    vp8e_filter_block2d_bil_second_pass(FData, output_ptr, 8, 8, 8, 8, VFilter);
+}
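+
+/* Usage sketch (illustrative): a half-pel interpolation of an 8x8 block
+   reduces to an average of neighbouring pixels, since
+   vp8_bilinear_taps[4] is { 64, 64 }:
+
+       vp8e_filter_block2d_bil(src, pred, stride,
+                               (int *)vp8_bilinear_taps[4],
+                               (int *)vp8_bilinear_taps[4]);
+
+   Note the second pass runs with pixel_step 8, the stride of the 8x9
+   intermediate block, so its 2-tap filter is applied vertically. */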
+
+
+
+unsigned int vp8_sub_pixel_variance4x4_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned char  temp2[20*16];
+    const int *HFilter, *VFilter;
+    unsigned short FData3[5*4]; // Temp data buffer used in filtering
+
+    HFilter = vp8_bilinear_taps[xoffset];
+    VFilter = vp8_bilinear_taps[yoffset];
+
+    // First filter 1-D horizontally...
+    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
+
+    // then 1-D vertically...
+    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
+
+    return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short FData3[9*8]; // Temp data buffer used in filtering
+    unsigned char  temp2[20*16];
+    const int *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_taps[xoffset];
+    VFilter = vp8_bilinear_taps[yoffset];
+
+    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
+    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
+
+    return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short FData3[17*16];   // Temp data buffer used in filtering
+    unsigned char  temp2[20*16];
+    const int *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_taps[xoffset];
+    VFilter = vp8_bilinear_taps[yoffset];
+
+    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
+    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
+
+    return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_mse16x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    vp8_sub_pixel_variance16x16_c(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+    return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short FData3[16*9];    // Temp data buffer used in filtering
+    unsigned char  temp2[20*16];
+    const int *HFilter, *VFilter;
+
+    HFilter = vp8_bilinear_taps[xoffset];
+    VFilter = vp8_bilinear_taps[yoffset];
+
+    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
+    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
+
+    return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
+unsigned int vp8_sub_pixel_variance8x16_c
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    unsigned short FData3[9*16];    // Temp data buffer used in filtering
+    unsigned char  temp2[20*16];
+    const int *HFilter, *VFilter;
+
+
+    HFilter = vp8_bilinear_taps[xoffset];
+    VFilter = vp8_bilinear_taps[yoffset];
+
+
+    vp8e_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
+    vp8e_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
+
+    return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+}
+#endif
diff --git a/vp8/encoder/x86/csystemdependent.c b/vp8/encoder/x86/csystemdependent.c
new file mode 100644
index 0000000..186ee68
--- /dev/null
+++ b/vp8/encoder/x86/csystemdependent.c
@@ -0,0 +1,289 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "onyx_int.h"
+
+SADFunction *vp8_sad16x16;
+SADFunction *vp8_sad16x8;
+SADFunction *vp8_sad8x16;
+SADFunction *vp8_sad8x8;
+SADFunction *vp8_sad4x4;
+
+variance_function *vp8_variance4x4;
+variance_function *vp8_variance8x8;
+variance_function *vp8_variance8x16;
+variance_function *vp8_variance16x8;
+variance_function *vp8_variance16x16;
+
+
+variance_function *vp8_mse16x16;
+
+sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
+sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
+
+int (*vp8_block_error)(short *, short *);
+int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
+void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
+extern void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride);
+
+extern int vp8_block_error_c(short *, short *);
+extern int vp8_mbblock_error_c(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_mmx(short *, short *);
+extern int vp8_mbblock_error_mmx(MACROBLOCK *x, int dc);
+
+extern int vp8_block_error_xmm(short *, short *);
+extern int vp8_mbblock_error_xmm(MACROBLOCK *x, int dc);
+
+
+
+int (*vp8_mbuverror)(MACROBLOCK *mb);
+unsigned int (*vp8_get_mb_ss)(short *);
+void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
+void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
+
+void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
+void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
+unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
+
+// c imports
+extern int vp8_mbuverror_c(MACROBLOCK *mb);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_c(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_c(short *input, short *output, int pitch);
+
+
+extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
+
+extern SADFunction vp8_sad16x16_c;
+extern SADFunction vp8_sad16x8_c;
+extern SADFunction vp8_sad8x16_c;
+extern SADFunction vp8_sad8x8_c;
+extern SADFunction vp8_sad4x4_c;
+
+extern SADFunction vp8_sad16x16_wmt;
+extern SADFunction vp8_sad16x8_wmt;
+extern SADFunction vp8_sad8x16_wmt;
+extern SADFunction vp8_sad8x8_wmt;
+extern SADFunction vp8_sad4x4_wmt;
+
+extern SADFunction vp8_sad16x16_mmx;
+extern SADFunction vp8_sad16x8_mmx;
+extern SADFunction vp8_sad8x16_mmx;
+extern SADFunction vp8_sad8x8_mmx;
+extern SADFunction vp8_sad4x4_mmx;
+
+extern variance_function vp8_variance16x16_c;
+extern variance_function vp8_variance8x16_c;
+extern variance_function vp8_variance16x8_c;
+extern variance_function vp8_variance8x8_c;
+extern variance_function vp8_variance4x4_c;
+extern variance_function vp8_mse16x16_c;
+
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_c;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_c;
+
+extern unsigned int vp8_get_mb_ss_c(short *);
+extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
+
+// mmx imports
+extern int vp8_mbuverror_mmx(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d);
+extern void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch);
+extern void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
+extern void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch);
+extern void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch);
+extern void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_mmx;
+extern variance_function vp8_variance8x8_mmx;
+extern variance_function vp8_variance8x16_mmx;
+extern variance_function vp8_variance16x8_mmx;
+extern variance_function vp8_variance16x16_mmx;
+
+extern variance_function vp8_mse16x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_mmx;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_mmx;
+
+extern unsigned int vp8_get16x16pred_error_mmx(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_mmx(short *);
+extern unsigned int vp8_get8x8var_mmx(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_mmx(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get4x4sse_cs_mmx(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
+
+
+// wmt imports
+extern int vp8_mbuverror_xmm(MACROBLOCK *mb);
+extern void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d);
+extern void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch);
+extern variance_function vp8_variance4x4_wmt;
+extern variance_function vp8_variance8x8_wmt;
+extern variance_function vp8_variance8x16_wmt;
+extern variance_function vp8_variance16x8_wmt;
+extern variance_function vp8_variance16x16_wmt;
+
+extern variance_function vp8_mse16x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_wmt;
+extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_wmt;
+extern unsigned int vp8_get16x16pred_error_sse2(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
+extern unsigned int vp8_get_mb_ss_sse2(short *src_ptr);
+extern unsigned int vp8_get8x8var_sse2(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+extern unsigned int vp8_get16x16var_sse2(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
+
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+void vp8_cmachine_specific_config(void)
+{
+    int mmx_enabled;
+    int xmm_enabled;
+    int wmt_enabled;
+
+    vpx_get_processor_flags(&mmx_enabled, &xmm_enabled, &wmt_enabled);
+
+    if (wmt_enabled)         // Willamette
+    {
+        // Willamette instruction set available:
+        vp8_mbuverror                = vp8_mbuverror_xmm;
+        vp8_fast_quantize_b            = vp8_fast_quantize_b_sse;
+        vp8_short_fdct4x4             = vp8_short_fdct4x4_mmx;
+        vp8_short_fdct8x4             = vp8_short_fdct8x4_mmx;
+        vp8_fast_fdct4x4              = vp8_fast_fdct4x4_mmx;
+        vp8_fast_fdct8x4              = vp8_fast_fdct8x4_wmt;
+        vp8_subtract_b                = vp8_subtract_b_mmx;
+        vp8_subtract_mbuv             = vp8_subtract_mbuv_mmx;
+        vp8_variance4x4              = vp8_variance4x4_mmx;
+        vp8_variance8x8              = vp8_variance8x8_mmx;
+        vp8_variance8x16             = vp8_variance8x16_wmt;
+        vp8_variance16x8             = vp8_variance16x8_wmt;
+        vp8_variance16x16            = vp8_variance16x16_wmt;
+        vp8_mse16x16                 = vp8_mse16x16_wmt;
+        vp8_sub_pixel_variance4x4      = vp8_sub_pixel_variance4x4_wmt;
+        vp8_sub_pixel_variance8x8      = vp8_sub_pixel_variance8x8_wmt;
+        vp8_sub_pixel_variance8x16     = vp8_sub_pixel_variance8x16_wmt;
+        vp8_sub_pixel_variance16x8     = vp8_sub_pixel_variance16x8_wmt;
+        vp8_sub_pixel_variance16x16    = vp8_sub_pixel_variance16x16_wmt;
+        vp8_get_mb_ss                  = vp8_get_mb_ss_sse2;
+        vp8_get16x16pred_error        = vp8_get16x16pred_error_sse2;
+        vp8_get8x8var                = vp8_get8x8var_sse2;
+        vp8_get16x16var              = vp8_get16x16var_sse2;
+        vp8_get4x4sse_cs             = vp8_get4x4sse_cs_mmx;
+        vp8_sad16x16                 = vp8_sad16x16_wmt;
+        vp8_sad16x8                  = vp8_sad16x8_wmt;
+        vp8_sad8x16                  = vp8_sad8x16_wmt;
+        vp8_sad8x8                   = vp8_sad8x8_wmt;
+        vp8_sad4x4                   = vp8_sad4x4_wmt;
+        vp8_block_error               = vp8_block_error_xmm;
+        vp8_mbblock_error             = vp8_mbblock_error_xmm;
+        vp8_subtract_mby              = vp8_subtract_mby_mmx;
+
+    }
+    else if (mmx_enabled)
+    {
+        // MMX instruction set available:
+        vp8_mbuverror                = vp8_mbuverror_mmx;
+        vp8_fast_quantize_b            = vp8_fast_quantize_b_mmx;
+        vp8_short_fdct4x4             = vp8_short_fdct4x4_mmx;
+        vp8_short_fdct8x4             = vp8_short_fdct8x4_mmx;
+        vp8_fast_fdct4x4              = vp8_fast_fdct4x4_mmx;
+        vp8_fast_fdct8x4              = vp8_fast_fdct8x4_mmx;
+        vp8_subtract_b                = vp8_subtract_b_mmx;
+        vp8_subtract_mbuv             = vp8_subtract_mbuv_mmx;
+        vp8_variance4x4              = vp8_variance4x4_mmx;
+        vp8_variance8x8              = vp8_variance8x8_mmx;
+        vp8_variance8x16             = vp8_variance8x16_mmx;
+        vp8_variance16x8             = vp8_variance16x8_mmx;
+        vp8_variance16x16            = vp8_variance16x16_mmx;
+        vp8_mse16x16                 = vp8_mse16x16_mmx;
+        vp8_sub_pixel_variance4x4      = vp8_sub_pixel_variance4x4_mmx;
+        vp8_sub_pixel_variance8x8      = vp8_sub_pixel_variance8x8_mmx;
+        vp8_sub_pixel_variance8x16     = vp8_sub_pixel_variance8x16_mmx;
+        vp8_sub_pixel_variance16x8     = vp8_sub_pixel_variance16x8_mmx;
+        vp8_sub_pixel_variance16x16    = vp8_sub_pixel_variance16x16_mmx;
+        vp8_get_mb_ss                  = vp8_get_mb_ss_mmx;
+        vp8_get16x16pred_error        = vp8_get16x16pred_error_mmx;
+        vp8_get8x8var                = vp8_get8x8var_mmx;
+        vp8_get16x16var              = vp8_get16x16var_mmx;
+        vp8_get4x4sse_cs             = vp8_get4x4sse_cs_mmx;
+        vp8_sad16x16                 = vp8_sad16x16_mmx;
+        vp8_sad16x8                  = vp8_sad16x8_mmx;
+        vp8_sad8x16                  = vp8_sad8x16_mmx;
+        vp8_sad8x8                   = vp8_sad8x8_mmx;
+        vp8_sad4x4                   = vp8_sad4x4_mmx;
+        vp8_block_error               = vp8_block_error_mmx;
+        vp8_mbblock_error             = vp8_mbblock_error_mmx;
+        vp8_subtract_mby              = vp8_subtract_mby_mmx;
+
+    }
+    else
+    {
+        // Pure C:
+        vp8_mbuverror                = vp8_mbuverror_c;
+        vp8_fast_quantize_b            = vp8_fast_quantize_b_c;
+        vp8_short_fdct4x4             = vp8_short_fdct4x4_c;
+        vp8_short_fdct8x4             = vp8_short_fdct8x4_c;
+        vp8_fast_fdct4x4              = vp8_fast_fdct4x4_c;
+        vp8_fast_fdct8x4              = vp8_fast_fdct8x4_c;
+        vp8_subtract_b                = vp8_subtract_b_c;
+        vp8_subtract_mbuv             = vp8_subtract_mbuv_c;
+        vp8_variance4x4              = vp8_variance4x4_c;
+        vp8_variance8x8              = vp8_variance8x8_c;
+        vp8_variance8x16             = vp8_variance8x16_c;
+        vp8_variance16x8             = vp8_variance16x8_c;
+        vp8_variance16x16            = vp8_variance16x16_c;
+        vp8_mse16x16                 = vp8_mse16x16_c;
+        vp8_sub_pixel_variance4x4      = vp8_sub_pixel_variance4x4_c;
+        vp8_sub_pixel_variance8x8      = vp8_sub_pixel_variance8x8_c;
+        vp8_sub_pixel_variance8x16     = vp8_sub_pixel_variance8x16_c;
+        vp8_sub_pixel_variance16x8     = vp8_sub_pixel_variance16x8_c;
+        vp8_sub_pixel_variance16x16    = vp8_sub_pixel_variance16x16_c;
+        vp8_get_mb_ss                  = vp8_get_mb_ss_c;
+        vp8_get16x16pred_error        = vp8_get16x16pred_error_c;
+        vp8_get8x8var                = vp8_get8x8var_c;
+        vp8_get16x16var              = vp8_get16x16var_c;
+        vp8_get4x4sse_cs             = vp8_get4x4sse_cs_c;
+        vp8_sad16x16                 = vp8_sad16x16_c;
+        vp8_sad16x8                  = vp8_sad16x8_c;
+        vp8_sad8x16                  = vp8_sad8x16_c;
+        vp8_sad8x8                   = vp8_sad8x8_c;
+        vp8_sad4x4                   = vp8_sad4x4_c;
+        vp8_block_error               = vp8_block_error_c;
+        vp8_mbblock_error             = vp8_mbblock_error_c;
+        vp8_subtract_mby              = vp8_subtract_mby_c;
+    }
+
+}
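+
+/* Editor's note on the dispatch above: the tiers are cumulative (wmt
+   implies mmx), so the Willamette branch still binds the pointers that
+   have no SSE2 implementation here (e.g. vp8_short_fdct4x4,
+   vp8_get4x4sse_cs) to their _mmx versions, and everything falls back to
+   the _c versions when no SIMD support is detected. */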
diff --git a/vp8/encoder/x86/dct_mmx.asm b/vp8/encoder/x86/dct_mmx.asm
new file mode 100644
index 0000000..e134237
--- /dev/null
+++ b/vp8/encoder/x86/dct_mmx.asm
@@ -0,0 +1,846 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+section .text
+    global sym(vp8_short_fdct4x4_mmx)
+    global sym(vp8_fast_fdct4x4_mmx)
+    global sym(vp8_fast_fdct8x4_wmt)
+
+
+%define         DCTCONSTANTSBITS         (16)
+%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
+%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<16)
+%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<16)
+%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<16)
+
+
+%define _1STSTAGESHIFT           14
+%define _2NDSTAGESHIFT           16
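+
+; Outline (editor's sketch): vp8_short_fdct4x4_mmx computes each pass as a
+; vector-by-matrix product against dct_matrix,
+;     out[i] = (sum_j in[j] * M[i][j] + round) >> shift,
+; shifting by 14 (_1STSTAGESHIFT) after the first pass and by 16
+; (_2NDSTAGESHIFT) after the second, with rounding constants taken from
+; dct1st_stage_rounding_mmx / dct2nd_stage_rounding_mmx.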
+
+; using matrix multiply; the source and destination buffers each have a pitch
+;void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_short_fdct4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov         rsi,    arg(0) ;input
+        mov         rdi,    arg(1) ;output
+
+        movsxd      rax,    dword ptr arg(2) ;pitch
+        lea         rdx,    [dct_matrix GLOBAL]
+
+        movq        mm0,    [rsi   ]
+        movq        mm1,    [rsi + rax]
+
+        movq        mm2,    [rsi + rax*2]
+        lea         rsi,    [rsi + rax*2]
+
+        movq        mm3,    [rsi + rax]
+
+        ; first column
+        movq        mm4,    mm0
+        movq        mm7,    [rdx]
+
+        pmaddwd     mm4,    mm7
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    mm7
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+
+        pmaddwd     mm5,    mm7
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    mm7
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _1STSTAGESHIFT
+        psrad       mm5,    _1STSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi],  mm4
+
+        ;second column
+        movq        mm4,    mm0
+
+        pmaddwd     mm4,    [rdx+8]
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    [rdx+8]
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+        pmaddwd     mm5,    [rdx+8]
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    [rdx+8]
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _1STSTAGESHIFT
+        psrad       mm5,    _1STSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi+8],  mm4
+
+
+        ;third column
+        movq        mm4,    mm0
+
+        pmaddwd     mm4,    [rdx+16]
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    [rdx+16]
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+        pmaddwd     mm5,    [rdx+16]
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    [rdx+16]
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _1STSTAGESHIFT
+        psrad       mm5,    _1STSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi+16],  mm4
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+
+        pmaddwd     mm0,    [rdx+24]
+
+        pmaddwd     mm1,    [rdx+24]
+        movq        mm6,    mm0
+
+        punpckldq   mm0,    mm1
+        punpckhdq   mm6,    mm1
+
+        paddd       mm0,    mm6
+
+        pmaddwd     mm2,    [rdx+24]
+
+        pmaddwd     mm3,    [rdx+24]
+        movq        mm7,    mm2
+
+        punpckldq   mm2,    mm3
+        punpckhdq   mm7,    mm3
+
+        paddd       mm2,    mm7
+        movq        mm6,    [dct1st_stage_rounding_mmx GLOBAL]
+
+        paddd       mm0,    mm6
+        paddd       mm2,    mm6
+
+        psrad       mm0,    _1STSTAGESHIFT
+        psrad       mm2,    _1STSTAGESHIFT
+
+        packssdw    mm0,    mm2
+
+        movq        mm3,    mm0
+
+        ; done with one pass
+        ; now start second pass
+        movq        mm0,    [rdi   ]
+        movq        mm1,    [rdi+ 8]
+        movq        mm2,    [rdi+ 16]
+
+        movq        mm4,    mm0
+
+        pmaddwd     mm4,    [rdx]
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    [rdx]
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+        pmaddwd     mm5,    [rdx]
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    [rdx]
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _2NDSTAGESHIFT
+        psrad       mm5,    _2NDSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi],  mm4
+
+        ;second column
+        movq        mm4,    mm0
+
+        pmaddwd     mm4,    [rdx+8]
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    [rdx+8]
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+        pmaddwd     mm5,    [rdx+8]
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    [rdx+8]
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _2NDSTAGESHIFT
+        psrad       mm5,    _2NDSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi+8],  mm4
+
+
+        ;third column
+        movq        mm4,    mm0
+
+        pmaddwd     mm4,    [rdx+16]
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    [rdx+16]
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+        pmaddwd     mm5,    [rdx+16]
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    [rdx+16]
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _2NDSTAGESHIFT
+        psrad       mm5,    _2NDSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi+16],  mm4
+
+        ;fourth column
+        movq        mm4,    mm0
+
+        pmaddwd     mm4,    [rdx+24]
+        movq        mm5,    mm1
+
+        pmaddwd     mm5,    [rdx+24]
+        movq        mm6,    mm4
+
+        punpckldq   mm4,    mm5
+        punpckhdq   mm6,    mm5
+
+        paddd       mm4,    mm6
+        movq        mm5,    mm2
+
+        pmaddwd     mm5,    [rdx+24]
+        movq        mm6,    mm3
+
+        pmaddwd     mm6,    [rdx+24]
+        movq        mm7,    mm5
+
+        punpckldq   mm5,    mm6
+        punpckhdq   mm7,    mm6
+
+        paddd       mm5,    mm7
+        movq        mm6,    [dct2nd_stage_rounding_mmx GLOBAL]
+
+        paddd       mm4,    mm6
+        paddd       mm5,    mm6
+
+        psrad       mm4,    _2NDSTAGESHIFT
+        psrad       mm5,    _2NDSTAGESHIFT
+
+        packssdw    mm4,    mm5
+        movq        [rdi+24],  mm4
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_fast_fdct4x4_mmx(short *input, short *output, int pitch)
+sym(vp8_fast_fdct4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+        mov     rsi,    arg(0) ;input
+        mov     rdi,    arg(1) ;output
+
+        lea     rdx,    [dct_const_mmx GLOBAL]
+        movsxd  rax,    dword ptr arg(2) ;pitch
+
+        lea     rcx,    [rsi + rax*2]
+        ; read the input data
+        movq    mm0,    [rsi]
+        movq    mm1,    [rsi + rax    ]
+
+        movq    mm2,    [rcx]
+        movq    mm3,    [rcx + rax]
+        ; shift left by 1 for precision
+        paddw   mm0,    mm0
+        paddw   mm1,    mm1
+
+        psllw   mm2,    1
+        psllw   mm3,    1
+
+        ; transpose for the second stage
+        movq    mm4,    mm0         ; 00 01 02 03
+        movq    mm5,    mm2         ; 20 21 22 23
+
+        punpcklwd   mm0,    mm1     ; 00 10 01 11
+        punpckhwd   mm4,    mm1     ; 02 12 03 13
+
+        punpcklwd   mm2,    mm3     ; 20 30 21 31
+        punpckhwd   mm5,    mm3     ; 22 32 23 33
+
+
+        movq        mm1,    mm0     ; 00 10 01 11
+        punpckldq   mm0,    mm2     ; 00 10 20 30
+
+        punpckhdq   mm1,    mm2     ; 01 11 21 31
+
+        movq        mm2,    mm4     ; 02 12 03 13
+        punpckldq   mm2,    mm5     ; 02 12 22 32
+
+        punpckhdq   mm4,    mm5     ; 03 13 23 33
+        movq        mm3,    mm4
+
+
+        ; first stage
+        movq    mm5,    mm0
+        movq    mm4,    mm1
+
+        paddw   mm0,    mm3         ; a = 0 + 3
+        paddw   mm1,    mm2         ; b = 1 + 2
+
+        psubw   mm4,    mm2         ; c = 1 - 2
+        psubw   mm5,    mm3         ; d = 0 - 3
+
+
+        ; output 0 and 2
+        movq    mm6,    [rdx +  16] ; c2
+        movq    mm2,    mm0         ; a
+
+        paddw   mm0,    mm1         ; a + b
+        psubw   mm2,    mm1         ; a - b
+
+        movq    mm1,    mm0         ; a + b
+        pmulhw  mm0,    mm6         ; 00 01 02 03
+
+        paddw   mm0,    mm1         ; output 00 01 02 03
+        pmulhw  mm6,    mm2         ; 20 21 22 23
+
+        paddw   mm2,    mm6         ; output 20 21 22 23
+
+        ; output 1 and 3
+        movq    mm6,    [rdx +  8]  ; c1
+        movq    mm7,    [rdx + 24]  ; c3
+
+        movq    mm1,    mm4         ; c
+        movq    mm3,    mm5         ; d
+
+        pmulhw  mm1,    mm7         ; c * c3
+        pmulhw  mm3,    mm6         ; d * c1
+
+        paddw   mm3,    mm5         ; d * c1 rounded
+        paddw   mm1,    mm3         ; output 10 11 12 13
+
+        movq    mm3,    mm4         ; c
+        pmulhw  mm5,    mm7         ; d * c3
+
+        pmulhw  mm4,    mm6         ; c * c1
+        paddw   mm3,    mm4         ; round c* c1
+
+        psubw   mm5,    mm3         ; output 30 31 32 33
+        movq    mm3,    mm5
+
+
+        ; done with vertical
+        ; transpose for the second stage
+        movq    mm4,    mm0         ; 00 01 02 03
+        movq    mm5,    mm2         ; 20 21 22 23
+
+        punpcklwd   mm0,    mm1     ; 00 10 01 11
+        punpckhwd   mm4,    mm1     ; 02 12 03 13
+
+        punpcklwd   mm2,    mm3     ; 20 30 21 31
+        punpckhwd   mm5,    mm3     ; 22 32 23 33
+
+
+        movq        mm1,    mm0     ; 00 10 01 11
+        punpckldq   mm0,    mm2     ; 00 10 20 30
+
+        punpckhdq   mm1,    mm2     ; 01 11 21 31
+
+        movq        mm2,    mm4     ; 02 12 03 13
+        punpckldq   mm2,    mm5     ; 02 12 22 32
+
+        punpckhdq   mm4,    mm5     ; 03 13 23 33
+        movq        mm3,    mm4
+
+
+        ; first stage
+        movq    mm5,    mm0
+        movq    mm4,    mm1
+
+        paddw   mm0,    mm3         ; a = 0 + 3
+        paddw   mm1,    mm2         ; b = 1 + 2
+
+        psubw   mm4,    mm2         ; c = 1 - 2
+        psubw   mm5,    mm3         ; d = 0 - 3
+
+
+        ; output 0 and 2
+        movq    mm6,    [rdx +  16] ; c2
+        movq    mm2,    mm0         ; a
+        paddw   mm0,    mm1         ; a + b
+
+        psubw   mm2,    mm1         ; a - b
+
+        movq    mm1,    mm0         ; a + b
+        pmulhw  mm0,    mm6         ; 00 01 02 03
+
+        paddw   mm0,    mm1         ; output 00 01 02 03
+        pmulhw  mm6,    mm2         ; 20 21 22 23
+
+        paddw   mm2,    mm6         ; output 20 21 22 23
+
+
+        ; output 1 and 3
+        movq    mm6,    [rdx +  8]  ; c1
+        movq    mm7,    [rdx + 24]  ; c3
+
+        movq    mm1,    mm4         ; c
+        movq    mm3,    mm5         ; d
+
+        pmulhw  mm1,    mm7         ; c * c3
+        pmulhw  mm3,    mm6         ; d * c1
+
+        paddw   mm3,    mm5         ; d * c1 rounded
+        paddw   mm1,    mm3         ; output 10 11 12 13
+
+        movq    mm3,    mm4         ; c
+        pmulhw  mm5,    mm7         ; d * c3
+
+        pmulhw  mm4,    mm6         ; c * c1
+        paddw   mm3,    mm4         ; round c* c1
+
+        psubw   mm5,    mm3         ; output 30 31 32 33
+        movq    mm3,    mm5
+        ; done with vertical
+
+        pcmpeqw mm4,    mm4         ; set every word to -1
+        pcmpeqw mm5,    mm5
+        psrlw   mm4,    15          ; mm4/mm5 = 0x0001 per word, for
+        psrlw   mm5,    15          ; round-to-nearest before the >> 1
+
+        paddw   mm0,    mm4
+        paddw   mm1,    mm5
+        paddw   mm2,    mm4
+        paddw   mm3,    mm5
+
+        psraw   mm0, 1
+        psraw   mm1, 1
+        psraw   mm2, 1
+        psraw   mm3, 1
+
+        movq        [rdi   ],   mm0
+        movq        [rdi+ 8],   mm1
+        movq        [rdi+16],   mm2
+        movq        [rdi+24],   mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_fast_fdct8x4_wmt(short *input, short *output, int pitch)
+sym(vp8_fast_fdct8x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+        mov         rsi,    arg(0) ;input
+        mov         rdi,    arg(1) ;output
+
+        lea         rdx,    [dct_const_xmm GLOBAL]
+        movsxd      rax,    dword ptr arg(2) ;pitch
+
+        lea         rcx,    [rsi + rax*2]
+        ; read the input data
+        movdqa      xmm0,       [rsi]
+        movdqa      xmm2,       [rsi + rax]
+
+        movdqa      xmm4,       [rcx]
+        movdqa      xmm3,       [rcx + rax]
+        ; shift left by 1 for precision
+        psllw       xmm0,        1
+        psllw       xmm2,        1
+
+        psllw       xmm4,        1
+        psllw       xmm3,        1
+
+        ; transpose for the second stage
+        movdqa      xmm1,       xmm0         ; 00 01 02 03 04 05 06 07
+        movdqa      xmm5,       xmm4         ; 20 21 22 23 24 25 26 27
+
+        punpcklwd   xmm0,       xmm2         ; 00 10 01 11 02 12 03 13
+        punpckhwd   xmm1,       xmm2         ; 04 14 05 15 06 16 07 17
+
+        punpcklwd   xmm4,       xmm3         ; 20 30 21 31 22 32 23 33
+        punpckhwd   xmm5,       xmm3         ; 24 34 25 35 26 36 27 37
+
+        movdqa      xmm2,       xmm0         ; 00 10 01 11 02 12 03 13
+        punpckldq   xmm0,       xmm4         ; 00 10 20 30 01 11 21 31
+
+        punpckhdq   xmm2,       xmm4         ; 02 12 22 32 03 13 23 33
+
+
+        movdqa      xmm4,       xmm1         ; 04 14 05 15 06 16 07 17
+        punpckldq   xmm4,       xmm5         ; 04 14 24 34 05 15 25 35
+
+        punpckhdq   xmm1,       xmm5         ; 06 16 26 36 07 17 27 37
+        movdqa      xmm3,       xmm2         ; 02 12 22 32 03 13 23 33
+
+        punpckhqdq  xmm3,       xmm1         ; 03 13 23 33 07 17 27 37
+        punpcklqdq  xmm2,       xmm1         ; 02 12 22 32 06 16 26 36
+
+        movdqa      xmm1,       xmm0         ; 00 10 20 30 01 11 21 31
+        punpcklqdq  xmm0,       xmm4         ; 00 10 20 30 04 14 24 34
+
+        punpckhqdq  xmm1,       xmm4         ; 01 11 21 31 05 15 25 35
+
+        ; xmm0 0
+        ; xmm1 1
+        ; xmm2 2
+        ; xmm3 3
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3         ; a = 0 + 3
+        paddw       xmm1,       xmm2         ; b = 1 + 2
+
+        psubw       xmm4,       xmm2         ; c = 1 - 2
+        psubw       xmm5,       xmm3         ; d = 0 - 3
+
+
+        ; output 0 and 2
+        movdqa      xmm6,       [rdx +  32] ; c2
+        movdqa      xmm2,       xmm0         ; a
+
+        paddw       xmm0,       xmm1         ; a + b
+        psubw       xmm2,       xmm1         ; a - b
+
+        movdqa      xmm1,       xmm0         ; a + b
+        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+
+        paddw       xmm0,       xmm1         ; output 00 01 02 03
+        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+
+        paddw       xmm2,       xmm6         ; output 20 21 22 23
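+
+        ; (c2 here is the unsigned constant 46341 ~= cos(pi/4) * (1<<16).
+        ;  pmulhw treats it as the signed value 46341 - 65536, so it
+        ;  yields roughly ((x * 46341) >> 16) - x; the paddw of the
+        ;  original term adds x back, completing the unsigned multiply.
+        ;  In C, roughly: (short)(((int)x * 46341) >> 16).)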
+
+        ; output 1 and 3
+        movdqa      xmm6,       [rdx + 16]  ; c1
+        movdqa      xmm7,       [rdx + 48]  ; c3
+
+        movdqa      xmm1,       xmm4         ; c
+        movdqa      xmm3,       xmm5         ; d
+
+        pmulhw      xmm1,       xmm7         ; c * c3
+        pmulhw      xmm3,       xmm6         ; d * c1
+
+        paddw       xmm3,       xmm5         ; d * c1 (add d to complete the unsigned multiply)
+        paddw       xmm1,       xmm3         ; output 10 11 12 13
+
+        movdqa      xmm3,       xmm4         ; c
+        pmulhw      xmm5,       xmm7         ; d * c3
+
+        pmulhw      xmm4,       xmm6         ; c * c1
+        paddw       xmm3,       xmm4         ; c * c1 (add c to complete the unsigned multiply)
+
+        psubw       xmm5,       xmm3         ; output 30 31 32 33
+        movdqa      xmm3,       xmm5
+
+
+        ; done with vertical
+        ; transpose for the second stage
+        movdqa      xmm4,       xmm2         ; 02 12 22 32 06 16 26 36
+        movdqa      xmm2,       xmm1         ; 01 11 21 31 05 15 25 35
+
+        movdqa      xmm1,       xmm0         ; 00 10 20 30 04 14 24 34
+        movdqa      xmm5,       xmm4         ; 02 12 22 32 06 16 26 36
+
+        punpcklwd   xmm0,       xmm2         ; 00 01 10 11 20 21 30 31
+        punpckhwd   xmm1,       xmm2         ; 04 05 14 15 24 25 34 35
+
+        punpcklwd   xmm4,       xmm3         ; 02 03 12 13 22 23 32 33
+        punpckhwd   xmm5,       xmm3         ; 06 07 16 17 26 27 36 37
+
+        movdqa      xmm2,       xmm0         ; 00 01 10 11 20 21 30 31
+        punpckldq   xmm0,       xmm4         ; 00 01 02 03 10 11 12 13
+
+        punpckhdq   xmm2,       xmm4         ; 20 21 22 23 30 31 32 33
+
+
+        movdqa      xmm4,       xmm1         ; 04 05 14 15 24 25 34 35
+        punpckldq   xmm4,       xmm5         ; 04 05 06 07 14 15 16 17
+
+        punpckhdq   xmm1,       xmm5         ; 24 25 26 27 34 35 36 37
+        movdqa      xmm3,       xmm2         ; 20 21 22 23 30 31 32 33
+
+        punpckhqdq  xmm3,       xmm1         ; 30 31 32 33 34 35 36 37
+        punpcklqdq  xmm2,       xmm1         ; 20 21 22 23 24 25 26 27
+
+        movdqa      xmm1,       xmm0         ; 00 01 02 03 10 11 12 13
+        punpcklqdq  xmm0,       xmm4         ; 00 01 02 03 04 05 06 07
+
+        punpckhqdq  xmm1,       xmm4         ; 10 11 12 13 14 15 16 17
+
+        ; first stage
+        movdqa      xmm5,       xmm0
+        movdqa      xmm4,       xmm1
+
+        paddw       xmm0,       xmm3         ; a = 0 + 3
+        paddw       xmm1,       xmm2         ; b = 1 + 2
+
+        psubw       xmm4,       xmm2         ; c = 1 - 2
+        psubw       xmm5,       xmm3         ; d = 0 - 3
+
+
+        ; output 0 and 2
+        movdqa      xmm6,       [rdx +  32] ; c2
+        movdqa      xmm2,       xmm0         ; a
+
+        paddw       xmm0,       xmm1         ; a + b
+        psubw       xmm2,       xmm1         ; a - b
+
+        movdqa      xmm1,       xmm0         ; a + b
+        pmulhw      xmm0,       xmm6         ; 00 01 02 03
+
+        paddw       xmm0,       xmm1         ; output 00 01 02 03
+        pmulhw      xmm6,       xmm2         ; 20 21 22 23
+
+        paddw       xmm2,       xmm6         ; output 20 21 22 23
+
+        ; output 1 and 3
+        movdqa      xmm6,       [rdx + 16]  ; c1
+        movdqa      xmm7,       [rdx + 48]  ; c3
+
+        movdqa      xmm1,       xmm4         ; c
+        movdqa      xmm3,       xmm5         ; d
+
+        pmulhw      xmm1,       xmm7         ; c * c3
+        pmulhw      xmm3,       xmm6         ; d * c1
+
+        paddw       xmm3,       xmm5         ; d * c1 (add d to complete the unsigned multiply)
+        paddw       xmm1,       xmm3         ; output 10 11 12 13
+
+        movdqa      xmm3,       xmm4         ; c
+        pmulhw      xmm5,       xmm7         ; d * c3
+
+        pmulhw      xmm4,       xmm6         ; c * c1
+        paddw       xmm3,       xmm4         ; c * c1 (add c to complete the unsigned multiply)
+
+        psubw       xmm5,       xmm3         ; output 30 31 32 33
+        movdqa      xmm3,       xmm5
+        ; done with vertical
+
+
+        pcmpeqw     xmm4,       xmm4
+        pcmpeqw     xmm5,       xmm5
+        psrlw       xmm4,       15
+        psrlw       xmm5,       15
+
+        paddw       xmm0,       xmm4
+        paddw       xmm1,       xmm5
+        paddw       xmm2,       xmm4
+        paddw       xmm3,       xmm5
+
+        psraw       xmm0,       1
+        psraw       xmm1,       1
+        psraw       xmm2,       1
+        psraw       xmm3,       1
+
+        movq        QWORD PTR[rdi   ],   xmm0
+        movq        QWORD PTR[rdi+ 8],   xmm1
+        movq        QWORD PTR[rdi+16],   xmm2
+        movq        QWORD PTR[rdi+24],   xmm3
+
+        psrldq      xmm0,       8
+        psrldq      xmm1,       8
+        psrldq      xmm2,       8
+        psrldq      xmm3,       8
+
+        movq        QWORD PTR[rdi+32],   xmm0
+        movq        QWORD PTR[rdi+40],   xmm1
+        movq        QWORD PTR[rdi+48],   xmm2
+        movq        QWORD PTR[rdi+56],   xmm3
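+
+        ; (Each xmm register carries one output row for both 4x4 blocks:
+        ;  the low qwords stored above at rdi+0..24 are the first block's
+        ;  coefficients; after psrldq by 8, the high qwords give the
+        ;  second block's coefficients at rdi+32..56.)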
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+;static const unsigned int dct1st_stage_rounding_mmx[2] =
+align 16
+dct1st_stage_rounding_mmx:
+    times 2 dd 8192
+
+
+;static const unsigned int dct2nd_stage_rounding_mmx[2] =
+align 16
+dct2nd_stage_rounding_mmx:
+    times 2 dd 32768
+
+
+;static const short dct_matrix[4][4]=
+align 16
+dct_matrix:
+    times 4 dw 23170
+
+    dw  30274
+    dw  12540
+    dw -12540
+    dw -30274
+
+    dw 23170
+    times 2 dw -23170
+    dw 23170
+
+    dw  12540
+    dw -30274
+    dw  30274
+    dw -12540
+
+
+;static const unsigned short dct_const_mmx[4 * 4]=
+align 16
+dct_const_mmx:
+    times 4 dw 0
+    times 4 dw 60547
+    times 4 dw 46341
+    times 4 dw 25080
+
+
+;static const unsigned short dct_const_xmm[8 * 4]=
+align 16
+dct_const_xmm:
+    times 8 dw 0
+    times 8 dw 60547
+    times 8 dw 46341
+    times 8 dw 25080
diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm
new file mode 100644
index 0000000..3e5e9a7
--- /dev/null
+++ b/vp8/encoder/x86/dct_sse2.asm
@@ -0,0 +1,260 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_short_fdct4x4_wmt)
+
+%define         DCTCONSTANTSBITS         (16)
+%define         DCTROUNDINGVALUE         (1<< (DCTCONSTANTSBITS-1))
+%define         x_c1                      (60547)          ; cos(pi  /8) * (1<<16)
+%define         x_c2                      (46341)          ; cos(pi*2/8) * (1<<16)
+%define         x_c3                      (25080)          ; cos(pi*3/8) * (1<<16)
+
+%define _1STSTAGESHIFT           14
+%define _2NDSTAGESHIFT           16
+
+
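+; The routine below computes the forward DCT as two rounded 4x4 matrix
+; multiplies.  A C sketch for reference only -- the function name is
+; illustrative and the row/column orientation follows the register
+; shuffles in the assembly, so treat this as a model of the arithmetic:
+;
+;    static void fdct4x4_ref(const short *input, short *output)
+;    {
+;        static const short M[4][4] = {
+;            { 23170,  23170,  23170,  23170 },
+;            { 30274,  12540, -12540, -30274 },
+;            { 23170, -23170, -23170,  23170 },
+;            { 12540, -30274,  30274, -12540 },
+;        };
+;        int temp[4][4], i, j, k, s;
+;        for (i = 0; i < 4; i++)          /* first pass, with transpose */
+;            for (j = 0; j < 4; j++) {
+;                for (s = 0, k = 0; k < 4; k++)
+;                    s += M[j][k] * input[i * 4 + k];
+;                temp[j][i] = (s + 8192) >> 14;           /* _1STSTAGESHIFT */
+;            }
+;        for (i = 0; i < 4; i++)          /* second pass */
+;            for (j = 0; j < 4; j++) {
+;                for (s = 0, k = 0; k < 4; k++)
+;                    s += M[j][k] * temp[i][k];
+;                output[j * 4 + i] = (short)((s + 32768) >> 16); /* _2NDSTAGESHIFT */
+;            }
+;    }
+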
+;; using matrix multiply
+;void vp8_short_fdct4x4_wmt(short *input, short *output)
+sym(vp8_short_fdct4x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    GET_GOT     rbx
+    ; end prolog
+
+        mov         rax,        arg(0) ;input
+        mov         rcx,        arg(1) ;output
+
+        lea         rdx,        [dct_matrix_sse2 GLOBAL]
+
+        movdqu      xmm0,       [rax   ]
+        movdqu      xmm1,       [rax+16]
+
+        ; first column
+        movdqa      xmm2,       xmm0
+        movdqa      xmm7,       [rdx]
+
+        pmaddwd     xmm2,       xmm7
+        movdqa      xmm3,       xmm1
+
+        pmaddwd     xmm3,       xmm7
+        movdqa      xmm4,       xmm2
+
+        punpckldq   xmm2,       xmm3
+        punpckhdq   xmm4,       xmm3
+
+        movdqa      xmm3,       xmm2
+        punpckldq   xmm2,       xmm4
+
+        punpckhdq   xmm3,       xmm4
+        paddd       xmm2,       xmm3
+
+
+        paddd       xmm2,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+        psrad       xmm2,       _1STSTAGESHIFT
+        ;second column
+        movdqa      xmm3,       xmm0
+        pmaddwd     xmm3,       [rdx+16]
+
+        movdqa      xmm4,       xmm1
+        pmaddwd     xmm4,       [rdx+16]
+
+        movdqa      xmm5,       xmm3
+        punpckldq   xmm3,       xmm4
+
+        punpckhdq   xmm5,       xmm4
+        movdqa      xmm4,       xmm3
+
+        punpckldq   xmm3,       xmm5
+        punpckhdq   xmm4,       xmm5
+
+        paddd       xmm3,       xmm4
+        paddd       xmm3,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+        psrad       xmm3,       _1STSTAGESHIFT
+        packssdw    xmm2,       xmm3
+
+        ;third column
+        movdqa      xmm3,       xmm0
+        pmaddwd     xmm3,       [rdx+32]
+
+        movdqa      xmm4,       xmm1
+        pmaddwd     xmm4,       [rdx+32]
+
+        movdqa      xmm5,       xmm3
+        punpckldq   xmm3,       xmm4
+
+        punpckhdq   xmm5,       xmm4
+        movdqa      xmm4,       xmm3
+
+        punpckldq   xmm3,       xmm5
+        punpckhdq   xmm4,       xmm5
+
+        paddd       xmm3,       xmm4
+        paddd       xmm3,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+        psrad       xmm3,       _1STSTAGESHIFT
+
+        ;fourth column (this is the last column, so we do not have to save the source any more)
+        pmaddwd     xmm0,       [rdx+48]
+        pmaddwd     xmm1,       [rdx+48]
+
+        movdqa      xmm4,       xmm0
+        punpckldq   xmm0,       xmm1
+
+        punpckhdq   xmm4,       xmm1
+        movdqa      xmm1,       xmm0
+
+        punpckldq   xmm0,       xmm4
+        punpckhdq   xmm1,       xmm4
+
+        paddd       xmm0,       xmm1
+        paddd       xmm0,       XMMWORD PTR [dct1st_stage_rounding_sse2 GLOBAL]
+
+
+        psrad       xmm0,       _1STSTAGESHIFT
+        packssdw    xmm3,       xmm0
+        ; done with one pass
+        ; now start second pass
+        movdqa      xmm0,       xmm2
+        movdqa      xmm1,       xmm3
+
+        pmaddwd     xmm2,       xmm7
+        pmaddwd     xmm3,       xmm7
+
+        movdqa      xmm4,       xmm2
+        punpckldq   xmm2,       xmm3
+
+        punpckhdq   xmm4,       xmm3
+        movdqa      xmm3,       xmm2
+
+        punpckldq   xmm2,       xmm4
+        punpckhdq   xmm3,       xmm4
+
+        paddd       xmm2,       xmm3
+        paddd       xmm2,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+        psrad       xmm2,       _2NDSTAGESHIFT
+
+        ;second column
+        movdqa      xmm3,       xmm0
+        pmaddwd     xmm3,       [rdx+16]
+
+        movdqa      xmm4,       xmm1
+        pmaddwd     xmm4,       [rdx+16]
+
+        movdqa      xmm5,       xmm3
+        punpckldq   xmm3,       xmm4
+
+        punpckhdq   xmm5,       xmm4
+        movdqa      xmm4,       xmm3
+
+        punpckldq   xmm3,       xmm5
+        punpckhdq   xmm4,       xmm5
+
+        paddd       xmm3,       xmm4
+        paddd       xmm3,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+        psrad       xmm3,       _2NDSTAGESHIFT
+        packssdw    xmm2,       xmm3
+
+        movdqu      [rcx],      xmm2
+        ;third column
+        movdqa      xmm3,       xmm0
+        pmaddwd     xmm3,       [rdx+32]
+
+        movdqa      xmm4,       xmm1
+        pmaddwd     xmm4,       [rdx+32]
+
+        movdqa      xmm5,       xmm3
+        punpckldq   xmm3,       xmm4
+
+        punpckhdq   xmm5,       xmm4
+        movdqa      xmm4,       xmm3
+
+        punpckldq   xmm3,       xmm5
+        punpckhdq   xmm4,       xmm5
+
+        paddd       xmm3,       xmm4
+        paddd       xmm3,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+        psrad       xmm3,       _2NDSTAGESHIFT
+        ;fourth column
+        pmaddwd     xmm0,       [rdx+48]
+        pmaddwd     xmm1,       [rdx+48]
+
+        movdqa      xmm4,       xmm0
+        punpckldq   xmm0,       xmm1
+
+        punpckhdq   xmm4,       xmm1
+        movdqa      xmm1,       xmm0
+
+        punpckldq   xmm0,       xmm4
+        punpckhdq   xmm1,       xmm4
+
+        paddd       xmm0,       xmm1
+        paddd       xmm0,       XMMWORD PTR [dct2nd_stage_rounding_sse2 GLOBAL]
+
+        psrad       xmm0,       _2NDSTAGESHIFT
+        packssdw    xmm3,       xmm0
+
+        movdqu     [rcx+16],   xmm3
+
+    mov rsp, rbp
+    ; begin epilog
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+;static unsigned int dct1st_stage_rounding_sse2[4] =
+align 16
+dct1st_stage_rounding_sse2:
+    times 4 dd 8192
+
+
+;static unsigned int dct2nd_stage_rounding_sse2[4] =
+align 16
+dct2nd_stage_rounding_sse2:
+    times 4 dd 32768
+
+;static short dct_matrix_sse2[4][8]=
+align 16
+dct_matrix_sse2:
+    times 8 dw 23170
+
+    dw  30274
+    dw  12540
+    dw -12540
+    dw -30274
+    dw  30274
+    dw  12540
+    dw -12540
+    dw -30274
+
+    dw  23170
+    times 2 dw -23170
+    times 2 dw  23170
+    times 2 dw -23170
+    dw  23170
+
+    dw  12540
+    dw -30274
+    dw  30274
+    dw -12540
+    dw  12540
+    dw -30274
+    dw  30274
+    dw -12540
diff --git a/vp8/encoder/x86/dct_x86.h b/vp8/encoder/x86/dct_x86.h
new file mode 100644
index 0000000..bc80e64
--- /dev/null
+++ b/vp8/encoder/x86/dct_x86.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef DCT_X86_H
+#define DCT_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_fdct(vp8_short_fdct4x4_mmx);
+extern prototype_fdct(vp8_short_fdct8x4_mmx);
+extern prototype_fdct(vp8_fast_fdct4x4_mmx);
+extern prototype_fdct(vp8_fast_fdct8x4_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_mmx
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_mmx
+
+#undef  vp8_fdct_fast4x4
+#define vp8_fdct_fast4x4 vp8_fast_fdct4x4_mmx
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_fdct(vp8_short_fdct4x4_wmt);
+extern prototype_fdct(vp8_short_fdct8x4_wmt);
+extern prototype_fdct(vp8_fast_fdct8x4_wmt);
+
+extern prototype_fdct(vp8_short_walsh4x4_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#if 0
+/* short SSE2 DCT currently disabled, does not match the MMX version */
+#undef  vp8_fdct_short4x4
+#define vp8_fdct_short4x4 vp8_short_fdct4x4_wmt
+
+#undef  vp8_fdct_short8x4
+#define vp8_fdct_short8x4 vp8_short_fdct8x4_wmt
+#endif
+
+#undef  vp8_fdct_fast8x4
+#define vp8_fdct_fast8x4 vp8_fast_fdct8x4_wmt
+
+#undef vp8_fdct_walsh_short4x4
+#define vp8_fdct_walsh_short4x4  vp8_short_walsh4x4_sse2
+
+#endif
+
+
+#endif
+
+#endif
diff --git a/vp8/encoder/x86/encodemb_x86.h b/vp8/encoder/x86/encodemb_x86.h
new file mode 100644
index 0000000..9397a6c
--- /dev/null
+++ b/vp8/encoder/x86/encodemb_x86.h
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef ENCODEMB_X86_H
+#define ENCODEMB_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ */
+#if HAVE_MMX
+extern prototype_berr(vp8_block_error_mmx);
+extern prototype_mberr(vp8_mbblock_error_mmx);
+extern prototype_mbuverr(vp8_mbuverror_mmx);
+extern prototype_subb(vp8_subtract_b_mmx);
+extern prototype_submby(vp8_subtract_mby_mmx);
+extern prototype_submbuv(vp8_subtract_mbuv_mmx);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_mmx
+
+#undef  vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_mmx
+
+#undef  vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_mmx
+
+#undef  vp8_encodemb_subb
+#define vp8_encodemb_subb vp8_subtract_b_mmx
+
+#undef  vp8_encodemb_submby
+#define vp8_encodemb_submby vp8_subtract_mby_mmx
+
+#undef  vp8_encodemb_submbuv
+#define vp8_encodemb_submbuv vp8_subtract_mbuv_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_berr(vp8_block_error_xmm);
+extern prototype_mberr(vp8_mbblock_error_xmm);
+extern prototype_mbuverr(vp8_mbuverror_xmm);
+
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_encodemb_berr
+#define vp8_encodemb_berr vp8_block_error_xmm
+
+#undef  vp8_encodemb_mberr
+#define vp8_encodemb_mberr vp8_mbblock_error_xmm
+
+#undef  vp8_encodemb_mbuverr
+#define vp8_encodemb_mbuverr vp8_mbuverror_xmm
+
+#endif
+#endif
+
+
+#endif
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
new file mode 100644
index 0000000..1940471
--- /dev/null
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -0,0 +1,393 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+
+;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
+global sym(vp8_block_error_xmm)
+sym(vp8_block_error_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
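+
+        ; A C sketch of what this routine computes (reference only; the
+        ; name is illustrative).  The dc mask below is a no-op because
+        ; dc is hard-wired to 0:
+        ;
+        ;    int block_error_ref(const short *coeff, const short *dcoef)
+        ;    {
+        ;        int i, err = 0;
+        ;        for (i = 0; i < 16; i++)
+        ;        {
+        ;            int d = coeff[i] - dcoef[i];
+        ;            err += d * d;
+        ;        }
+        ;        return err;
+        ;    }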
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        xmm7,       xmm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        movdqa      xmm3,       [rsi]
+
+        movdqa      xmm4,       [rdi]
+        movdqa      xmm5,       [rsi+16]
+
+        movdqa      xmm6,       [rdi+16]
+        pxor        xmm1,       xmm1    ; dc is always 0 here (was: movd xmm1, dc)
+
+        movdqa      xmm2,       xmm7
+        psubw       xmm5,       xmm6
+
+        por         xmm1,       xmm2
+        pmaddwd     xmm5,       xmm5
+
+        pcmpeqw     xmm1,       xmm7
+        psubw       xmm3,       xmm4
+
+        pand        xmm1,       xmm3
+        pmaddwd     xmm1,       xmm1
+
+        paddd       xmm1,       xmm5
+        movdqa      xmm0,       xmm1
+
+        punpckldq   xmm0,       xmm7
+        punpckhdq   xmm1,       xmm7
+
+        paddd       xmm0,       xmm1
+        movdqa      xmm1,       xmm0
+
+        psrldq      xmm0,       8
+        paddd       xmm0,       xmm1
+
+        movd        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
+global sym(vp8_block_error_mmx)
+sym(vp8_block_error_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        movq        mm3,        [rsi]
+
+        movq        mm4,        [rdi]
+        movq        mm5,        [rsi+8]
+
+        movq        mm6,        [rdi+8]
+        pxor        mm1,        mm1 ; dc is always 0 here (was: movd mm1, dc)
+
+        movq        mm2,        mm7
+        psubw       mm5,        mm6
+
+        por         mm1,        mm2
+        pmaddwd     mm5,        mm5
+
+        pcmpeqw     mm1,        mm7
+        psubw       mm3,        mm4
+
+        pand        mm1,        mm3
+        pmaddwd     mm1,        mm1
+
+        paddd       mm1,        mm5
+        movq        mm3,        [rsi+16]
+
+        movq        mm4,        [rdi+16]
+        movq        mm5,        [rsi+24]
+
+        movq        mm6,        [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm3,        mm5
+
+        paddd       mm1,        mm3
+        movq        mm0,        mm1
+
+        psrlq       mm1,        32
+        paddd       mm0,        mm1
+
+        movd        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
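+; Both mbblock_error implementations below sum squared differences over
+; the 16 luma blocks of a macroblock (16 coefficients each), skipping
+; each block's DC term when dc is nonzero.  A reference sketch (the
+; name is illustrative, not part of the build):
+;
+;    int mbblock_error_ref(const short *coeff, const short *dcoef, int dc)
+;    {
+;        int i, j, err = 0;
+;        for (i = 0; i < 16; i++)
+;            for (j = (dc ? 1 : 0); j < 16; j++)
+;            {
+;                int d = coeff[i * 16 + j] - dcoef[i * 16 + j];
+;                err += d * d;
+;            }
+;        return err;
+;    }
+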
+;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_mmx_impl)
+sym(vp8_mbblock_error_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        mm7,        mm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        mm2,        mm2
+
+        movd        mm1,        dword ptr arg(2) ;dc
+        por         mm1,        mm2
+
+        pcmpeqw     mm1,        mm7
+        mov         rcx,        16
+
+mberror_loop_mmx:
+        movq        mm3,       [rsi]
+        movq        mm4,       [rdi]
+
+        movq        mm5,       [rsi+8]
+        movq        mm6,       [rdi+8]
+
+
+        psubw       mm5,        mm6
+        pmaddwd     mm5,        mm5
+
+        psubw       mm3,        mm4
+        pand        mm3,        mm1
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        movq        mm3,       [rsi+16]
+
+        movq        mm4,       [rdi+16]
+        movq        mm5,       [rsi+24]
+
+        movq        mm6,       [rdi+24]
+        psubw       mm5,        mm6
+
+        pmaddwd     mm5,        mm5
+        psubw       mm3,        mm4
+
+        pmaddwd     mm3,        mm3
+        paddd       mm2,        mm5
+
+        paddd       mm2,        mm3
+        add         rsi,        32
+
+        add         rdi,        32
+        sub         rcx,        1
+
+        jnz         mberror_loop_mmx
+
+        movq        mm0,        mm2
+        psrlq       mm2,        32
+
+        paddd       mm0,        mm2
+        movd        rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+global sym(vp8_mbblock_error_xmm_impl)
+sym(vp8_mbblock_error_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov         rsi,        arg(0) ;coeff_ptr
+        pxor        xmm7,       xmm7
+
+        mov         rdi,        arg(1) ;dcoef_ptr
+        pxor        xmm2,       xmm2
+
+        movd        xmm1,       dword ptr arg(2) ;dc
+        por         xmm1,       xmm2
+
+        pcmpeqw     xmm1,       xmm7
+        mov         rcx,        16
+
+mberror_loop:
+        movdqa      xmm3,       [rsi]
+        movdqa      xmm4,       [rdi]
+
+        movdqa      xmm5,       [rsi+16]
+        movdqa      xmm6,       [rdi+16]
+
+
+        psubw       xmm5,       xmm6
+        pmaddwd     xmm5,       xmm5
+
+        psubw       xmm3,       xmm4
+        pand        xmm3,       xmm1
+
+        pmaddwd     xmm3,       xmm3
+        add         rsi,        32
+
+        add         rdi,        32
+
+        sub         rcx,        1
+        paddd       xmm2,       xmm5
+
+        paddd       xmm2,       xmm3
+        jnz         mberror_loop
+
+        movdqa      xmm0,       xmm2
+        punpckldq   xmm0,       xmm7
+
+        punpckhdq   xmm2,       xmm7
+        paddd       xmm0,       xmm2
+
+        movdqa      xmm1,       xmm0
+        psrldq      xmm0,       8
+
+        paddd       xmm0,       xmm1
+        movd        rax,        xmm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
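+; Both mbuverror implementations below sum squared differences over the
+; eight 4x4 chroma blocks of a macroblock (128 coefficients in all).  A
+; reference sketch (illustrative name only):
+;
+;    int mbuverror_ref(const short *s, const short *d)
+;    {
+;        int i, err = 0;
+;        for (i = 0; i < 128; i++)
+;        {
+;            int diff = s[i] - d[i];
+;            err += diff * diff;
+;        }
+;        return err;
+;    }
+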
+;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_mmx_impl)
+sym(vp8_mbuverror_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16
+        pxor            mm7,        mm7
+
+mbuverror_loop_mmx:
+
+        movq            mm1,        [rsi]
+        movq            mm2,        [rdi]
+
+        psubw           mm1,        mm2
+        pmaddwd         mm1,        mm1
+
+
+        movq            mm3,        [rsi+8]
+        movq            mm4,        [rdi+8]
+
+        psubw           mm3,        mm4
+        pmaddwd         mm3,        mm3
+
+
+        paddd           mm7,        mm1
+        paddd           mm7,        mm3
+
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             mbuverror_loop_mmx
+
+        movq            mm0,        mm7
+        psrlq           mm7,        32
+
+        paddd           mm0,        mm7
+        movd            rax,        mm0
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+global sym(vp8_mbuverror_xmm_impl)
+sym(vp8_mbuverror_xmm_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 2
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;s_ptr
+        mov             rdi,        arg(1) ;d_ptr
+
+        mov             rcx,        16
+        pxor            xmm7,       xmm7
+
+mbuverror_loop:
+
+        movdqa          xmm1,       [rsi]
+        movdqa          xmm2,       [rdi]
+
+        psubw           xmm1,       xmm2
+        pmaddwd         xmm1,       xmm1
+
+        paddd           xmm7,       xmm1
+
+        add             rsi,        16
+        add             rdi,        16
+
+        dec             rcx
+        jnz             mbuverror_loop
+
+        pxor        xmm0,           xmm0
+        movdqa      xmm1,           xmm7
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        paddd       xmm1,           xmm2
+
+        movdqa      xmm2,           xmm1
+
+        psrldq      xmm1,           8
+        paddd       xmm1,           xmm2
+
+        movd            rax,            xmm1
+
+    pop rdi
+    pop rsi
+    ; begin epilog
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm
new file mode 100644
index 0000000..7d86201
--- /dev/null
+++ b/vp8/encoder/x86/fwalsh_sse2.asm
@@ -0,0 +1,117 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_short_walsh4x4_sse2(short *input, short *output, int pitch)
+global sym(vp8_short_walsh4x4_sse2)
+sym(vp8_short_walsh4x4_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 3
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov     rsi, arg(0)
+    mov     rdi, arg(1)
+
+    movdqu    xmm4, [rsi + 0]       ;ip[4] ip[0]
+    movdqu    xmm0, [rsi + 16]      ;ip[12] ip[8]
+
+    pxor  xmm7, xmm7
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa    xmm3, xmm4          ; 13 12 11 10 03 02 01 00
+    punpcklwd xmm4, xmm0          ; 23 03 22 02 21 01 20 00
+    punpckhwd xmm3, xmm0          ; 33 13 32 12 31 11 30 10
+    movdqa    xmm1, xmm4          ; 23 03 22 02 21 01 20 00
+    punpcklwd xmm4, xmm3          ; 31 21 11 01 30 20 10 00
+    punpckhwd xmm1, xmm3          ; 33 23 13 03 32 22 12 02
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12] (0x4e swaps the two qwords)
+    movdqa    xmm3, xmm4          ;ip[4] ip[0]
+
+    paddw   xmm4, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa    xmm5, xmm4
+    punpcklqdq  xmm4, xmm3          ;d1 a1
+    punpckhqdq  xmm5, xmm3          ;c1 b1
+
+    movdqa    xmm1, xmm5          ;c1 b1
+    paddw   xmm5, xmm4          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw   xmm4, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    ; 13 12 11 10 03 02 01 00
+    ;
+    ; 33 32 31 30 23 22 21 20
+    ;
+    movdqa    xmm0, xmm5          ; 13 12 11 10 03 02 01 00
+    punpcklwd xmm5, xmm4          ; 23 03 22 02 21 01 20 00
+    punpckhwd xmm0, xmm4          ; 33 13 32 12 31 11 30 10
+    movdqa    xmm1, xmm5          ; 23 03 22 02 21 01 20 00
+    punpcklwd xmm5, xmm0          ; 31 21 11 01 30 20 10 00
+    punpckhwd xmm1, xmm0          ; 33 23 13 03 32 22 12 02
+    ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+    pshufd    xmm2, xmm1, 4eh       ;ip[8] ip[12] (0x4e swaps the two qwords)
+    movdqa    xmm3, xmm5          ;ip[4] ip[0]
+
+    paddw   xmm5, xmm2          ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1
+    psubw   xmm3, xmm2          ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1
+
+    movdqa    xmm6, xmm5
+    punpcklqdq  xmm5, xmm3          ;d1 a1
+    punpckhqdq  xmm6, xmm3          ;c1 b1
+
+    movdqa    xmm1, xmm6          ;c1 b1
+    paddw   xmm6, xmm5          ;d1+c1 a1+b1 aka op[4] op[0]
+    psubw   xmm5, xmm1          ;d1-c1 a1-b1 aka op[12] op[8]
+
+    movdqa    xmm0, xmm6          ;aka b2 a2
+    movdqa    xmm1, xmm5          ;aka d2 c2
+
+    pcmpgtw   xmm0, xmm7
+    pcmpgtw   xmm1, xmm7
+
+    psrlw   xmm0, 15
+    psrlw   xmm1, 15
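+
+    ; (pcmpgtw/psrlw extracts (x > 0) as 0 or 1 per lane; adding it
+    ;  before the arithmetic shift implements the "+= (x > 0)" rounding
+    ;  described in the reference comments below.)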
+
+    paddw   xmm6, xmm0
+    paddw   xmm5, xmm1
+
+    psraw   xmm6, 1
+    psraw   xmm5, 1
+
+    ;   a2 = a1 + b1;
+    ;   b2 = c1 + d1;
+    ;   c2 = a1 - b1;
+    ;   d2 = d1 - c1;
+    ;        a2 += (a2>0);
+    ;        b2 += (b2>0);
+    ;        c2 += (c2>0);
+    ;        d2 += (d2>0);
+    ;   op[0] = (a2)>>1;
+    ;   op[4] = (b2)>>1;
+    ;   op[8] = (c2)>>1;
+    ;   op[12]= (d2)>>1;
+
+    movdqu  [rdi + 0], xmm6
+    movdqu  [rdi + 16], xmm5
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/mcomp_x86.h b/vp8/encoder/x86/mcomp_x86.h
new file mode 100644
index 0000000..5661491
--- /dev/null
+++ b/vp8/encoder/x86/mcomp_x86.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef MCOMP_X86_H
+#define MCOMP_X86_H
+
+#if HAVE_SSE3
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_search_full_search
+#define vp8_search_full_search vp8_full_search_sadx3
+
+#undef  vp8_search_diamond_search
+#define vp8_search_diamond_search vp8_diamond_search_sadx4
+
+#endif
+#endif
+
+#endif
+
diff --git a/vp8/encoder/x86/preproc_mmx.c b/vp8/encoder/x86/preproc_mmx.c
new file mode 100644
index 0000000..69617ca
--- /dev/null
+++ b/vp8/encoder/x86/preproc_mmx.c
@@ -0,0 +1,297 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "memory.h"
+#include "preproc.h"
+#include "pragmas.h"
+
+/****************************************************************************
+*  Macros
+****************************************************************************/
+#define FRAMECOUNT 7
+#define ROUNDUP32(X) ( ( ( (unsigned long) X ) + 31 )&( 0xFFFFFFE0 ) )
+
+/****************************************************************************
+*  Imports
+****************************************************************************/
+extern void vpx_get_processor_flags(int *mmx_enabled, int *xmm_enabled, int *wmt_enabled);
+
+/****************************************************************************
+*  Exported Global Variables
+****************************************************************************/
+void (*temp_filter)(pre_proc_instance *ppi, unsigned char *s, unsigned char *d, int bytes, int strength);
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_wmt
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter.
+ *                  int strength         : Strength of filter to apply.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur.
+ *
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
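+/* A scalar model of the per-pixel filter below, for illustration only.
+ * The helper name is hypothetical, and fixed_divide[] is assumed to
+ * hold reciprocals scaled by 1 << 16 (roughly 65536 / i), matching the
+ * final shift by 16:
+ *
+ *    static unsigned char temp_filter_px(const unsigned char *frames,
+ *                                        unsigned char src, int strength,
+ *                                        const unsigned int *fixed_divide)
+ *    {
+ *        unsigned int sum = 0, count = 0;
+ *        int f;
+ *        for (f = 0; f < FRAMECOUNT; f++)
+ *        {
+ *            int diff     = frames[f] - src;
+ *            int modifier = 16 - 3 * ((diff * diff) >> strength);
+ *            if (modifier < 0) modifier = 0;   (psubusw saturates at 0)
+ *            sum   += modifier * frames[f];
+ *            count += modifier;
+ *        }
+ *        return (unsigned char)(((sum + count / 2) *
+ *                                fixed_divide[count]) >> 16);
+ *    }
+ */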
+void temp_filter_wmt
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;
+    unsigned char *frameptr = ppi->frame_buffer;
+
+    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3, 3, 3, 3, 3};
+    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16, 16, 16, 16, 16};
+
+    if (ppi->frame == 0)
+    {
+        do
+        {
+            int i;
+            int frame = 0;
+
+            do
+            {
+                for (i = 0; i < 8; i++)
+                {
+                    *frameptr = s[byte+i];
+                    ++frameptr;
+                }
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            for (i = 0; i < 8; i++)
+                d[byte+i] = s[byte+i];
+
+            byte += 8;
+
+        }
+        while (byte < bytes);
+    }
+    else
+    {
+        int i;
+        int offset2 = (ppi->frame % FRAMECOUNT);
+
+        do
+        {
+            __declspec(align(16)) unsigned short counts[8];
+            __declspec(align(16)) unsigned short sums[8];
+            __asm
+            {
+                mov         eax, offset2
+                mov         edi, s                  // source pixels
+                pxor        xmm1, xmm1              // accumulator
+
+                pxor        xmm7, xmm7
+
+                mov         esi, frameptr           // frame buffer pointer
+                pxor        xmm2, xmm2              // count
+
+                movq        xmm3, QWORD PTR [edi]
+
+                movq        QWORD PTR [esi+8*eax], xmm3
+
+                punpcklbw   xmm3, xmm2              // xmm3 source pixels
+                mov         ecx,  FRAMECOUNT
+
+                next_frame:
+                movq        xmm4, QWORD PTR [esi]   // get frame buffer values
+                punpcklbw   xmm4, xmm7              // xmm4 frame buffer pixels
+                movdqa      xmm6, xmm4              // save the pixel values
+                psubsw      xmm4, xmm3              // subtracted pixel values
+                pmullw      xmm4, xmm4              // square xmm4
+                movd        xmm5, strength
+                psrlw       xmm4, xmm5              // shift by strength
+                pmullw      xmm4, threes            // 3 * modifier
+                movdqa      xmm5, sixteens          // 16s
+                psubusw     xmm5, xmm4              // 16 - modifiers
+                movdqa      xmm4, xmm5              // save the modifiers
+                pmullw      xmm4, xmm6              // multiplier values
+                paddusw     xmm1, xmm4              // accumulator
+                paddusw     xmm2, xmm5              // count
+                add         esi, 8                  // next frame
+                dec         ecx                     // loop over all frames
+                jnz         next_frame
+
+                movdqa      counts, xmm2
+                psrlw       xmm2, 1                 // divide count by 2 for rounding
+                paddusw     xmm1, xmm2              // rounding added in
+
+                mov         frameptr, esi
+
+                movdqa      sums, xmm1
+            }
+
+            for (i = 0; i < 8; i++)
+            {
+                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+                blurvalue >>= 16;
+                d[i] = blurvalue;
+            }
+
+            s += 8;
+            d += 8;
+            byte += 8;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;
+    __asm emms
+}
+
+/****************************************************************************
+ *
+ *  ROUTINE       : temp_filter_mmx
+ *
+ *  INPUTS        : pre_proc_instance *ppi : Pointer to pre-processor instance.
+ *                  unsigned char *s     : Pointer to source frame.
+ *                  unsigned char *d     : Pointer to destination frame.
+ *                  int bytes            : Number of bytes to filter.
+ *                  int strength         : Strength of filter to apply.
+ *
+ *  OUTPUTS       : None.
+ *
+ *  RETURNS       : void
+ *
+ *  FUNCTION      : Performs a closeness-adjusted temporal blur.
+ *
+ *  SPECIAL NOTES : Destination frame can be same as source frame.
+ *
+ ****************************************************************************/
+void temp_filter_mmx
+(
+    pre_proc_instance *ppi,
+    unsigned char *s,
+    unsigned char *d,
+    int bytes,
+    int strength
+)
+{
+    int byte = 0;
+    unsigned char *frameptr = ppi->frame_buffer;
+
+    __declspec(align(16)) unsigned short threes[]  = { 3, 3, 3, 3};
+    __declspec(align(16)) unsigned short sixteens[] = {16, 16, 16, 16};
+
+    if (ppi->frame == 0)
+    {
+        do
+        {
+            int i;
+            int frame = 0;
+
+            do
+            {
+                for (i = 0; i < 4; i++)
+                {
+                    *frameptr = s[byte+i];
+                    ++frameptr;
+                }
+
+                ++frame;
+            }
+            while (frame < FRAMECOUNT);
+
+            for (i = 0; i < 4; i++)
+                d[byte+i] = s[byte+i];
+
+            byte += 4;
+
+        }
+        while (byte < bytes);
+    }
+    else
+    {
+        int i;
+        int offset2 = (ppi->frame % FRAMECOUNT);
+
+        do
+        {
+            __declspec(align(16)) unsigned short counts[8];
+            __declspec(align(16)) unsigned short sums[8];
+            __asm
+            {
+
+                mov         eax, offset2
+                mov         edi, s                  // source pixels
+                pxor        mm1, mm1                // accumulator
+                pxor        mm7, mm7
+
+                mov         esi, frameptr           // frame buffer pointer
+                pxor        mm2, mm2                // count
+
+                movd        mm3, DWORD PTR [edi]
+                movd        DWORD PTR [esi+4*eax], mm3
+
+                punpcklbw   mm3, mm2                // mm3 source pixels
+                mov         ecx,  FRAMECOUNT
+
+                next_frame:
+                movd        mm4, DWORD PTR [esi]    // get frame buffer values
+                punpcklbw   mm4, mm7                // mm4 frame buffer pixels
+                movq        mm6, mm4                // save the pixel values
+                psubsw      mm4, mm3                // subtracted pixel values
+                pmullw      mm4, mm4                // square mm4
+                movd        mm5, strength
+                psrlw       mm4, mm5                // shift by strength
+                pmullw      mm4, threes             // 3 * modifier
+                movq        mm5, sixteens           // 16s
+                psubusw     mm5, mm4                // 16 - modifiers
+                movq        mm4, mm5                // save the modifiers
+                pmullw      mm4, mm6                // multiplier values
+                paddusw     mm1, mm4                // accumulator
+                paddusw     mm2, mm5                // count
+                add         esi, 4                  // next frame
+                dec         ecx                     // loop over all frames
+                jnz         next_frame
+
+                movq        counts, mm2
+                psrlw       mm2, 1                  // divide count by 2 for rounding
+                paddusw     mm1, mm2                // rounding added in
+
+                mov         frameptr, esi
+
+                movq        sums, mm1
+
+            }
+
+            for (i = 0; i < 4; i++)
+            {
+                int blurvalue = sums[i] * ppi->fixed_divide[counts[i]];
+                blurvalue >>= 16;
+                d[i] = blurvalue;
+            }
+
+            s += 4;
+            d += 4;
+            byte += 4;
+        }
+        while (byte < bytes);
+    }
+
+    ++ppi->frame;
+    __asm emms
+}
diff --git a/vp8/encoder/x86/quantize_mmx.asm b/vp8/encoder/x86/quantize_mmx.asm
new file mode 100644
index 0000000..847fc6e
--- /dev/null
+++ b/vp8/encoder/x86/quantize_mmx.asm
@@ -0,0 +1,438 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
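+; A reference sketch of the fast quantizer below (illustrative only;
+; scan_mask feeds the eob computation, which is handled separately at
+; the end of each routine):
+;
+;    void fast_quantize_ref(const short *coeff, const short *zbin,
+;                           short *qcoeff, const short *dequant,
+;                           const short *round, const short *quant,
+;                           short *dqcoeff)
+;    {
+;        int i;
+;        for (i = 0; i < 16; i++)
+;        {
+;            int sign = coeff[i] >> 15;               /* 0 or -1 */
+;            int x    = (coeff[i] ^ sign) - sign;     /* abs(coeff[i]) */
+;            int y    = (x < zbin[i]) ? 0 :
+;                       (int)(((unsigned)(x + round[i]) *
+;                              (unsigned short)quant[i]) >> 16);
+;            qcoeff[i]  = (short)((y ^ sign) - sign); /* restore sign */
+;            dqcoeff[i] = (short)(qcoeff[i] * dequant[i]);
+;        }
+;    }
+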
+;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_mmx)
+sym(vp8_fast_quantize_b_impl_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;coeff_ptr
+        movq            mm0,        [rsi]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm1,        [rax]
+
+        movq            mm3,        mm0
+        psraw           mm0,        15
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0         ; abs
+
+        movq            mm2,        mm3
+        pcmpgtw         mm1,        mm2
+
+        pandn           mm1,        mm2
+        movq            mm3,        mm1
+
+        mov             rdx,        arg(6) ;quant_ptr
+        movq            mm1,        [rdx]
+
+        mov             rcx,        arg(5) ;round_ptr
+        movq            mm2,        [rcx]
+
+        paddw           mm3,        mm2
+        pmulhuw         mm3,        mm1
+
+        pxor            mm3,        mm0
+        psubw           mm3,        mm0     ;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+        movq            mm0,        mm3
+
+        movq            [rdi],      mm3
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm2,        [rax]
+
+        pmullw          mm3,        mm2
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax],      mm3
+
+        ; next 8
+        movq            mm4,        [rsi+8]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+8]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+8]
+        movq            mm6,        [rcx+8]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+8],    mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+8]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+8],    mm7
+
+
+                ; next 8
+        movq            mm4,        [rsi+16]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+16]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+16]
+        movq            mm6,        [rcx+16]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+16],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+16]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+16],   mm7
+
+
+                ; next 8
+        movq            mm4,        [rsi+24]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movq            mm5,        [rax+24]
+
+        movq            mm7,        mm4
+        psraw           mm4,        15
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4         ; abs
+
+        movq            mm6,        mm7
+        pcmpgtw         mm5,        mm6
+
+        pandn           mm5,        mm6
+        movq            mm7,        mm5
+
+        movq            mm5,        [rdx+24]
+        movq            mm6,        [rcx+24]
+
+        paddw           mm7,        mm6
+        pmulhuw         mm7,        mm5
+
+        pxor            mm7,        mm4
+        psubw           mm7,        mm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movq            mm1,        mm7
+        movq            [rdi+24],   mm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movq            mm6,        [rax+24]
+
+        pmullw          mm7,        mm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movq            [rax+24],   mm7
+
+
+
+        mov             rdi,        arg(4) ;scan_mask
+        mov             rsi,        arg(2) ;qcoeff_ptr
+
+        pxor            mm5,        mm5
+        pxor            mm7,        mm7
+
+        movq            mm0,        [rsi]
+        movq            mm1,        [rsi+8]
+
+        movq            mm2,        [rdi]
+        movq            mm3,        [rdi+8]
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        movq            mm5,        mm0
+
+        paddd           mm5,        mm1
+
+        movq            mm0,        [rsi+16]
+        movq            mm1,        [rsi+24]
+
+        movq            mm2,        [rdi+16]
+        movq            mm3,        [rdi+24]
+
+        pcmpeqw         mm0,        mm7
+        pcmpeqw         mm1,        mm7
+
+        pcmpeqw         mm6,        mm6
+        pxor            mm0,        mm6
+
+        pxor            mm1,        mm6
+        psrlw           mm0,        15
+
+        psrlw           mm1,        15
+        pmaddwd         mm0,        mm2
+
+        pmaddwd         mm1,        mm3
+        paddd           mm5,        mm0
+
+        paddd           mm5,        mm1
+        movq            mm0,        mm5
+
+        psrlq           mm5,        32
+        paddd           mm0,        mm5
+
+        ; eob adjustment begins here
+        movd            rcx,        mm0
+        and             rcx,        0xffff
+
+        xor             rdx,        rdx
+        sub             rdx,        rcx ; rdx=-rcx
+
+        bsr             rax,        rcx
+        inc             rax
+
+        sar             rdx,        31
+        and             rax,        rdx
+        ; The branchless assembly above replaces the old mixed mmx
+        ; assembly/C eob computation; the original logic is kept below as
+        ; reference.  (rdx = -rcx, so "sar rdx, 31" yields all ones when
+        ; rcx != 0 and zero when rcx == 0, masking rax to 0 when no
+        ; coefficients are set.)
+        ;    movd            rcx,        mm0
+        ;    bsr             rax,        rcx
+        ;
+        ;    mov             eob,        rax
+        ;    mov             eee,        rcx
+        ;
+        ;if(eee==0)
+        ;{
+        ;    eob=-1;
+        ;}
+        ;else if(eee<0)
+        ;{
+        ;    eob=15;
+        ;}
+        ;d->eob = eob+1;
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+;                           short *qcoeff_ptr,short *dequant_ptr,
+;                           short *scan_mask, short *round_ptr,
+;                           short *quant_ptr, short *dqcoeff_ptr);
+global sym(vp8_fast_quantize_b_impl_sse)
+sym(vp8_fast_quantize_b_impl_sse):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;coeff_ptr
+        movdqa          xmm0,       [rsi]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movdqa          xmm1,       [rax]
+
+        movdqa          xmm3,       xmm0
+        psraw           xmm0,       15
+
+        pxor            xmm3,       xmm0
+        psubw           xmm3,       xmm0            ; abs
+
+        movdqa          xmm2,       xmm3
+        pcmpgtw         xmm1,       xmm2
+
+        pandn           xmm1,       xmm2
+        movdqa          xmm3,       xmm1
+
+        mov             rdx,        arg(6) ; quant_ptr
+        movdqa          xmm1,       [rdx]
+
+        mov             rcx,        arg(5) ; round_ptr
+        movdqa          xmm2,       [rcx]
+
+        paddw           xmm3,       xmm2
+        pmulhuw         xmm3,       xmm1
+
+        pxor            xmm3,       xmm0
+        psubw           xmm3,       xmm0        ;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+        movdqa          xmm0,       xmm3
+
+        movdqa          [rdi],      xmm3
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movdqa          xmm2,       [rax]
+
+        pmullw          xmm3,       xmm2
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movdqa          [rax],      xmm3
+
+        ; next 8
+        movdqa          xmm4,       [rsi+16]
+
+        mov             rax,        arg(1) ;zbin_ptr
+        movdqa          xmm5,       [rax+16]
+
+        movdqa          xmm7,       xmm4
+        psraw           xmm4,       15
+
+        pxor            xmm7,       xmm4
+        psubw           xmm7,       xmm4            ; abs
+
+        movdqa          xmm6,       xmm7
+        pcmpgtw         xmm5,       xmm6
+
+        pandn           xmm5,       xmm6
+        movdqa          xmm7,       xmm5
+
+        movdqa          xmm5,       [rdx+16]
+        movdqa          xmm6,       [rcx+16]
+
+
+        paddw           xmm7,       xmm6
+        pmulhuw         xmm7,       xmm5
+
+        pxor            xmm7,       xmm4
+        psubw           xmm7,       xmm4;gain the sign back
+
+        mov             rdi,        arg(2) ;qcoeff_ptr
+
+        movdqa          xmm1,       xmm7
+        movdqa          [rdi+16],   xmm7
+
+        mov             rax,        arg(3) ;dequant_ptr
+        movdqa          xmm6,       [rax+16]
+
+        pmullw          xmm7,       xmm6
+        mov             rax,        arg(7) ;dqcoeff_ptr
+
+        movdqa          [rax+16],   xmm7
+        mov             rdi,        arg(4) ;scan_mask
+
+        pxor            xmm7,       xmm7
+        movdqa          xmm2,       [rdi]
+
+        movdqa          xmm3,       [rdi+16]
+        pcmpeqw         xmm0,       xmm7
+
+        pcmpeqw         xmm1,       xmm7
+        pcmpeqw         xmm6,       xmm6
+
+        pxor            xmm0,       xmm6
+        pxor            xmm1,       xmm6
+
+        psrlw           xmm0,       15
+        psrlw           xmm1,       15
+
+        pmaddwd         xmm0,       xmm2
+        pmaddwd         xmm1,       xmm3
+
+        movq            xmm2,       xmm0
+        movq            xmm3,       xmm1
+
+        psrldq          xmm0,       8
+        psrldq          xmm1,       8
+
+        paddd           xmm0,       xmm1
+        paddd           xmm2,       xmm3
+
+        paddd           xmm0,       xmm2
+        movq            xmm1,       xmm0
+
+        psrldq          xmm0,       4
+        paddd           xmm1,       xmm0
+
+        movd            rcx,        xmm1
+        and             rcx,        0xffff
+
+        xor             rdx,        rdx
+        sub             rdx,        rcx
+
+        bsr             rax,        rcx
+        inc             rax
+
+        sar             rdx,        31
+        and             rax,        rdx
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
new file mode 100644
index 0000000..a825698
--- /dev/null
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -0,0 +1,428 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+global sym(vp8_sad16x16_mmx)
+global sym(vp8_sad8x16_mmx)
+global sym(vp8_sad8x8_mmx)
+global sym(vp8_sad4x4_mmx)
+global sym(vp8_sad16x8_mmx)
+
+%idefine QWORD
+
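+; All of the SAD routines in this file share the same reference
+; behavior (sketch only; w and h select the block size).  The
+; psubusb/por pair in the loops below is the classic branch-free
+; byte |a - b|:
+;
+;    unsigned int sad_ref(const unsigned char *src, int src_stride,
+;                         const unsigned char *ref, int ref_stride,
+;                         int w, int h)
+;    {
+;        unsigned int sad = 0;
+;        int r, c;
+;        for (r = 0; r < h; r++, src += src_stride, ref += ref_stride)
+;            for (c = 0; c < w; c++)
+;                sad += (src[c] > ref[c]) ? (unsigned)(src[c] - ref[c])
+;                                         : (unsigned)(ref[c] - src[c]);
+;        return sad;
+;    }
+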
+;unsigned int vp8_sad16x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp8_sad16x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
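+        ; MMX has no psadbw, so |src - ref| is computed per byte as
+        ; (src -sat ref) | (ref -sat src): one saturating subtraction is
+        ; zero, the other the absolute difference. The bytes are widened to
+        ; words against mm6 (zero) and accumulated into mm7.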
+x16x16sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1
+        psubusb         mm1,        mm4
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1
+        por             mm2,        mm3
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6
+        punpcklbw       mm2,        mm6
+
+        punpckhbw       mm1,        mm6
+        punpckhbw       mm3,        mm6
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+
+        lea             rsi,        [rsi+rax]
+        add             rdi,        rdx
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm1
+
+        cmp             rsi,        rcx
+        jne             x16x16sad_mmx_loop
+
+
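+        ; horizontal reduction: widen the four word sums to dwords against
+        ; mm6 (still zero), add the halves, then fold the upper dword down
+        ; so the total SAD lands in the low 32 bits for the return value.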
+        movq            mm0,        mm7
+
+        punpcklwd       mm0,        mm6
+        punpckhwd       mm7,        mm6
+
+        paddw           mm0,        mm7
+        movq            mm7,        mm0
+
+
+        psrlq           mm0,        32
+        paddw           mm7,        mm0
+
+        movd            rax,        mm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad8x16_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp8_sad8x16_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+x8x16sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6
+
+        punpckhbw       mm2,        mm6
+        lea             rsi,        [rsi+rax]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        paddw           mm7,        mm2
+        cmp             rsi,        rcx
+
+        jne             x8x16sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movd            rax,        mm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad8x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp8_sad8x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+x8x8sad_mmx_loop:
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        punpcklbw       mm0,        mm6
+
+        punpckhbw       mm2,        mm6
+        paddw           mm0,        mm2
+
+        lea             rsi,       [rsi+rax]
+        add             rdi,        rdx
+
+        paddw           mm7,       mm0
+        cmp             rsi,        rcx
+
+        jne             x8x8sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movd            rax,        mm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad4x4_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp8_sad4x4_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,       QWORD PTR [rsi]
+        movd            mm1,       QWORD PTR [rdi]
+
+        movd            mm2,       QWORD PTR [rsi+rax]
+        movd            mm3,       QWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movq            mm2,        mm0
+        psubusb         mm0,        mm1
+
+        psubusb         mm1,        mm2
+        por             mm0,        mm1
+
+        movq            mm2,        mm0
+        pxor            mm3,        mm3
+
+        punpcklbw       mm0,        mm3
+        punpckhbw       mm2,        mm3
+
+        paddw           mm0,        mm2
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movd            mm4,       QWORD PTR [rsi]
+        movd            mm5,       QWORD PTR [rdi]
+
+        movd            mm6,       QWORD PTR [rsi+rax]
+        movd            mm7,       QWORD PTR [rdi+rdx]
+
+        punpcklbw       mm4,        mm6
+        punpcklbw       mm5,        mm7
+
+        movq            mm6,        mm4
+        psubusb         mm4,        mm5
+
+        psubusb         mm5,        mm6
+        por             mm4,        mm5
+
+        movq            mm5,        mm4
+        punpcklbw       mm4,        mm3
+
+        punpckhbw       mm5,        mm3
+        paddw           mm4,        mm5
+
+        paddw           mm0,        mm4
+        movq            mm1,        mm0
+
+        punpcklwd       mm0,        mm3
+        punpckhwd       mm1,        mm3
+
+        paddw           mm0,        mm1
+        movq            mm1,        mm0
+
+        psrlq           mm0,        32
+        paddw           mm0,        mm1
+
+        movd            rax,        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad16x8_mmx(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+sym(vp8_sad16x8_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+        pxor            mm7,        mm7
+
+        pxor            mm6,        mm6
+
+x16x8sad_mmx_loop:
+
+        movq            mm0,       [rsi]
+        movq            mm1,       [rdi]
+
+        movq            mm2,        [rsi+8]
+        movq            mm3,        [rdi+8]
+
+        movq            mm4,        mm0
+        movq            mm5,        mm2
+
+        psubusb         mm0,        mm1
+        psubusb         mm1,        mm4
+
+        psubusb         mm2,        mm3
+        psubusb         mm3,        mm5
+
+        por             mm0,        mm1
+        por             mm2,        mm3
+
+        movq            mm1,        mm0
+        movq            mm3,        mm2
+
+        punpcklbw       mm0,        mm6
+        punpckhbw       mm1,        mm6
+
+        punpcklbw       mm2,        mm6
+        punpckhbw       mm3,        mm6
+
+
+        paddw           mm0,        mm2
+        paddw           mm1,        mm3
+
+        paddw           mm0,        mm1
+        lea             rsi,        [rsi+rax]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             x16x8sad_mmx_loop
+
+        movq            mm0,        mm7
+        punpcklwd       mm0,        mm6
+
+        punpckhwd       mm7,        mm6
+        paddw           mm0,        mm7
+
+        movq            mm7,        mm0
+        psrlq           mm0,        32
+
+        paddw           mm7,        mm0
+        movd            rax,        mm7
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    mov rsp, rbp
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
new file mode 100644
index 0000000..53240bb
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -0,0 +1,329 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+;unsigned int vp8_sad16x16_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+global sym(vp8_sad16x16_wmt)
+sym(vp8_sad16x16_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rax*8]
+
+        lea             rcx,        [rcx+rax*8]
+        pxor            xmm7,       xmm7
+
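+        ; psadbw sums |a-b| over eight byte pairs in hardware. The two
+        ; 8-byte halves of each row are interleaved with punpcklbw; since
+        ; src and ref are reordered identically, the SAD is unchanged and
+        ; one psadbw covers a whole 16-byte row.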
+x16x16sad_wmt_loop:
+
+        movq            xmm0,       QWORD PTR [rsi]
+        movq            xmm2,       QWORD PTR [rsi+8]
+
+        movq            xmm1,       QWORD PTR [rdi]
+        movq            xmm3,       QWORD PTR [rdi+8]
+
+        movq            xmm4,       QWORD PTR [rsi+rax]
+        movq            xmm5,       QWORD PTR [rdi+rdx]
+
+
+        punpcklbw       xmm0,       xmm2
+        punpcklbw       xmm1,       xmm3
+
+        psadbw          xmm0,       xmm1
+        movq            xmm6,       QWORD PTR [rsi+rax+8]
+
+        movq            xmm3,       QWORD PTR [rdi+rdx+8]
+        lea             rsi,        [rsi+rax*2]
+
+        lea             rdi,        [rdi+rdx*2]
+        punpcklbw       xmm4,       xmm6
+
+        punpcklbw       xmm5,       xmm3
+        psadbw          xmm4,       xmm5
+
+        paddw           xmm7,       xmm0
+        paddw           xmm7,       xmm4
+
+        cmp             rsi,        rcx
+        jne             x16x16sad_wmt_loop
+
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            rax,        xmm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp8_sad8x16_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp8_sad8x16_wmt)
+sym(vp8_sad8x16_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+
+        lea             rcx,        [rcx+rbx*8]
+        pxor            mm7,        mm7
+
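+        ; arg(4) is max_err: the running SAD is compared against it at the
+        ; top of each iteration so the motion search can abandon this
+        ; candidate as soon as it is already worse than the best so far.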
+x8x16sad_wmt_loop:
+
+        movd            rax,        mm7
+        cmp             rax,        arg(4)
+        jg              x8x16sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        movq            mm2,        QWORD PTR [rsi+rbx]
+        movq            mm3,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm2
+
+        cmp             rsi,        rcx
+        jne             x8x16sad_wmt_loop
+
+        movd            rax,        mm7
+
+x8x16sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad8x8_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp8_sad8x8_wmt)
+sym(vp8_sad8x8_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+        pxor            mm7,        mm7
+
+x8x8sad_wmt_loop:
+
+        movd            rax,        mm7
+        cmp             rax,        arg(4)
+        jg              x8x8sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm1,        QWORD PTR [rdi]
+
+        psadbw          mm0,        mm1
+        lea             rsi,        [rsi+rbx]
+
+        add             rdi,        rdx
+        paddw           mm7,        mm0
+
+        cmp             rsi,        rcx
+        jne             x8x8sad_wmt_loop
+
+        movd            rax,        mm7
+x8x8sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp8_sad4x4_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride)
+global sym(vp8_sad4x4_wmt)
+sym(vp8_sad4x4_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,       QWORD PTR [rsi]
+        movd            mm1,       QWORD PTR [rdi]
+
+        movd            mm2,       QWORD PTR [rsi+rax]
+        movd            mm3,       QWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        psadbw          mm0,        mm1
+        lea             rsi,        [rsi+rax*2]
+
+        lea             rdi,        [rdi+rdx*2]
+        movd            mm4,       QWORD PTR [rsi]
+
+        movd            mm5,       QWORD PTR [rdi]
+        movd            mm6,       QWORD PTR [rsi+rax]
+
+        movd            mm7,       QWORD PTR [rdi+rdx]
+        punpcklbw       mm4,        mm6
+
+        punpcklbw       mm5,        mm7
+        psadbw          mm4,        mm5
+
+        paddw           mm0,        mm4
+        movd            rax,        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_sad16x8_wmt(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+global sym(vp8_sad16x8_wmt)
+sym(vp8_sad16x8_wmt):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+        pxor            mm7,        mm7
+
+x16x8sad_wmt_loop:
+
+        movd            rax,        mm7
+        cmp             rax,        arg(4)
+        jg              x16x8sad_wmt_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        QWORD PTR [rsi+rbx]
+        movq            mm5,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        movq            mm1,        QWORD PTR [rsi+rbx+8]
+        movq            mm3,        QWORD PTR [rdi+rdx+8]
+
+        psadbw          mm4,        mm5
+        psadbw          mm1,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm0,        mm2
+        paddw           mm4,        mm1
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm4
+
+        cmp             rsi,        rcx
+        jne             x16x8sad_wmt_loop
+
+        movd            rax,        mm7
+
+x16x8sad_wmt_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
new file mode 100644
index 0000000..38cc029
--- /dev/null
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -0,0 +1,939 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
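+; PROCESS_16X2X3 SADs one pair of 16-wide source rows against three
+; candidate references at byte offsets 0, 1 and 2 from rdi, using lddqu
+; for the unaligned loads. Invoke with 1 to initialize the xmm5-xmm7
+; accumulators, with 0 to accumulate into them.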
+%macro PROCESS_16X2X3 1
+%if %1
+        movdqa          xmm0,       [rsi]
+        lddqu           xmm5,       [rdi]
+        lddqu           xmm6,       [rdi+1]
+        lddqu           xmm7,       [rdi+2]
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       [rsi]
+        lddqu           xmm1,       [rdi]
+        lddqu           xmm2,       [rdi+1]
+        lddqu           xmm3,       [rdi+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       QWORD PTR [rsi+rax]
+        lddqu           xmm1,       QWORD PTR [rdi+rdx]
+        lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       QWORD PTR [rdi+rdx+2]
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_8X2X3 1
+%if %1
+        movq            mm0,       [rsi]
+        movq            mm5,       [rdi]
+        movq            mm6,       [rdi+1]
+        movq            mm7,       [rdi+2]
+
+        psadbw          mm5,       mm0
+        psadbw          mm6,       mm0
+        psadbw          mm7,       mm0
+%else
+        movq            mm0,       [rsi]
+        movq            mm1,       [rdi]
+        movq            mm2,       [rdi+1]
+        movq            mm3,       [rdi+2]
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endif
+        movq            mm0,       QWORD PTR [rsi+rax]
+        movq            mm1,       QWORD PTR [rdi+rdx]
+        movq            mm2,       QWORD PTR [rdi+rdx+1]
+        movq            mm3,       QWORD PTR [rdi+rdx+2]
+
+        lea             rsi,       [rsi+rax*2]
+        lea             rdi,       [rdi+rdx*2]
+
+        psadbw          mm1,       mm0
+        psadbw          mm2,       mm0
+        psadbw          mm3,       mm0
+
+        paddw           mm5,       mm1
+        paddw           mm6,       mm2
+        paddw           mm7,       mm3
+%endmacro
+
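+; LOAD_X4_ADDRESSES fetches four reference pointers from the array at %1
+; into %2-%5; REG_SZ_BYTES (from x86_abi_support.asm) scales the indexing
+; by the native pointer size.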
+%macro LOAD_X4_ADDRESSES 5
+        mov             %2,         [%1+REG_SZ_BYTES*0]
+        mov             %3,         [%1+REG_SZ_BYTES*1]
+
+        mov             %4,         [%1+REG_SZ_BYTES*2]
+        mov             %5,         [%1+REG_SZ_BYTES*3]
+%endmacro
+
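+; PROCESS_16X2X4 is the four-candidate variant: the source row pair is
+; SADed against four independent reference pointers (rcx, rdx, rbx, rdi)
+; sharing the stride in rbp, accumulating into xmm4-xmm7.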
+%macro PROCESS_16X2X4 1
+%if %1
+        movdqa          xmm0,       [rsi]
+        lddqu           xmm4,       [rcx]
+        lddqu           xmm5,       [rdx]
+        lddqu           xmm6,       [rbx]
+        lddqu           xmm7,       [rdi]
+
+        psadbw          xmm4,       xmm0
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       [rsi]
+        lddqu           xmm1,       [rcx]
+        lddqu           xmm2,       [rdx]
+        lddqu           xmm3,       [rbx]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm4,       xmm1
+        lddqu           xmm1,       [rdi]
+        paddw           xmm5,       xmm2
+        paddw           xmm6,       xmm3
+
+        psadbw          xmm1,       xmm0
+        paddw           xmm7,       xmm1
+%endif
+        movdqa          xmm0,       QWORD PTR [rsi+rax]
+        lddqu           xmm1,       QWORD PTR [rcx+rbp]
+        lddqu           xmm2,       QWORD PTR [rdx+rbp]
+        lddqu           xmm3,       QWORD PTR [rbx+rbp]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm4,       xmm1
+        lddqu           xmm1,       QWORD PTR [rdi+rbp]
+        paddw           xmm5,       xmm2
+        paddw           xmm6,       xmm3
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rcx,        [rcx+rbp*2]
+
+        lea             rdx,        [rdx+rbp*2]
+        lea             rbx,        [rbx+rbp*2]
+
+        lea             rdi,        [rdi+rbp*2]
+
+        psadbw          xmm1,       xmm0
+        paddw           xmm7,       xmm1
+
+%endmacro
+
+%macro PROCESS_8X2X4 1
+%if %1
+        movq            mm0,        [rsi]
+        movq            mm4,        [rcx]
+        movq            mm5,        [rdx]
+        movq            mm6,        [rbx]
+        movq            mm7,        [rdi]
+
+        psadbw          mm4,        mm0
+        psadbw          mm5,        mm0
+        psadbw          mm6,        mm0
+        psadbw          mm7,        mm0
+%else
+        movq            mm0,        [rsi]
+        movq            mm1,        [rcx]
+        movq            mm2,        [rdx]
+        movq            mm3,        [rbx]
+
+        psadbw          mm1,        mm0
+        psadbw          mm2,        mm0
+        psadbw          mm3,        mm0
+
+        paddw           mm4,        mm1
+        movq            mm1,        [rdi]
+        paddw           mm5,        mm2
+        paddw           mm6,        mm3
+
+        psadbw          mm1,        mm0
+        paddw           mm7,        mm1
+%endif
+        movq            mm0,        QWORD PTR [rsi+rax]
+        movq            mm1,        QWORD PTR [rcx+rbp]
+        movq            mm2,        QWORD PTR [rdx+rbp]
+        movq            mm3,        QWORD PTR [rbx+rbp]
+
+        psadbw          mm1,        mm0
+        psadbw          mm2,        mm0
+        psadbw          mm3,        mm0
+
+        paddw           mm4,        mm1
+        movq            mm1,        QWORD PTR [rdi+rbp]
+        paddw           mm5,        mm2
+        paddw           mm6,        mm3
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rcx,        [rcx+rbp*2]
+
+        lea             rdx,        [rdx+rbp*2]
+        lea             rbx,        [rbx+rbp*2]
+
+        lea             rdi,        [rdi+rbp*2]
+
+        psadbw          mm1,        mm0
+        paddw           mm7,        mm1
+
+%endmacro
+
+;void vp8_sad16x16x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad16x16x3_sse3)
+sym(vp8_sad16x16x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad16x8x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad16x8x3_sse3)
+sym(vp8_sad16x8x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad8x16x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad8x16x3_sse3)
+sym(vp8_sad8x16x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X3 1
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+
+        mov             rdi,        arg(4) ;Results
+
+        movd            [rdi],      mm5
+        movd            [rdi+4],    mm6
+        movd            [rdi+8],    mm7
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad8x8x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad8x8x3_sse3)
+sym(vp8_sad8x8x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        PROCESS_8X2X3 1
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+        PROCESS_8X2X3 0
+
+        mov             rdi,        arg(4) ;Results
+
+        movd            [rdi],      mm5
+        movd            [rdi+4],    mm6
+        movd            [rdi+8],    mm7
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad4x4x3_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad4x4x3_sse3)
+sym(vp8_sad4x4x3_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        movd            mm0,        QWORD PTR [rsi]
+        movd            mm1,        QWORD PTR [rdi]
+
+        movd            mm2,        QWORD PTR [rsi+rax]
+        movd            mm3,        QWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movd            mm4,        QWORD PTR [rdi+1]
+        movd            mm5,        QWORD PTR [rdi+2]
+
+        movd            mm2,        QWORD PTR [rdi+rdx+1]
+        movd            mm3,        QWORD PTR [rdi+rdx+2]
+
+        psadbw          mm1,        mm0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        psadbw          mm4,        mm0
+        psadbw          mm5,        mm0
+
+
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        movd            mm0,        QWORD PTR [rsi]
+        movd            mm2,        QWORD PTR [rdi]
+
+        movd            mm3,        QWORD PTR [rsi+rax]
+        movd            mm6,        QWORD PTR [rdi+rdx]
+
+        punpcklbw       mm0,        mm3
+        punpcklbw       mm2,        mm6
+
+        movd            mm3,        QWORD PTR [rdi+1]
+        movd            mm7,        QWORD PTR [rdi+2]
+
+        psadbw          mm2,        mm0
+
+        paddw           mm1,        mm2
+
+        movd            mm2,        QWORD PTR [rdi+rdx+1]
+        movd            mm6,        QWORD PTR [rdi+rdx+2]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm6
+
+        psadbw          mm3,        mm0
+        psadbw          mm7,        mm0
+
+        paddw           mm3,        mm4
+        paddw           mm7,        mm5
+
+        mov             rdi,        arg(4) ;Results
+        movd            [rdi],      mm1
+
+        movd            [rdi+4],    mm3
+        movd            [rdi+8],    mm7
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp8_sad16x16_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  max_err)
+;%define lddqu movdqu
+global sym(vp8_sad16x16_sse3)
+sym(vp8_sad16x16_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        lea             rcx,        [rsi+rbx*8]
+
+        lea             rcx,        [rcx+rbx*8]
+        pxor            mm7,        mm7
+
+vp8_sad16x16_sse3_loop:
+
+        movd            rax,        mm7
+        cmp             rax,        arg(4)
+        jg              vp8_sad16x16_early_exit
+
+        movq            mm0,        QWORD PTR [rsi]
+        movq            mm2,        QWORD PTR [rsi+8]
+
+        movq            mm1,        QWORD PTR [rdi]
+        movq            mm3,        QWORD PTR [rdi+8]
+
+        movq            mm4,        QWORD PTR [rsi+rbx]
+        movq            mm5,        QWORD PTR [rdi+rdx]
+
+        psadbw          mm0,        mm1
+        psadbw          mm2,        mm3
+
+        movq            mm1,        QWORD PTR [rsi+rbx+8]
+        movq            mm3,        QWORD PTR [rdi+rdx+8]
+
+        psadbw          mm4,        mm5
+        psadbw          mm1,        mm3
+
+        lea             rsi,        [rsi+rbx*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        paddw           mm0,        mm2
+        paddw           mm4,        mm1
+
+        paddw           mm7,        mm0
+        paddw           mm7,        mm4
+
+        cmp             rsi,        rcx
+        jne             vp8_sad16x16_sse3_loop
+
+        movd            rax,        mm7
+
+vp8_sad16x16_early_exit:
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    pop         rbx
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad16x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad16x16x4d_sse3)
+sym(vp8_sad16x16x4d_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
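+        ; rbp is pressed into service as the shared ref_stride, so the frame
+        ; pointer is saved here and only restored once the macro calls are
+        ; done (arg() addresses the shadowed arguments through rbp, hence
+        ; the pop before arg(4) is read). LOAD_X4_ADDRESSES leaves the third
+        ; pointer in rax; the xchg swaps it with src_stride in rbx.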
+        push            rbp
+        mov             rdi,        arg(2) ; ref_ptr_base
+
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+        mov             rsi,        arg(0) ;src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rbp,        dword ptr arg(3) ;ref_stride
+
+        xchg            rbx,        rax
+
+        PROCESS_16X2X4 1
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+
+        pop             rbp
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm4
+        psrldq          xmm4,       8
+
+        paddw           xmm0,       xmm4
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+8],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+12],   xmm0
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad16x8x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr_base,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad16x8x4d_sse3)
+sym(vp8_sad16x8x4d_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+        push            rbp
+        mov             rdi,        arg(2) ; ref_ptr_base
+
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+        mov             rsi,        arg(0) ;src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rbp,        dword ptr arg(3) ;ref_stride
+
+        xchg            rbx,        rax
+
+        PROCESS_16X2X4 1
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+        PROCESS_16X2X4 0
+
+        pop             rbp
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm4
+        psrldq          xmm4,       8
+
+        paddw           xmm0,       xmm4
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+8],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+12],   xmm0
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad8x16x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad8x16x4d_sse3)
+sym(vp8_sad8x16x4d_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+        push            rbp
+        mov             rdi,        arg(2) ; ref_ptr_base
+
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+        mov             rsi,        arg(0) ;src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rbp,        dword ptr arg(3) ;ref_stride
+
+        xchg            rbx,        rax
+
+        PROCESS_8X2X4 1
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+
+        pop             rbp
+        mov             rdi,        arg(4) ;Results
+
+        movd            [rdi],      mm4
+        movd            [rdi+4],    mm5
+        movd            [rdi+8],    mm6
+        movd            [rdi+12],   mm7
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad8x8x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad8x8x4d_sse3)
+sym(vp8_sad8x8x4d_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+        push            rbp
+        mov             rdi,        arg(2) ; ref_ptr_base
+
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+        mov             rsi,        arg(0) ;src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rbp,        dword ptr arg(3) ;ref_stride
+
+        xchg            rbx,        rax
+
+        PROCESS_8X2X4 1
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+        PROCESS_8X2X4 0
+
+        pop             rbp
+        mov             rdi,        arg(4) ;Results
+
+        movd            [rdi],      mm4
+        movd            [rdi+4],    mm5
+        movd            [rdi+8],    mm6
+        movd            [rdi+12],   mm7
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad4x4x4d_sse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad4x4x4d_sse3)
+sym(vp8_sad4x4x4d_sse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rbx
+    ; end prolog
+
+        push            rbp
+        mov             rdi,        arg(2) ; ref_ptr_base
+
+        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
+
+        mov             rsi,        arg(0) ;src_ptr
+
+        movsxd          rbx,        dword ptr arg(1) ;src_stride
+        movsxd          rbp,        dword ptr arg(3) ;ref_stride
+
+        xchg            rbx,        rax
+
+        movd            mm0,        QWORD PTR [rsi]
+        movd            mm1,        QWORD PTR [rcx]
+
+        movd            mm2,        QWORD PTR [rsi+rax]
+        movd            mm3,        QWORD PTR [rcx+rbp]
+
+        punpcklbw       mm0,        mm2
+        punpcklbw       mm1,        mm3
+
+        movd            mm4,        QWORD PTR [rdx]
+        movd            mm5,        QWORD PTR [rbx]
+
+        movd            mm6,        QWORD PTR [rdi]
+        movd            mm2,        QWORD PTR [rdx+rbp]
+
+        movd            mm3,        QWORD PTR [rbx+rbp]
+        movd            mm7,        QWORD PTR [rdi+rbp]
+
+        psadbw          mm1,        mm0
+
+        punpcklbw       mm4,        mm2
+        punpcklbw       mm5,        mm3
+
+        punpcklbw       mm6,        mm7
+        psadbw          mm4,        mm0
+
+        psadbw          mm5,        mm0
+        psadbw          mm6,        mm0
+
+
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rcx,        [rcx+rbp*2]
+
+        lea             rdx,        [rdx+rbp*2]
+        lea             rbx,        [rbx+rbp*2]
+
+        lea             rdi,        [rdi+rbp*2]
+
+        movd            mm0,        QWORD PTR [rsi]
+        movd            mm2,        QWORD PTR [rcx]
+
+        movd            mm3,        QWORD PTR [rsi+rax]
+        movd            mm7,        QWORD PTR [rcx+rbp]
+
+        punpcklbw       mm0,        mm3
+        punpcklbw       mm2,        mm7
+
+        movd            mm3,        QWORD PTR [rdx]
+        movd            mm7,        QWORD PTR [rbx]
+
+        psadbw          mm2,        mm0
+        mov             rax,        rbp
+
+        pop             rbp
+        mov             rsi,        arg(4) ;Results
+
+        paddw           mm1,        mm2
+        movd            [rsi],      mm1
+
+        movd            mm2,        QWORD PTR [rdx+rax]
+        movd            mm1,        QWORD PTR [rbx+rax]
+
+        punpcklbw       mm3,        mm2
+        punpcklbw       mm7,        mm1
+
+        psadbw          mm3,        mm0
+        psadbw          mm7,        mm0
+
+        movd            mm2,        QWORD PTR [rdi]
+        movd            mm1,        QWORD PTR [rdi+rax]
+
+        paddw           mm3,        mm4
+        paddw           mm7,        mm5
+
+        movd            [rsi+4],    mm3
+        punpcklbw       mm2,        mm1
+
+        movd            [rsi+8],    mm7
+        psadbw          mm2,        mm0
+
+        paddw           mm2,        mm6
+        movd            [rsi+12],   mm2
+
+
+    ; begin epilog
+    pop         rbx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
new file mode 100644
index 0000000..1bb9561
--- /dev/null
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -0,0 +1,367 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%idefine QWORD
+
+%macro PROCESS_16X2X3 1
+%if %1
+        movdqa          xmm0,       [rsi]
+        lddqu           xmm5,       [rdi]
+        lddqu           xmm6,       [rdi+1]
+        lddqu           xmm7,       [rdi+2]
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       [rsi]
+        lddqu           xmm1,       [rdi]
+        lddqu           xmm2,       [rdi+1]
+        lddqu           xmm3,       [rdi+2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       QWORD PTR [rsi+rax]
+        lddqu           xmm1,       QWORD PTR [rdi+rdx]
+        lddqu           xmm2,       QWORD PTR [rdi+rdx+1]
+        lddqu           xmm3,       QWORD PTR [rdi+rdx+2]
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
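+; PROCESS_16X2X3_OFFSET replaces the three unaligned lddqu loads with
+; aligned movdqa loads of [rdi] and [rdi+16], then uses palignr to shift
+; the 32-byte window by %2, %2+1 and %2+2 bytes, where %2 is the
+; reference pointer's misalignment.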
+%macro PROCESS_16X2X3_OFFSET 2
+%if %1
+        movdqa          xmm0,       [rsi]
+        movdqa          xmm4,       [rdi]
+        movdqa          xmm7,       [rdi+16]
+
+        movdqa          xmm5,       xmm7
+        palignr         xmm5,       xmm4,       %2
+
+        movdqa          xmm6,       xmm7
+        palignr         xmm6,       xmm4,       (%2+1)
+
+        palignr         xmm7,       xmm4,       (%2+2)
+
+        psadbw          xmm5,       xmm0
+        psadbw          xmm6,       xmm0
+        psadbw          xmm7,       xmm0
+%else
+        movdqa          xmm0,       [rsi]
+        movdqa          xmm4,       [rdi]
+        movdqa          xmm3,       [rdi+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endif
+        movdqa          xmm0,       QWORD PTR [rsi+rax]
+        movdqa          xmm4,       QWORD PTR [rdi+rdx]
+        movdqa          xmm3,       QWORD PTR [rdi+rdx+16]
+
+        movdqa          xmm1,       xmm3
+        palignr         xmm1,       xmm4,       %2
+
+        movdqa          xmm2,       xmm3
+        palignr         xmm2,       xmm4,       (%2+1)
+
+        palignr         xmm3,       xmm4,       (%2+2)
+
+        lea             rsi,        [rsi+rax*2]
+        lea             rdi,        [rdi+rdx*2]
+
+        psadbw          xmm1,       xmm0
+        psadbw          xmm2,       xmm0
+        psadbw          xmm3,       xmm0
+
+        paddw           xmm5,       xmm1
+        paddw           xmm6,       xmm2
+        paddw           xmm7,       xmm3
+%endmacro
+
+%macro PROCESS_16X16X3_OFFSET 2
+%2_aligned_by_%1:
+
+        sub             rdi,        %1
+
+        PROCESS_16X2X3_OFFSET 1, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+
+        jmp             %2_store_off
+
+%endmacro
+
+%macro PROCESS_16X8X3_OFFSET 2
+%2_aligned_by_%1:
+
+        sub             rdi,        %1
+
+        PROCESS_16X2X3_OFFSET 1, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+        PROCESS_16X2X3_OFFSET 0, %1
+
+        jmp             %2_store_off
+
+%endmacro
+
+;void vp8_sad16x16x3_ssse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad16x16x3_ssse3)
+sym(vp8_sad16x16x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi
+
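+        ; rdx now holds ref_ptr & 15. The call/pop sequence below recovers
+        ; the current instruction pointer so the table of 32-bit relative
+        ; offsets can be resolved position-independently, dispatching to
+        ; the loop copy specialized for this alignment.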
+        jmp vp8_sad16x16x3_ssse3_skiptable
+vp8_sad16x16x3_ssse3_jumptable:
+        dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
+        dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_skiptable:
+
+        call vp8_sad16x16x3_ssse3_do_jump
+vp8_sad16x16x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
+
+        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+
+vp8_sad16x16x3_ssse3_aligned_by_15:
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+vp8_sad16x16x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_sad16x8x3_ssse3(
+;    unsigned char *src_ptr,
+;    int  src_stride,
+;    unsigned char *ref_ptr,
+;    int  ref_stride,
+;    int  *results)
+global sym(vp8_sad16x8x3_ssse3)
+sym(vp8_sad16x8x3_ssse3):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push        rsi
+    push        rdi
+    push        rcx
+    ; end prolog
+
+        mov             rsi,        arg(0) ;src_ptr
+        mov             rdi,        arg(2) ;ref_ptr
+
+        mov             rdx,        0xf
+        and             rdx,        rdi
+
+        jmp vp8_sad16x8x3_ssse3_skiptable
+vp8_sad16x8x3_ssse3_jumptable:
+        dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
+        dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_skiptable:
+
+        call vp8_sad16x8x3_ssse3_do_jump
+vp8_sad16x8x3_ssse3_do_jump:
+        pop             rcx                         ; get the address of do_jump
+        mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+        add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
+
+        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
+        add             rcx,        rax
+
+        movsxd          rax,        dword ptr arg(1) ;src_stride
+        movsxd          rdx,        dword ptr arg(3) ;ref_stride
+
+        jmp             rcx
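+        ; dispatch note: the call/pop pair above materializes the
+        ; runtime address of vp8_sad16x8x3_ssse3_do_jump in rcx (a
+        ; position-independent way to read the instruction pointer),
+        ; rax picks up the signed dword jumptable entry selected by
+        ; the low four alignment bits of ref_ptr, and the jmp lands on
+        ; the matching aligned_by_N handler; roughly:
+        ;     handler = &do_jump + jumptable[(uintptr_t)ref_ptr & 0xf];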
+
+        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+
+vp8_sad16x8x3_ssse3_aligned_by_15:
+
+        PROCESS_16X2X3 1
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+        PROCESS_16X2X3 0
+
+vp8_sad16x8x3_ssse3_store_off:
+        mov             rdi,        arg(4) ;Results
+
+        movq            xmm0,       xmm5
+        psrldq          xmm5,       8
+
+        paddw           xmm0,       xmm5
+        movd            [rdi],      xmm0
+;-
+        movq            xmm0,       xmm6
+        psrldq          xmm6,       8
+
+        paddw           xmm0,       xmm6
+        movd            [rdi+4],    xmm0
+;-
+        movq            xmm0,       xmm7
+        psrldq          xmm7,       8
+
+        paddw           xmm0,       xmm7
+        movd            [rdi+8],    xmm0
+
+    ; begin epilog
+    pop         rcx
+    pop         rdi
+    pop         rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
new file mode 100644
index 0000000..ce3e610
--- /dev/null
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -0,0 +1,431 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
+;                            unsigned short *diff, unsigned char *Predictor,
+;                            int pitch);
+global sym(vp8_subtract_b_mmx_impl)
+sym(vp8_subtract_b_mmx_impl):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+
+        mov     rdi,        arg(2) ;diff
+        mov     rax,        arg(3) ;Predictor
+        mov     rsi,        arg(0) ;z
+        movsxd  rdx,        dword ptr arg(1);src_stride;
+        movsxd  rcx,        dword ptr arg(4);pitch
+        pxor    mm7,        mm7
+
+        movd    mm0,        [rsi]
+        movd    mm1,        [rax]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi],      mm0
+
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*2],mm0
+
+
+        movd    mm0,        [rsi+rdx*2]
+        movd    mm1,        [rax+rcx*2]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*4],        mm0
+
+        lea     rsi,        [rsi+rdx*2]
+        lea     rcx,        [rcx+rcx*2]
+
+
+
+        movd    mm0,        [rsi+rdx]
+        movd    mm1,        [rax+rcx]
+        punpcklbw   mm0,    mm7
+        punpcklbw   mm1,    mm7
+        psubw   mm0,        mm1
+        movq    [rdi+rcx*2],        mm0
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
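+; For reference, a rough C equivalent of the routine above (the name
+; subtract_b_ref is illustrative, and `short *diff` stands in for the
+; `unsigned short *` in the prototype, since the stores are signed
+; differences):
+;
+;    void subtract_b_ref(unsigned char *z, int src_stride,
+;                        short *diff, unsigned char *pred, int pitch)
+;    {
+;        int r, c;
+;
+;        for (r = 0; r < 4; r++)
+;            for (c = 0; c < 4; c++)
+;                diff[r * pitch + c] =
+;                    (short)(z[r * src_stride + c] - pred[r * pitch + c]);
+;    }
+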
+;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
+global sym(vp8_subtract_mby_mmx)
+sym(vp8_subtract_mby_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    ; end prolog
+
+
+            mov         rsi,            arg(1) ;src
+            mov         rdi,            arg(0) ;diff
+
+            mov         rax,            arg(2) ;pred
+            movsxd      rdx,            dword ptr arg(3) ;stride
+
+            mov         rcx,            16
+            pxor        mm0,            mm0
+
+submby_loop:
+
+            movq        mm1,            [rsi]
+            movq        mm3,            [rax]
+
+            movq        mm2,            mm1
+            movq        mm4,            mm3
+
+            punpcklbw   mm1,            mm0
+            punpcklbw   mm3,            mm0
+
+            punpckhbw   mm2,            mm0
+            punpckhbw   mm4,            mm0
+
+            psubw       mm1,            mm3
+            psubw       mm2,            mm4
+
+            movq        [rdi],          mm1
+            movq        [rdi+8],        mm2
+
+
+            movq        mm1,            [rsi+8]
+            movq        mm3,            [rax+8]
+
+            movq        mm2,            mm1
+            movq        mm4,            mm3
+
+            punpcklbw   mm1,            mm0
+            punpcklbw   mm3,            mm0
+
+            punpckhbw   mm2,            mm0
+            punpckhbw   mm4,            mm0
+
+            psubw       mm1,            mm3
+            psubw       mm2,            mm4
+
+            movq        [rdi+16],       mm1
+            movq        [rdi+24],       mm2
+
+
+            add         rdi,            32
+            add         rax,            16
+
+            lea         rsi,            [rsi+rdx]
+
+            sub         rcx,            1
+            jnz         submby_loop
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
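+; For reference, a rough C equivalent of vp8_subtract_mby_mmx (the name
+; subtract_mby_ref is illustrative); pred and diff are contiguous
+; 16-wide rows, only src is strided:
+;
+;    void subtract_mby_ref(short *diff, unsigned char *src,
+;                          unsigned char *pred, int stride)
+;    {
+;        int r, c;
+;
+;        for (r = 0; r < 16; r++)
+;            for (c = 0; c < 16; c++)
+;                diff[r * 16 + c] = src[r * stride + c] - pred[r * 16 + c];
+;    }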
+
+;void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
+global sym(vp8_subtract_mbuv_mmx)
+sym(vp8_subtract_mbuv_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    push rsi
+    push rdi
+    ; end prolog
+
+    ;short *udiff = diff + 256;
+    ;short *vdiff = diff + 320;
+    ;unsigned char *upred = pred + 256;
+    ;unsigned char *vpred = pred + 320;
+
+        ;unsigned char  *z    = usrc;
+        ;unsigned short *diff = udiff;
+        ;unsigned char  *Predictor= upred;
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(1) ;z = usrc
+            add     rdi,        256*2  ;diff = diff + 256 (shorts)
+            add     rax,        256    ;Predictor = pred + 256
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            pxor    mm7,        mm7
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+
+            add     rdi,        64
+            add     rax,        32
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+        ;unsigned char  *z    = vsrc;
+        ;unsigned short *diff = vdiff;
+        ;unsigned char  *Predictor= vpred;
+
+            mov     rdi,        arg(0) ;diff
+            mov     rax,        arg(3) ;pred
+            mov     rsi,        arg(2) ;z = vsrc
+            add     rdi,        320*2  ;diff = diff + 320 (shorts)
+            add     rax,        320    ;Predictor = pred + 320
+            movsxd  rdx,        dword ptr arg(4) ;stride;
+            pxor    mm7,        mm7
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+
+            add     rdi,        64
+            add     rax,        32
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi]
+            movq    mm1,        [rax]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi],      mm0
+            movq    [rdi+8],    mm3
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+8]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+16],   mm0
+            movq    [rdi+24],   mm3
+
+            movq    mm0,        [rsi+rdx*2]
+            movq    mm1,        [rax+16]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+            movq    [rdi+32],   mm0
+            movq    [rdi+40],   mm3
+            lea     rsi,        [rsi+rdx*2]
+
+
+            movq    mm0,        [rsi+rdx]
+            movq    mm1,        [rax+24]
+            movq    mm3,        mm0
+            movq    mm4,        mm1
+            punpcklbw   mm0,    mm7
+            punpcklbw   mm1,    mm7
+            punpckhbw   mm3,    mm7
+            punpckhbw   mm4,    mm7
+            psubw   mm0,        mm1
+            psubw   mm3,        mm4
+
+            movq    [rdi+48],   mm0
+            movq    [rdi+56],   mm3
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
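+
+; For reference, a rough C equivalent of the U-plane half above (the
+; V-plane half is identical with vsrc, diff + 320 and pred + 320; the
+; helper name is illustrative):
+;
+;    void subtract_u_plane_ref(short *diff, unsigned char *usrc,
+;                              unsigned char *pred, int stride)
+;    {
+;        short         *udiff = diff + 256;
+;        unsigned char *upred = pred + 256;
+;        int r, c;
+;
+;        for (r = 0; r < 8; r++)
+;            for (c = 0; c < 8; c++)
+;                udiff[r * 8 + c] = usrc[r * stride + c] - upred[r * 8 + c];
+;    }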
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
new file mode 100644
index 0000000..d0da82a
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -0,0 +1,980 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
+global sym(vp8_get_mb_ss_mmx)
+sym(vp8_get_mb_ss_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 8
+    ; end prolog
+
+        mov         rax, arg(0) ;src_ptr
+        mov         rcx, 16
+        pxor        mm4, mm4
+
+NEXTROW:
+        movq        mm0, [rax]
+        movq        mm1, [rax+8]
+        movq        mm2, [rax+16]
+        movq        mm3, [rax+24]
+        pmaddwd     mm0, mm0
+        pmaddwd     mm1, mm1
+        pmaddwd     mm2, mm2
+        pmaddwd     mm3, mm3
+
+        paddd       mm4, mm0
+        paddd       mm4, mm1
+        paddd       mm4, mm2
+        paddd       mm4, mm3
+
+        add         rax, 32
+        dec         rcx
+        ja          NEXTROW
+        movq        QWORD PTR [rsp], mm4
+
+        ;return sum[0]+sum[1];
+        movsxd      rax, dword ptr [rsp]
+        movsxd      rcx, dword ptr [rsp+4]
+        add         rax, rcx
+
+
+    ; begin epilog
+    add rsp, 8
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
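+; For reference, a rough C equivalent (the name get_mb_ss_ref is
+; illustrative): the routine returns the sum of squares of the 256
+; coefficients of one macroblock:
+;
+;    unsigned int get_mb_ss_ref(const short *src_ptr)
+;    {
+;        unsigned int i, sum = 0;
+;
+;        for (i = 0; i < 256; i++)
+;            sum += src_ptr[i] * src_ptr[i];
+;
+;        return sum;
+;    }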
+
+;unsigned int vp8_get8x8var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
+global sym(vp8_get8x8var_mmx)
+sym(vp8_get8x8var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5 (sum accumulator)
+        pxor        mm6, mm6                    ; Blank mm6 (zero for unpacking)
+        pxor        mm7, mm7                    ; Blank mm7 (SSE accumulator)
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 5
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        ;              movq        mm4, [rbx + rdx]
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 6
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 7
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 8
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
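+; For reference, a rough C equivalent of vp8_get8x8var_mmx (the name
+; get8x8var_ref is illustrative); the results are returned through
+; *SSE and *Sum, and the function value itself is always 0:
+;
+;    unsigned int get8x8var_ref(unsigned char *src, int src_stride,
+;                               unsigned char *ref, int ref_stride,
+;                               unsigned int *sse, int *sum)
+;    {
+;        int r, c, s = 0;
+;        unsigned int ss = 0;
+;
+;        for (r = 0; r < 8; r++)
+;            for (c = 0; c < 8; c++) {
+;                int d = src[r * src_stride + c] - ref[r * ref_stride + c];
+;                s  += d;
+;                ss += d * d;
+;            }
+;
+;        *sum = s;
+;        *sse = ss;
+;        return 0;
+;    }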
+
+
+;unsigned int
+;vp8_get4x4var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
+global sym(vp8_get4x4var_mmx)
+sym(vp8_get4x4var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5 (sum accumulator)
+        pxor        mm6, mm6                    ; Blank mm6 (zero for unpacking)
+        pxor        mm7, mm7                    ; Blank mm7 (SSE accumulator)
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;unsigned int
+;vp8_get4x4sse_cs_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride
+;)
+global sym(vp8_get4x4sse_cs_mmx)
+sym(vp8_get4x4sse_cs_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    push rsi
+    push rdi
+    push rbx
+    ; end prolog
+
+
+        pxor        mm6, mm6                    ; Blank mm6 (zero for unpacking)
+        pxor        mm7, mm7                    ; Blank mm7 (SSE accumulator)
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+        ; Row 1
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 2
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm1, mm6
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        movq        mm0,    mm7                 ;
+        psrlq       mm7,    32
+
+        paddd       mm0,    mm7
+        movd        rax,    mm0
+
+
+    ; begin epilog
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
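+; For reference, a rough C equivalent (the name get4x4sse_cs_ref is
+; illustrative); unlike the var routines above, this one returns the
+; 4x4 SSE directly and keeps no Sum:
+;
+;    unsigned int get4x4sse_cs_ref(unsigned char *src, int src_stride,
+;                                  unsigned char *ref, int ref_stride)
+;    {
+;        int r, c;
+;        unsigned int ss = 0;
+;
+;        for (r = 0; r < 4; r++)
+;            for (c = 0; c < 4; c++) {
+;                int d = src[r * src_stride + c] - ref[r * ref_stride + c];
+;                ss += d * d;
+;            }
+;
+;        return ss;
+;    }
+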
+%define mmx_filter_shift            7
+
+;void vp8_filter_block2d_bil4x4_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil4x4_var_mmx)
+sym(vp8_filter_block2d_bil4x4_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+
+        mov             rax,            arg(4) ;HFilter             ;
+        mov             rdx,            arg(5) ;VFilter             ;
+
+        mov             rsi,            arg(0) ;ref_ptr              ;
+        mov             rdi,            arg(2) ;src_ptr              ;
+
+        mov             rcx,            4                   ;
+        pxor            mm0,            mm0                 ;
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+%if ABI_IS_32BIT
+        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
+%else
+        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
+        add             rsi, r8
+%endif
+
+filter_block2d_bil4x4_var_mmx_loop:
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm3,            mm5                 ;
+
+        movq            mm5,            mm1                 ;
+        pmullw          mm3,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        paddw           mm1,            mm3                 ;
+
+
+        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        movd            mm3,            [rdi]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        paddw           mm6,            mm1                 ;
+
+        pmaddwd         mm1,            mm1                 ;
+        paddd           mm7,            mm1                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil4x4_var_mmx_loop       ;
+
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(6) ;sum
+        mov             rsi,            arg(7) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
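+; For reference, a sketch of the per-pixel math of the routine above
+; (and of the 8-wide variant below); fp[] is an illustrative name for
+; the first-pass row buffer, HFilter[0]/HFilter[1] denote the two taps
+; (each replicated four times in the actual word arrays), and mmx_bi_rd
+; supplies the +64 rounding term for mmx_filter_shift == 7:
+;
+;    /* first pass: horizontal two-tap bilinear filter */
+;    fp[r][c] = (ref[r * ref_pixels_per_line + c]     * HFilter[0] +
+;                ref[r * ref_pixels_per_line + c + 1] * HFilter[1] + 64) >> 7;
+;
+;    /* second pass: vertical two-tap filter across adjacent rows */
+;    pred = (fp[r][c] * VFilter[0] + fp[r + 1][c] * VFilter[1] + 64) >> 7;
+;
+;    /* accumulate variance statistics against the source block */
+;    d            = pred - src[r * src_pixels_per_line + c];
+;    *sum        += d;
+;    *sumsquared += d * d;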
+
+
+
+;void vp8_filter_block2d_bil_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_mmx)
+sym(vp8_filter_block2d_bil_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+        mov             rax,            arg(5) ;HFilter             ;
+
+        mov             rdx,            arg(6) ;VFilter             ;
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            mm0,            mm0                 ;
+        movq            mm1,            [rsi]               ;
+
+        movq            mm3,            [rsi+1]             ;
+        movq            mm2,            mm1                 ;
+
+        movq            mm4,            mm3                 ;
+        punpcklbw       mm1,            mm0                 ;
+
+        punpckhbw       mm2,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        pmullw          mm2,            [rax]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        punpckhbw       mm4,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        pmullw          mm4,            [rax+8]             ;
+        paddw           mm1,            mm3                 ;
+
+        paddw           mm2,            mm4                 ;
+        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        paddw           mm2,            [mmx_bi_rd GLOBAL]  ;
+
+        psraw           mm2,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+        packuswb        mm5,            mm2                 ;
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        add             rsi,            r8
+%endif
+
+filter_block2d_bil_var_mmx_loop:
+
+        movq            mm1,            [rsi]               ;
+        movq            mm3,            [rsi+1]             ;
+
+        movq            mm2,            mm1                 ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm1,            mm0                 ;
+        punpckhbw       mm2,            mm0                 ;
+
+        pmullw          mm1,            [rax]               ;
+        pmullw          mm2,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        pmullw          mm3,            [rax+8]             ;
+        pmullw          mm4,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        paddw           mm2,            [mmx_bi_rd GLOBAL]  ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            mm5                 ;
+        movq            mm4,            mm5                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        movq            mm5,            mm1                 ;
+        packuswb        mm5,            mm2                 ;
+
+        pmullw          mm3,            [rdx]               ;
+        pmullw          mm4,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        pmullw          mm2,            [rdx+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [mmx_bi_rd GLOBAL]  ;
+        paddw           mm2,            [mmx_bi_rd GLOBAL]  ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            [rdi]               ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        psubw           mm2,            mm4                 ;
+
+        paddw           mm6,            mm1                 ;
+        pmaddwd         mm1,            mm1                 ;
+
+        paddw           mm6,            mm2                 ;
+        pmaddwd         mm2,            mm2                 ;
+
+        paddd           mm7,            mm1                 ;
+        paddd           mm7,            mm2                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_var_mmx_loop       ;
+
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(7) ;sum
+        mov             rsi,            arg(8) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;unsigned int vp8_get16x16pred_error_mmx
+;(
+;    unsigned char *src_ptr,
+;    int src_stride,
+;    unsigned char *ref_ptr,
+;    int ref_stride
+;)
+global sym(vp8_get16x16pred_error_mmx)
+sym(vp8_get16x16pred_error_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        mov         rsi,            arg(0) ;DWORD PTR [src_ptr]
+        mov         rdi,            arg(2) ;DWORD PTR [ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]
+
+        pxor        mm0,            mm0                     ; clear mm0 for unpack
+        pxor        mm7,            mm7                     ; clear mm7 for accumulating diffs
+
+        pxor        mm6,            mm6                     ; clear mm6 for accumulating sse
+        mov         rcx,            16
+
+var16loop:
+
+        movq        mm1,            [rsi]
+        movq        mm2,            [rdi]
+
+        movq        mm3,            mm1
+        movq        mm4,            mm2
+
+        punpcklbw   mm1,            mm0
+        punpckhbw   mm3,            mm0
+
+        punpcklbw   mm2,            mm0
+        punpckhbw   mm4,            mm0
+
+        psubw       mm1,            mm2
+        psubw       mm3,            mm4
+
+        paddw       mm7,            mm1
+        pmaddwd     mm1,            mm1
+
+        paddw       mm7,            mm3
+        pmaddwd     mm3,            mm3
+
+        paddd       mm6,            mm1
+        paddd       mm6,            mm3
+
+
+        movq        mm1,            [rsi+8]
+        movq        mm2,            [rdi+8]
+
+        movq        mm3,            mm1
+        movq        mm4,            mm2
+
+        punpcklbw   mm1,            mm0
+        punpckhbw   mm3,            mm0
+
+        punpcklbw   mm2,            mm0
+        punpckhbw   mm4,            mm0
+
+        psubw       mm1,            mm2
+        psubw       mm3,            mm4
+
+        paddw       mm7,            mm1
+        pmaddwd     mm1,            mm1
+
+        paddw       mm7,            mm3
+        pmaddwd     mm3,            mm3
+
+        paddd       mm6,            mm1
+        paddd       mm6,            mm3
+
+        add         rsi,            rax
+        add         rdi,            rdx
+
+        sub         rcx,            1
+        jnz         var16loop
+
+
+        movq        mm1,            mm6
+        pxor        mm6,            mm6
+
+        pxor        mm5,            mm5
+        punpcklwd   mm6,            mm7
+
+        punpckhwd   mm5,            mm7
+        psrad       mm5,            16
+
+        psrad       mm6,            16
+        paddd       mm6,            mm5
+
+        movq        mm2,            mm1
+        psrlq       mm1,            32
+
+        paddd       mm2,            mm1
+        movq        mm7,            mm6
+
+        psrlq       mm6,            32
+        paddd       mm6,            mm7
+
+        movd DWORD PTR [rsp],       mm6  ;Sum
+        movd DWORD PTR [rsp+4],     mm2  ;SSE
+
+        ; return (SSE-((Sum*Sum)>>8));
+        movsxd      rdx, dword ptr [rsp]
+        imul        rdx, rdx
+        sar         rdx, 8
+        movsxd      rax, dword ptr [rsp + 4]
+        sub         rax, rdx
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
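+; For reference, a rough C equivalent (the name pred_error_ref is
+; illustrative), matching the "return (SSE-((Sum*Sum)>>8))" comment
+; above:
+;
+;    unsigned int pred_error_ref(unsigned char *src, int src_stride,
+;                                unsigned char *ref, int ref_stride)
+;    {
+;        int r, c, sum = 0;
+;        unsigned int sse = 0;
+;
+;        for (r = 0; r < 16; r++)
+;            for (c = 0; c < 16; c++) {
+;                int d = src[r * src_stride + c] - ref[r * ref_stride + c];
+;                sum += d;
+;                sse += d * d;
+;            }
+;
+;        return sse - ((sum * sum) >> 8);
+;    }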
+
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+    times 4 dw 64
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
new file mode 100644
index 0000000..7e5ee28
--- /dev/null
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -0,0 +1,975 @@
+;
+;  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license and patent
+;  grant that can be found in the LICENSE file in the root of the source
+;  tree. All contributing project authors may be found in the AUTHORS
+;  file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+%define xmm_filter_shift            7
+
+;unsigned int vp8_get_mb_ss_sse2
+;(
+;    short *src_ptr
+;)
+global sym(vp8_get_mb_ss_sse2)
+sym(vp8_get_mb_ss_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 1
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+
+        mov         rax, arg(0) ;[src_ptr]
+        mov         rcx, 8
+        pxor        xmm4, xmm4
+
+NEXTROW:
+        movdqa      xmm0, [rax]
+        movdqa      xmm1, [rax+16]
+        movdqa      xmm2, [rax+32]
+        movdqa      xmm3, [rax+48]
+        pmaddwd     xmm0, xmm0
+        pmaddwd     xmm1, xmm1
+        pmaddwd     xmm2, xmm2
+        pmaddwd     xmm3, xmm3
+
+        paddd       xmm0, xmm1
+        paddd       xmm2, xmm3
+        paddd       xmm4, xmm0
+        paddd       xmm4, xmm2
+
+        add         rax, 0x40
+        dec         rcx
+        ja          NEXTROW
+
+        movdqa      xmm3,xmm4
+        psrldq      xmm4,8
+        paddd       xmm4,xmm3
+        movdqa      xmm3,xmm4
+        psrldq      xmm4,4
+        paddd       xmm4,xmm3
+        movd        rax,xmm4
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;unsigned int vp8_get16x16var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp8_get16x16var_sse2)
+sym(vp8_get16x16var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+var16loop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        movdqa      xmm3,           xmm1
+        movdqa      xmm4,           xmm2
+
+
+        punpcklbw   xmm1,           xmm0
+        punpckhbw   xmm3,           xmm0
+
+        punpcklbw   xmm2,           xmm0
+        punpckhbw   xmm4,           xmm0
+
+
+        psubw       xmm1,           xmm2
+        psubw       xmm3,           xmm4
+
+        paddw       xmm7,           xmm1
+        pmaddwd     xmm1,           xmm1
+
+        paddw       xmm7,           xmm3
+        pmaddwd     xmm3,           xmm3
+
+        paddd       xmm6,           xmm1
+        paddd       xmm6,           xmm3
+
+        add         rsi,            rax
+        add         rdi,            rdx
+
+        sub         rcx,            1
+        jnz         var16loop
+
+
+        movdqa      xmm1,           xmm6
+        pxor        xmm6,           xmm6
+
+        pxor        xmm5,           xmm5
+        punpcklwd   xmm6,           xmm7
+
+        punpckhwd   xmm5,           xmm7
+        psrad       xmm5,           16
+
+        psrad       xmm6,           16
+        paddd       xmm6,           xmm5
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddd       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddd       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        mov         rax,            arg(5) ;[Sum]
+        mov         rdi,            arg(4) ;[SSE]
+
+        movd DWORD PTR [rax],       xmm7
+        movd DWORD PTR [rdi],       xmm1
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
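+
+; A minimal C sketch of the accumulation above (hypothetical scalar
+; helper): walk 16 rows of 16 pixels, summing the signed differences
+; into *Sum and the squared differences into *SSE.
+;
+;     void get16x16var_c(const unsigned char *src, int src_stride,
+;                        const unsigned char *ref, int ref_stride,
+;                        unsigned int *SSE, int *Sum)
+;     {
+;         int r, c, diff, sum = 0;
+;         unsigned int sse = 0;
+;
+;         for (r = 0; r < 16; r++, src += src_stride, ref += ref_stride)
+;             for (c = 0; c < 16; c++)
+;             {
+;                 diff = src[c] - ref[c];
+;                 sum += diff;
+;                 sse += diff * diff;
+;             }
+;
+;         *SSE = sse;
+;         *Sum = sum;
+;     }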
+
+
+;unsigned int vp8_get16x16pred_error_sse2
+;(
+;   unsigned char *src_ptr,
+;    int src_stride,
+;    unsigned char *ref_ptr,
+;    int ref_stride
+;)
+global sym(vp8_get16x16pred_error_sse2)
+sym(vp8_get16x16pred_error_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 4
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[src_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[ref_stride]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
+        mov         rcx,            16
+
+var16peloop:
+        movdqu      xmm1,           XMMWORD PTR [rsi]
+        movdqu      xmm2,           XMMWORD PTR [rdi]
+
+        movdqa      xmm3,           xmm1
+        movdqa      xmm4,           xmm2
+
+        punpcklbw   xmm1,           xmm0
+        punpckhbw   xmm3,           xmm0
+
+        punpcklbw   xmm2,           xmm0
+        punpckhbw   xmm4,           xmm0
+
+        psubw       xmm1,           xmm2
+        psubw       xmm3,           xmm4
+
+        paddw       xmm7,           xmm1
+        pmaddwd     xmm1,           xmm1
+
+        paddw       xmm7,           xmm3
+        pmaddwd     xmm3,           xmm3
+
+        paddd       xmm6,           xmm1
+        paddd       xmm6,           xmm3
+
+        add         rsi,            rax
+        add         rdi,            rdx
+
+        sub         rcx,            1
+        jnz         var16peloop
+
+
+        movdqa      xmm1,           xmm6
+        pxor        xmm6,           xmm6
+
+        pxor        xmm5,           xmm5
+        punpcklwd   xmm6,           xmm7
+
+        punpckhwd   xmm5,           xmm7
+        psrad       xmm5,           16
+
+        psrad       xmm6,           16
+        paddd       xmm6,           xmm5
+
+        movdqa      xmm2,           xmm1
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddd       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddd       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        movd DWORD PTR [rsp],       xmm7  ;Sum
+        movd DWORD PTR [rsp+4],     xmm1  ;SSE
+
+        ; return (SSE-((Sum*Sum)>>8));
+        movsxd      rdx, dword ptr [rsp]
+        imul        rdx, rdx
+        sar         rdx, 8
+        movsxd      rax, dword ptr [rsp + 4]
+        sub         rax, rdx
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
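+
+; The return value above is the variance identity scaled by the pixel
+; count N = 16 * 16 = 256:
+;
+;     N * Var = SSE - (Sum * Sum) / N = SSE - ((Sum * Sum) >> 8)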
+
+
+
+;unsigned int vp8_get8x8var_sse2
+;(
+;    unsigned char   *  src_ptr,
+;    int             source_stride,
+;    unsigned char   *  ref_ptr,
+;    int             recon_stride,
+;    unsigned int    *  SSE,
+;    int             *  Sum
+;)
+global sym(vp8_get8x8var_sse2)
+sym(vp8_get8x8var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        mov         rsi,            arg(0) ;[src_ptr]
+        mov         rdi,            arg(2) ;[ref_ptr]
+
+        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
+        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
+
+        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
+        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
+
+        movq        xmm1,           QWORD PTR [rsi]
+        movq        xmm2,           QWORD PTR [rdi]
+
+        punpcklbw   xmm1,           xmm0
+        punpcklbw   xmm2,           xmm0
+
+        psubsw      xmm1,           xmm2
+        paddw       xmm7,           xmm1
+
+        pmaddwd     xmm1,           xmm1
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        movq        xmm2,           QWORD PTR[rsi + rax * 2]
+        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+        movq        xmm2,           QWORD PTR[rsi + rax *2]
+        movq        xmm3,           QWORD PTR[rdi + rdx *2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+        movq        xmm2,           QWORD PTR[rsi + rax *2]
+        movq        xmm3,           QWORD PTR[rdi + rdx *2]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        lea         rsi,            [rsi + rax * 2]
+        lea         rdi,            [rdi + rdx * 2]
+
+        movq        xmm2,           QWORD PTR[rsi + rax]
+        movq        xmm3,           QWORD PTR[rdi + rdx]
+
+        punpcklbw   xmm2,           xmm0
+        punpcklbw   xmm3,           xmm0
+
+        psubsw      xmm2,           xmm3
+        paddw       xmm7,           xmm2
+
+        pmaddwd     xmm2,           xmm2
+        paddd       xmm1,           xmm2
+
+
+        movdqa      xmm6,           xmm7
+        punpcklwd   xmm6,           xmm0
+
+        punpckhwd   xmm7,           xmm0
+        movdqa      xmm2,           xmm1
+
+        paddw       xmm6,           xmm7
+        punpckldq   xmm1,           xmm0
+
+        punpckhdq   xmm2,           xmm0
+        movdqa      xmm7,           xmm6
+
+        paddd       xmm1,           xmm2
+        punpckldq   xmm6,           xmm0
+
+        punpckhdq   xmm7,           xmm0
+        paddw       xmm6,           xmm7
+
+        movdqa      xmm2,           xmm1
+        movdqa      xmm7,           xmm6
+
+        psrldq      xmm1,           8
+        psrldq      xmm6,           8
+
+        paddw       xmm7,           xmm6
+        paddd       xmm1,           xmm2
+
+        mov         rax,            arg(5) ;[Sum]
+        mov         rdi,            arg(4) ;[SSE]
+
+        movd        rdx,            xmm7
+        movsx       rcx,            dx
+
+        mov  dword ptr [rax],       ecx
+        movd DWORD PTR [rdi],       xmm1
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vp8_filter_block2d_bil_var_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_filter_block2d_bil_var_sse2)
+sym(vp8_filter_block2d_bil_var_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            xmm6,           xmm6                 ;
+        pxor            xmm7,           xmm7                 ;
+        mov             rax,            arg(5) ;HFilter             ;
+
+        mov             rdx,            arg(6) ;VFilter             ;
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            xmm0,           xmm0                 ;
+        movq            xmm1,           QWORD PTR [rsi]               ;
+
+        movq            xmm3,           QWORD PTR [rsi+1]        ;
+        punpcklbw       xmm1,           xmm0                 ;
+
+        pmullw          xmm1,           [rax]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+        paddw           xmm1,           xmm3                 ;
+
+        paddw           xmm1,           [xmm_bi_rd GLOBAL]   ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movdqa          xmm5,           xmm1
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rsi,            r8
+%endif
+filter_block2d_bil_var_sse2_loop:
+
+        movq            xmm1,           QWORD PTR [rsi]               ;
+        movq            xmm3,           QWORD PTR [rsi+1]             ;
+
+        punpcklbw       xmm1,           xmm0                 ;
+        pmullw          xmm1,           [rax]               ;
+
+        punpcklbw       xmm3,           xmm0                 ;
+        pmullw          xmm3,           [rax+16]             ;
+
+        paddw           xmm1,           xmm3                 ;
+        paddw           xmm1,           [xmm_bi_rd GLOBAL]   ;
+
+        psraw           xmm1,           xmm_filter_shift    ;
+        movdqa          xmm3,           xmm5                 ;
+
+        movdqa          xmm5,           xmm1                 ;
+        pmullw          xmm3,           [rdx]               ;
+
+        pmullw          xmm1,           [rdx+16]             ;
+        paddw           xmm1,           xmm3                 ;
+
+        paddw           xmm1,           [xmm_bi_rd GLOBAL]   ;
+        psraw           xmm1,           xmm_filter_shift    ;
+
+        movq            xmm3,           QWORD PTR [rdi]               ;
+        punpcklbw       xmm3,           xmm0                 ;
+
+        psubw           xmm1,           xmm3                 ;
+        paddw           xmm6,           xmm1                 ;
+
+        pmaddwd         xmm1,           xmm1                 ;
+        paddd           xmm7,           xmm1                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             filter_block2d_bil_var_sse2_loop       ;
+
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(7) ; sum
+        mov             rdi,            arg(8) ; sumsquared
+
+        movd            [rsi],          mm2    ; xsum
+        movd            [rdi],          mm4    ; xxsum
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
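+
+; A minimal C sketch of the per-row filtering above (hypothetical scalar
+; form; HFilter/VFilter each hold two 7-bit taps replicated across the
+; vector register, so tap0 lives at [rax] and tap1 at [rax+16]):
+;
+;     /* first pass: horizontal blend of each pixel with its right neighbour */
+;     h[c] = (htap0 * ref[c] + htap1 * ref[c + 1] + 64) >> 7;
+;
+;     /* second pass: vertical blend of the current and previous filtered rows */
+;     p    = (vtap0 * prev_h[c] + vtap1 * h[c] + 64) >> 7;
+;
+;     diff = p - src[c];
+;     sum += diff;
+;     sse += diff * diff;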
+
+
+;void vp8_half_horiz_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_vert_variance16x_h_sse2)
+sym(vp8_half_horiz_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
+        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal line 1
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+%else
+        add             rsi, r8
+%endif
+
+vp8_half_horiz_vert_variance16x_h_1:
+
+        movq            xmm1,           QWORD PTR [rsi]     ;
+        movq            xmm2,           QWORD PTR [rsi+1]   ;
+        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal line i+1
+
+        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of the above
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
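+
+; Note: pavgb computes a rounded half-pel average per unsigned byte, so
+; each of the horizontal and vertical blends above is
+;
+;     avg = (a + b + 1) >> 1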
+
+
+;void vp8_half_vert_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_vert_variance16x_h_sse2)
+sym(vp8_half_vert_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
+
+        pxor            xmm0,           xmm0                ;
+vp8_half_vert_variance16x_h_1:
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
+        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s0,s1,s2..s8 of next row
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) vertical average
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+
+        sub             rcx,            1                   ;
+        jnz             vp8_half_vert_variance16x_h_1          ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vp8_half_horiz_variance16x_h_sse2
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vp8_half_horiz_variance16x_h_sse2)
+sym(vp8_half_horiz_variance16x_h_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=0
+    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
+    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
+%endif
+
+        pxor            xmm6,           xmm6                ;  error accumulator
+        pxor            xmm7,           xmm7                ;  sse accumulator
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            xmm0,           xmm0                ;
+vp8_half_horiz_variance16x_h_1:
+        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
+        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
+
+        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3)
+        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
+
+        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
+        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
+
+        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
+        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
+        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
+        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
+
+%if ABI_IS_32BIT
+        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
+        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
+%else
+        add             rsi, r8
+        add             rdi, r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             vp8_half_horiz_variance16x_h_1        ;
+
+        movdq2q         mm6,            xmm6                ;
+        movdq2q         mm7,            xmm7                ;
+
+        psrldq          xmm6,           8
+        psrldq          xmm7,           8
+
+        movdq2q         mm2,            xmm6
+        movdq2q         mm3,            xmm7
+
+        paddw           mm6,            mm2
+        paddd           mm7,            mm3
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rsi,            arg(5) ; sum
+        mov             rdi,            arg(6) ; sumsquared
+
+        movd            [rsi],          mm2                 ;
+        movd            [rdi],          mm4                 ;
+
+
+    ; begin epilog
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+SECTION_RODATA
+;    short xmm_bi_rd[8] = { 64, 64, 64, 64, 64, 64, 64, 64 };
+align 16
+xmm_bi_rd:
+    times 8 dw 64
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
new file mode 100644
index 0000000..4a5b25b
--- /dev/null
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -0,0 +1,596 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx
+(
+    unsigned char *src_ptr,
+    unsigned short *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    short *vp7_filter
+);
+extern void filter_block1d_v6_mmx
+(
+    short *src_ptr,
+    unsigned char *output_ptr,
+    unsigned int pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    short *vp7_filter
+);
+
+extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
+extern unsigned int vp8_get8x8var_mmx
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *SSE,
+    int *Sum
+);
+extern unsigned int vp8_get4x4var_mmx
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *SSE,
+    int *Sum
+);
+extern unsigned int vp8_get4x4sse_cs_mmx
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride
+);
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    const short *HFilter,
+    const short *VFilter,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern void vp8_filter_block2d_bil_var_mmx
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    const short *HFilter,
+    const short *VFilter,
+    int *sum,
+    unsigned int *sumsquared
+);
+extern unsigned int vp8_get16x16pred_error_mmx
+(
+    unsigned char *src_ptr,
+    int src_stride,
+    unsigned char *ref_ptr,
+    int ref_stride
+);
+
+
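+/* Debug scaffolding: x holds the MMX sum of squares and s the scalar
+ * reference; nothing is asserted or returned, so the two values are
+ * presumably compared under a debugger (x += 0 is a breakpoint spot). */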
+void vp8_test_get_mb_ss(void)
+{
+    short zz[] =
+    {
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+        -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
+    };
+    int s = 0, x = vp8_get_mb_ss_mmx(zz);
+    {
+        int y;
+
+        for (y = 0; y < 256; y++)
+            s += (zz[y] * zz[y]);
+    }
+
+    x += 0;
+}
+
+
+unsigned int vp8_get16x16var_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned *SSE,
+    unsigned *SUM
+)
+{
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
+
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+
+    *SSE = var;
+    *SUM = avg;
+    return (var - ((avg * avg) >> 8));
+
+}
+
+
+
+
+
+unsigned int vp8_variance4x4_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+    *sse = var;
+    return (var - ((avg * avg) >> 4));
+
+}
+
+unsigned int vp8_variance8x8_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int var;
+    int avg;
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+    *sse = var;
+
+    return (var - ((avg * avg) >> 6));
+
+}
+
+unsigned int vp8_mse16x16_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3;
+
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    *sse = var;
+    return var;
+}
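+
+/* MSE is the raw SSE of the block: unlike the variance functions, no
+ * Sum^2 / N mean correction is subtracted. */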
+
+
+unsigned int vp8_variance16x16_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
+
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+    *sse = var;
+    return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_variance16x8_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+
+}
+
+
+unsigned int vp8_variance8x16_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+
+    return (var - ((avg * avg) >> 7));
+
+}
+
+
+
+
+///////////////////////////////////////////////////////////////////////////
+// the mmx function that does the bilinear filtering and var calculation //
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
+{
+    { 128, 128, 128, 128,  0,  0,  0,  0 },
+    { 112, 112, 112, 112, 16, 16, 16, 16 },
+    {  96, 96, 96, 96, 32, 32, 32, 32 },
+    {  80, 80, 80, 80, 48, 48, 48, 48 },
+    {  64, 64, 64, 64, 64, 64, 64, 64 },
+    {  48, 48, 48, 48, 80, 80, 80, 80 },
+    {  32, 32, 32, 32, 96, 96, 96, 96 },
+    {  16, 16, 16, 16, 112, 112, 112, 112 }
+};
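+
+/* Each row above holds the two bilinear taps for one eighth-pel offset,
+ * replicated four times each, and the taps always sum to 128 (7 bits).
+ * A pixel pair is therefore interpolated as (minimal scalar sketch,
+ * with hypothetical tap names f0 and f1):
+ *
+ *     out = (f0 * p0 + f1 * p1 + 64) >> 7;
+ */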
+
+unsigned int vp8_sub_pixel_variance4x4_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse)
+
+{
+    int xsum;
+    unsigned int xxsum;
+    vp8_filter_block2d_bil4x4_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum, &xxsum
+    );
+    *sse = xxsum;
+    return (xxsum - ((xsum * xsum) >> 4));
+}
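+
+/* The final shift is log2 of the pixel count: a 4x4 block has N = 16
+ * pixels, so (xsum * xsum) >> 4 is Sum^2 / N in the scaled variance
+ * identity N * Var = SSE - Sum^2 / N. The 8x8, 16x8/8x16 and 16x16
+ * variants use >> 6, >> 7 and >> 8 for the same reason. */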
+
+
+unsigned int vp8_sub_pixel_variance8x8_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+
+    int xsum;
+    unsigned int xxsum;
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum, &xxsum
+    );
+    *sse = xxsum;
+    return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 16,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum0, &xxsum0
+    );
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + 8, src_pixels_per_line,
+        dst_ptr + 8, dst_pixels_per_line, 16,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+
+
+}
+
+unsigned int vp8_sub_pixel_mse16x16_mmx(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+    return *sse;
+}
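+
+/* The sub-pixel MSE reuses the sub-pixel variance kernel for its SSE
+ * side effect and discards the variance it returns. */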
+
+unsigned int vp8_sub_pixel_variance16x8_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum0, &xxsum0
+    );
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + 8, src_pixels_per_line,
+        dst_ptr + 8, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum;
+    unsigned int xxsum;
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 16,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum, &xxsum
+    );
+    *sse = xxsum;
+    return (xxsum - ((xsum * xsum) >> 7));
+}
+
+unsigned int vp8_i_variance16x16_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
+
+
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
+    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+    *sse = var;
+    return (var - ((avg * avg) >> 8));
+
+}
+
+unsigned int vp8_i_variance8x16_mmx(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_i_sub_pixel_variance16x16_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+    int f2soffset = (src_pixels_per_line >> 1);
+    int f2doffset = (dst_pixels_per_line >> 1);
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum0, &xxsum0
+    );
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + 8, src_pixels_per_line,
+        dst_ptr + 8, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + f2soffset, src_pixels_per_line,
+        dst_ptr + f2doffset, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + f2soffset + 8, src_pixels_per_line,
+        dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
+
+
+unsigned int vp8_i_sub_pixel_variance8x16_mmx
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+    int f2soffset = (src_pixels_per_line >> 1);
+    int f2doffset = (dst_pixels_per_line >> 1);
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum0, &xxsum0
+    );
+
+
+    vp8_filter_block2d_bil_var_mmx(
+        src_ptr + f2soffset, src_pixels_per_line,
+        dst_ptr + f2doffset, dst_pixels_per_line, 8,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
diff --git a/vp8/encoder/x86/variance_sse2.c b/vp8/encoder/x86/variance_sse2.c
new file mode 100644
index 0000000..ea80753
--- /dev/null
+++ b/vp8/encoder/x86/variance_sse2.c
@@ -0,0 +1,514 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "variance.h"
+#include "pragmas.h"
+#include "vpx_ports/mem.h"
+
+extern void filter_block1d_h6_mmx(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d_v6_mmx(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_h6_sse2(unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+extern void filter_block1d8_v6_sse2(short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
+
+extern void vp8_filter_block2d_bil4x4_var_mmx
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    const short *HFilter,
+    const short *VFilter,
+    int *sum,
+    unsigned int *sumsquared
+);
+
+extern unsigned int vp8_get4x4var_mmx
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *SSE,
+    int *Sum
+);
+
+unsigned int vp8_get_mb_ss_sse2
+(
+    short *src_ptr
+);
+unsigned int vp8_get16x16var_sse2
+(
+    unsigned char     *src_ptr,
+    int             source_stride,
+    unsigned char     *ref_ptr,
+    int             recon_stride,
+    unsigned int      *SSE,
+    int               *Sum
+);
+unsigned int vp8_get16x16pred_error_sse2
+(
+    unsigned char *src_ptr,
+    int src_stride,
+    unsigned char *ref_ptr,
+    int ref_stride
+);
+unsigned int vp8_get8x8var_sse2
+(
+    unsigned char     *src_ptr,
+    int             source_stride,
+    unsigned char     *ref_ptr,
+    int             recon_stride,
+    unsigned int      *SSE,
+    int               *Sum
+);
+void vp8_filter_block2d_bil_var_sse2
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    const short *HFilter,
+    const short *VFilter,
+    int *sum,
+    unsigned int *sumsquared
+);
+void vp8_half_horiz_vert_variance16x_h_sse2
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+void vp8_half_horiz_variance16x_h_sse2
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+void vp8_half_vert_variance16x_h_sse2
+(
+    unsigned char *ref_ptr,
+    int ref_pixels_per_line,
+    unsigned char *src_ptr,
+    int src_pixels_per_line,
+    unsigned int Height,
+    int *sum,
+    unsigned int *sumsquared
+);
+
+DECLARE_ALIGNED(16, extern short, vp8_vp7_bilinear_filters_mmx[8][8]);
+
+unsigned int vp8_variance4x4_wmt(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride)
+{
+    unsigned int var;
+    int avg;
+
+    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+    return (var - ((avg * avg) >> 4));
+
+}
+
+
+
+unsigned int vp8_variance8x8_wmt
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride)
+{
+    unsigned int var;
+    int avg;
+
+    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
+
+    return (var - ((avg * avg) >> 6));
+
+}
+
+
+unsigned int vp8_variance16x16_wmt
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0;
+    int sum0;
+
+
+    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    *sse = sse0;
+    return (sse0 - ((sum0 * sum0) >> 8));
+}
+unsigned int vp8_mse16x16_wmt(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+
+    unsigned int sse0;
+    int sum0;
+    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    *sse = sse0;
+    return sse0;
+
+}
+
+
+unsigned int vp8_variance16x8_wmt
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+
+}
+
+unsigned int vp8_variance8x16_wmt
+(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
+    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+
+}
+
+///////////////////////////////////////////////////////////////////////////
+// the sse2 function that does the bilinear filtering and var calculation//
+// in one pass                                                           //
+///////////////////////////////////////////////////////////////////////////
+DECLARE_ALIGNED(16, const short, vp8_bilinear_filters_xmm[8][16]) =
+{
+    { 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0 },
+    { 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 },
+    {  96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 },
+    {  80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 },
+    {  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 },
+    {  48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 },
+    {  32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 },
+    {  16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 }
+};
+unsigned int vp8_sub_pixel_variance4x4_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum;
+    unsigned int xxsum;
+    vp8_filter_block2d_bil4x4_var_mmx(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line,
+        vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
+        &xsum, &xxsum
+    );
+    *sse = xxsum;
+    return (xxsum - ((xsum * xsum) >> 4));
+}
+
+
+unsigned int vp8_sub_pixel_variance8x8_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+
+    int xsum;
+    unsigned int xxsum;
+    vp8_filter_block2d_bil_var_sse2(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 8,
+        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+        &xsum, &xxsum
+    );
+
+    *sse = xxsum;
+    return (xxsum - ((xsum * xsum) >> 6));
+}
+
+unsigned int vp8_sub_pixel_variance16x16_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+
+    // note: these if statements could be avoided if the calling function
+    // just called the appropriate half-pel function directly.
+    if (xoffset == 4 && yoffset == 0)
+    {
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_horiz_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 0 && yoffset == 4)
+    {
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+    }
+    else if (xoffset == 4 && yoffset == 4)
+    {
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            &xsum0, &xxsum0);
+
+        vp8_half_horiz_vert_variance16x_h_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            &xsum1, &xxsum1);
+    }
+    else
+    {
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr, src_pixels_per_line,
+            dst_ptr, dst_pixels_per_line, 16,
+            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            &xsum0, &xxsum0
+        );
+
+
+        vp8_filter_block2d_bil_var_sse2(
+            src_ptr + 8, src_pixels_per_line,
+            dst_ptr + 8, dst_pixels_per_line, 16,
+            vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+            &xsum1, &xxsum1
+        );
+    }
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 8));
+}
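+
+/* xoffset/yoffset index the eighth-pel filter table above; offset 4 is
+ * exactly half-pel, which is why the branches can fall back to the
+ * cheaper pavgb-based half-pixel kernels instead of the general
+ * bilinear filter. */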
+
+unsigned int vp8_sub_pixel_mse16x16_wmt(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    vp8_sub_pixel_variance16x16_wmt(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
+    return *sse;
+}
+
+unsigned int vp8_sub_pixel_variance16x8_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vp8_filter_block2d_bil_var_sse2(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 8,
+        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+        &xsum0, &xxsum0
+    );
+
+    vp8_filter_block2d_bil_var_sse2(
+        src_ptr + 8, src_pixels_per_line,
+        dst_ptr + 8, dst_pixels_per_line, 8,
+        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+        &xsum1, &xxsum1
+    );
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - ((xsum0 * xsum0) >> 7));
+}
+
+unsigned int vp8_sub_pixel_variance8x16_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    int xsum;
+    unsigned int xxsum;
+    vp8_filter_block2d_bil_var_sse2(
+        src_ptr, src_pixels_per_line,
+        dst_ptr, dst_pixels_per_line, 16,
+        vp8_bilinear_filters_xmm[xoffset], vp8_bilinear_filters_xmm[yoffset],
+        &xsum, &xxsum
+    );
+
+    *sse = xxsum;
+    return (xxsum - ((xsum * xsum) >> 7));
+}
+
+unsigned int vp8_i_variance16x16_wmt(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
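+
+    /* Accumulate four 8x8 quadrants. The "i" variants appear to target
+       field-based buffers whose strides arrive doubled, so an offset of
+       half a stride reaches the lower half of the block. */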
+
+    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
+    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2);
+    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+
+    *sse = var;
+    return (var - ((avg * avg) >> 8));
+}
+
+unsigned int vp8_i_variance8x16_wmt(
+    unsigned char *src_ptr,
+    int  source_stride,
+    unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0);
+    vp8_get8x8var_sse2(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+
+    *sse = var;
+    return (var - ((avg * avg) >> 7));
+}
+
+unsigned int vp8_i_sub_pixel_variance16x16_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
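+    /* Strides arrive doubled for the field-based case; halve them and defer
+       to the progressive 16x16 routine. */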
+    return vp8_sub_pixel_variance16x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
+
+unsigned int vp8_i_sub_pixel_variance8x16_wmt
+(
+    unsigned char  *src_ptr,
+    int  src_pixels_per_line,
+    int  xoffset,
+    int  yoffset,
+    unsigned char *dst_ptr,
+    int dst_pixels_per_line,
+    unsigned int *sse
+)
+{
+    return vp8_sub_pixel_variance8x16_wmt(src_ptr, (src_pixels_per_line >> 1), xoffset, yoffset, dst_ptr, (dst_pixels_per_line >> 1), sse);
+}
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
new file mode 100644
index 0000000..35fc90c
--- /dev/null
+++ b/vp8/encoder/x86/variance_x86.h
@@ -0,0 +1,275 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#ifndef VARIANCE_X86_H
+#define VARIANCE_X86_H
+
+
+/* Note:
+ *
+ * This platform is commonly built for runtime CPU detection. If you modify
+ * any of the function mappings present in this file, be sure to also update
+ * them in the function pointer initialization code
+ * (vp8_arch_x86_encoder_init() in vp8/encoder/x86/x86_csystemdependent.c).
+ */
+#if HAVE_MMX
+extern prototype_sad(vp8_sad4x4_mmx);
+extern prototype_sad(vp8_sad8x8_mmx);
+extern prototype_sad(vp8_sad8x16_mmx);
+extern prototype_sad(vp8_sad16x8_mmx);
+extern prototype_sad(vp8_sad16x16_mmx);
+extern prototype_variance(vp8_variance4x4_mmx);
+extern prototype_variance(vp8_variance8x8_mmx);
+extern prototype_variance(vp8_variance8x16_mmx);
+extern prototype_variance(vp8_variance16x8_mmx);
+extern prototype_variance(vp8_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_mmx);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_mmx);
+extern prototype_getmbss(vp8_get_mb_ss_mmx);
+extern prototype_variance(vp8_mse16x16_mmx);
+extern prototype_sad(vp8_get16x16pred_error_mmx);
+extern prototype_variance2(vp8_get8x8var_mmx);
+extern prototype_variance2(vp8_get16x16var_mmx);
+extern prototype_sad(vp8_get4x4sse_cs_mmx);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_mmx
+
+#undef  vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_mmx
+
+#undef  vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_mmx
+
+#undef  vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_mmx
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_mmx
+
+#undef  vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_mmx
+
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_mmx
+
+#undef  vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_mmx
+
+#undef  vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_mmx
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_mmx
+
+#undef  vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_mmx
+
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_mmx
+
+#undef  vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_mmx
+
+#undef  vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_mmx
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_mmx
+
+#undef  vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_mmx
+
+#undef  vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_mmx
+
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_mmx
+
+#undef  vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
+
+#undef  vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_mmx
+
+#undef  vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_mmx
+
+#undef  vp8_variance_get4x4sse_cs
+#define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
+
+#endif
+#endif
+
+
+#if HAVE_SSE2
+extern prototype_sad(vp8_sad4x4_wmt);
+extern prototype_sad(vp8_sad8x8_wmt);
+extern prototype_sad(vp8_sad8x16_wmt);
+extern prototype_sad(vp8_sad16x8_wmt);
+extern prototype_sad(vp8_sad16x16_wmt);
+extern prototype_variance(vp8_variance4x4_wmt);
+extern prototype_variance(vp8_variance8x8_wmt);
+extern prototype_variance(vp8_variance8x16_wmt);
+extern prototype_variance(vp8_variance16x8_wmt);
+extern prototype_variance(vp8_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance4x4_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_wmt);
+extern prototype_subpixvariance(vp8_sub_pixel_mse16x16_wmt);
+extern prototype_getmbss(vp8_get_mb_ss_sse2);
+extern prototype_variance(vp8_mse16x16_wmt);
+extern prototype_sad(vp8_get16x16pred_error_sse2);
+extern prototype_variance2(vp8_get8x8var_sse2);
+extern prototype_variance2(vp8_get16x16var_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_variance_sad4x4
+#define vp8_variance_sad4x4 vp8_sad4x4_wmt
+
+#undef  vp8_variance_sad8x8
+#define vp8_variance_sad8x8 vp8_sad8x8_wmt
+
+#undef  vp8_variance_sad8x16
+#define vp8_variance_sad8x16 vp8_sad8x16_wmt
+
+#undef  vp8_variance_sad16x8
+#define vp8_variance_sad16x8 vp8_sad16x8_wmt
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_wmt
+
+#undef  vp8_variance_var4x4
+#define vp8_variance_var4x4 vp8_variance4x4_wmt
+
+#undef  vp8_variance_var8x8
+#define vp8_variance_var8x8 vp8_variance8x8_wmt
+
+#undef  vp8_variance_var8x16
+#define vp8_variance_var8x16 vp8_variance8x16_wmt
+
+#undef  vp8_variance_var16x8
+#define vp8_variance_var16x8 vp8_variance16x8_wmt
+
+#undef  vp8_variance_var16x16
+#define vp8_variance_var16x16 vp8_variance16x16_wmt
+
+#undef  vp8_variance_subpixvar4x4
+#define vp8_variance_subpixvar4x4 vp8_sub_pixel_variance4x4_wmt
+
+#undef  vp8_variance_subpixvar8x8
+#define vp8_variance_subpixvar8x8 vp8_sub_pixel_variance8x8_wmt
+
+#undef  vp8_variance_subpixvar8x16
+#define vp8_variance_subpixvar8x16 vp8_sub_pixel_variance8x16_wmt
+
+#undef  vp8_variance_subpixvar16x8
+#define vp8_variance_subpixvar16x8 vp8_sub_pixel_variance16x8_wmt
+
+#undef  vp8_variance_subpixvar16x16
+#define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_wmt
+
+#undef  vp8_variance_subpixmse16x16
+#define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_wmt
+
+#undef  vp8_variance_getmbss
+#define vp8_variance_getmbss vp8_get_mb_ss_sse2
+
+#undef  vp8_variance_mse16x16
+#define vp8_variance_mse16x16 vp8_mse16x16_wmt
+
+#undef  vp8_variance_get16x16prederror
+#define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
+
+#undef  vp8_variance_get8x8var
+#define vp8_variance_get8x8var vp8_get8x8var_sse2
+
+#undef  vp8_variance_get16x16var
+#define vp8_variance_get16x16var vp8_get16x16var_sse2
+
+#endif
+#endif
+
+
+#if HAVE_SSE3
+extern prototype_sad(vp8_sad16x16_sse3);
+extern prototype_sad(vp8_sad16x8_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x16x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad8x8x3_sse3);
+extern prototype_sad_multi_same_address(vp8_sad4x4x3_sse3);
+
+extern prototype_sad_multi_dif_address(vp8_sad16x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad16x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x16x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad8x8x4d_sse3);
+extern prototype_sad_multi_dif_address(vp8_sad4x4x4d_sse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+
+#undef  vp8_variance_sad16x16
+#define vp8_variance_sad16x16 vp8_sad16x16_sse3
+
+#undef  vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_sse3
+
+#undef  vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_sse3
+
+#undef  vp8_variance_sad8x16x3
+#define vp8_variance_sad8x16x3 vp8_sad8x16x3_sse3
+
+#undef  vp8_variance_sad8x8x3
+#define vp8_variance_sad8x8x3 vp8_sad8x8x3_sse3
+
+#undef  vp8_variance_sad4x4x3
+#define vp8_variance_sad4x4x3 vp8_sad4x4x3_sse3
+
+#undef  vp8_variance_sad16x16x4d
+#define vp8_variance_sad16x16x4d vp8_sad16x16x4d_sse3
+
+#undef  vp8_variance_sad16x8x4d
+#define vp8_variance_sad16x8x4d vp8_sad16x8x4d_sse3
+
+#undef  vp8_variance_sad8x16x4d
+#define vp8_variance_sad8x16x4d vp8_sad8x16x4d_sse3
+
+#undef  vp8_variance_sad8x8x4d
+#define vp8_variance_sad8x8x4d vp8_sad8x8x4d_sse3
+
+#undef  vp8_variance_sad4x4x4d
+#define vp8_variance_sad4x4x4d vp8_sad4x4x4d_sse3
+
+#endif
+#endif
+
+
+#if HAVE_SSSE3
+extern prototype_sad_multi_same_address(vp8_sad16x16x3_ssse3);
+extern prototype_sad_multi_same_address(vp8_sad16x8x3_ssse3);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+#undef  vp8_variance_sad16x16x3
+#define vp8_variance_sad16x16x3 vp8_sad16x16x3_ssse3
+
+#undef  vp8_variance_sad16x8x3
+#define vp8_variance_sad16x8x3 vp8_sad16x8x3_ssse3
+
+#endif
+#endif
+
+#endif
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
new file mode 100644
index 0000000..f1391ba
--- /dev/null
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -0,0 +1,287 @@
+/*
+ *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license and patent
+ *  grant that can be found in the LICENSE file in the root of the source
+ *  tree. All contributing project authors may be found in the AUTHORS
+ *  file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "vpx_ports/x86.h"
+#include "variance.h"
+#include "onyx_int.h"
+
+
+#if HAVE_MMX
+void vp8_short_fdct8x4_mmx(short *input, short *output, int pitch)
+{
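+    /* An 8x4 forward DCT is two 4x4 transforms side by side: columns 0-3
+       and 4-7, with the second block's 16 coefficients following the first
+       in the output. */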
+    vp8_short_fdct4x4_mmx(input,   output,    pitch);
+    vp8_short_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+void vp8_fast_fdct8x4_mmx(short *input, short *output, int pitch)
+{
+    vp8_fast_fdct4x4_mmx(input,   output   , pitch);
+    vp8_fast_fdct4x4_mmx(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
+                                 short *qcoeff_ptr, short *dequant_ptr,
+                                 short *scan_mask, short *round_ptr,
+                                 short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_mmx(BLOCK *b, BLOCKD *d)
+{
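+    /* Unpack the BLOCK/BLOCKD fields into the raw pointers the assembly
+       implementation expects; it returns the end-of-block index. */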
+    short *scan_mask  = vp8_default_zig_zag_mask; // d->scan_order_mask_ptr
+    short *coeff_ptr  = &b->coeff[0];
+    short *zbin_ptr   = &b->zbin[0][0];
+    short *round_ptr  = &b->round[0][0];
+    short *quant_ptr  = &b->quant[0][0];
+    short *qcoeff_ptr = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = &d->dequant[0][0];
+
+    d->eob = vp8_fast_quantize_b_impl_mmx(
+                 coeff_ptr,
+                 zbin_ptr,
+                 qcoeff_ptr,
+                 dequant_ptr,
+                 scan_mask,
+                 round_ptr,
+                 quant_ptr,
+                 dqcoeff_ptr
+             );
+}
+
+int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_mmx(MACROBLOCK *mb, int dc)
+{
+    short *coeff_ptr =  mb->block[0].coeff;
+    short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+    return vp8_mbblock_error_mmx_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_mmx(MACROBLOCK *mb)
+{
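+    /* Coefficients 0-255 are the 16 luma blocks (16 x 16); the chroma (U
+       and V) blocks start at offset 256. */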
+    short *s_ptr = &mb->coeff[256];
+    short *d_ptr = &mb->e_mbd.dqcoeff[256];
+    return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
+}
+
+void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
+                             short *diff, unsigned char *predictor,
+                             int pitch);
+void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
+{
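+    /* base_src holds a pointer to the source buffer and src is this block's
+       offset into it, giving the block's top-left source pixel. */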
+    unsigned char *z = *(be->base_src) + be->src;
+    int src_stride = be->src_stride;
+    short *diff = &be->src_diff[0];
+    unsigned char *predictor = &bd->predictor[0];
+    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
+}
+
+#endif
+
+#if HAVE_SSE2
+void vp8_short_fdct8x4_wmt(short *input, short *output, int pitch)
+{
+    vp8_short_fdct4x4_wmt(input,   output,    pitch);
+    vp8_short_fdct4x4_wmt(input + 4, output + 16, pitch);
+}
+
+int vp8_fast_quantize_b_impl_sse(short *coeff_ptr, short *zbin_ptr,
+                                 short *qcoeff_ptr, short *dequant_ptr,
+                                 short *scan_mask, short *round_ptr,
+                                 short *quant_ptr, short *dqcoeff_ptr);
+void vp8_fast_quantize_b_sse(BLOCK *b, BLOCKD *d)
+{
+    short *scan_mask  = vp8_default_zig_zag_mask; // d->scan_order_mask_ptr
+    short *coeff_ptr  = &b->coeff[0];
+    short *zbin_ptr   = &b->zbin[0][0];
+    short *round_ptr  = &b->round[0][0];
+    short *quant_ptr  = &b->quant[0][0];
+    short *qcoeff_ptr = d->qcoeff;
+    short *dqcoeff_ptr = d->dqcoeff;
+    short *dequant_ptr = &d->dequant[0][0];
+
+    d->eob = vp8_fast_quantize_b_impl_sse(
+                 coeff_ptr,
+                 zbin_ptr,
+                 qcoeff_ptr,
+                 dequant_ptr,
+                 scan_mask,
+                 round_ptr,
+                 quant_ptr,
+                 dqcoeff_ptr
+             );
+}
+
+int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
+int vp8_mbblock_error_xmm(MACROBLOCK *mb, int dc)
+{
+    short *coeff_ptr =  mb->block[0].coeff;
+    short *dcoef_ptr =  mb->e_mbd.block[0].dqcoeff;
+    return vp8_mbblock_error_xmm_impl(coeff_ptr, dcoef_ptr, dc);
+}
+
+int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
+int vp8_mbuverror_xmm(MACROBLOCK *mb)
+{
+    short *s_ptr = &mb->coeff[256];
+    short *d_ptr = &mb->e_mbd.dqcoeff[256];
+    return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
+}
+
+#endif
+
+void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
+{
+#if CONFIG_RUNTIME_CPU_DETECT
+    int flags = x86_simd_caps();
+    int mmx_enabled = flags & HAS_MMX;
+    int xmm_enabled = flags & HAS_SSE;
+    int wmt_enabled = flags & HAS_SSE2;
+    int SSE3Enabled = flags & HAS_SSE3;
+    int SSSE3Enabled = flags & HAS_SSSE3;
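+
+    /* Each HAS_* bit reported by x86_simd_caps() gates one tier of the
+       specialized functions installed below; later tiers override earlier
+       ones where they overlap. */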
+
+    /* Note:
+     *
+     * This platform can be built without runtime CPU detection as well. If
+     * you modify any of the function mappings present in this file, be sure
+     * to also update them in the static mappings (<arch>/filename_<arch>.h)
+     */
+
+    /* Override default functions with fastest ones for this CPU. */
+#if HAVE_MMX
+
+    if (mmx_enabled)
+    {
+        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_mmx;
+        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_mmx;
+        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_mmx;
+        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_mmx;
+        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_mmx;
+
+        cpi->rtcd.variance.var4x4                = vp8_variance4x4_mmx;
+        cpi->rtcd.variance.var8x8                = vp8_variance8x8_mmx;
+        cpi->rtcd.variance.var8x16               = vp8_variance8x16_mmx;
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_mmx;
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_mmx;
+
+        cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_mmx;
+        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_mmx;
+        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_mmx;
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_mmx;
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_mmx;
+        cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_mmx;
+
+        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_mmx;
+        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_mmx;
+
+        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_mmx;
+        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
+        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
+        cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
+
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_mmx;
+        cpi->rtcd.fdct.fast4x4                   = vp8_fast_fdct4x4_mmx;
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_mmx;
+        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_c;
+
+        cpi->rtcd.encodemb.berr                  = vp8_block_error_mmx;
+        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_mmx;
+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_mmx;
+        cpi->rtcd.encodemb.subb                  = vp8_subtract_b_mmx;
+        cpi->rtcd.encodemb.submby                = vp8_subtract_mby_mmx;
+        cpi->rtcd.encodemb.submbuv               = vp8_subtract_mbuv_mmx;
+
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_mmx;
+    }
+
+#endif
+#if HAVE_SSE2
+
+    if (wmt_enabled)
+    {
+        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_wmt;
+        cpi->rtcd.variance.sad16x8               = vp8_sad16x8_wmt;
+        cpi->rtcd.variance.sad8x16               = vp8_sad8x16_wmt;
+        cpi->rtcd.variance.sad8x8                = vp8_sad8x8_wmt;
+        cpi->rtcd.variance.sad4x4                = vp8_sad4x4_wmt;
+
+        cpi->rtcd.variance.var4x4                = vp8_variance4x4_wmt;
+        cpi->rtcd.variance.var8x8                = vp8_variance8x8_wmt;
+        cpi->rtcd.variance.var8x16               = vp8_variance8x16_wmt;
+        cpi->rtcd.variance.var16x8               = vp8_variance16x8_wmt;
+        cpi->rtcd.variance.var16x16              = vp8_variance16x16_wmt;
+
+        cpi->rtcd.variance.subpixvar4x4          = vp8_sub_pixel_variance4x4_wmt;
+        cpi->rtcd.variance.subpixvar8x8          = vp8_sub_pixel_variance8x8_wmt;
+        cpi->rtcd.variance.subpixvar8x16         = vp8_sub_pixel_variance8x16_wmt;
+        cpi->rtcd.variance.subpixvar16x8         = vp8_sub_pixel_variance16x8_wmt;
+        cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_wmt;
+        cpi->rtcd.variance.subpixmse16x16        = vp8_sub_pixel_mse16x16_wmt;
+
+        cpi->rtcd.variance.mse16x16              = vp8_mse16x16_wmt;
+        cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_sse2;
+
+        cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;
+        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;
+        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;
+        /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */
+
+#if 0
+        /* short SSE2 DCT currently disabled, does not match the MMX version */
+        cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_wmt;
+        cpi->rtcd.fdct.short8x4                  = vp8_short_fdct8x4_wmt;
+#endif
+        /* cpi->rtcd.fdct.fast4x4 not implemented for wmt */
+        cpi->rtcd.fdct.fast8x4                   = vp8_fast_fdct8x4_wmt;
+        cpi->rtcd.fdct.walsh_short4x4            = vp8_short_walsh4x4_sse2;
+
+        cpi->rtcd.encodemb.berr                  = vp8_block_error_xmm;
+        cpi->rtcd.encodemb.mberr                 = vp8_mbblock_error_xmm;
+        cpi->rtcd.encodemb.mbuverr               = vp8_mbuverror_xmm;
+        /* cpi->rtcd.encodemb.sub* not implemented for wmt */
+
+        cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse;
+    }
+
+#endif
+#if HAVE_SSE3
+
+    if (SSE3Enabled)
+    {
+        cpi->rtcd.variance.sad16x16              = vp8_sad16x16_sse3;
+        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_sse3;
+        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_sse3;
+        cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_sse3;
+        cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_sse3;
+        cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_sse3;
+        cpi->rtcd.search.full_search             = vp8_full_search_sadx3;
+
+        cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_sse3;
+        cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_sse3;
+        cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_sse3;
+        cpi->rtcd.variance.sad8x8x4d             = vp8_sad8x8x4d_sse3;
+        cpi->rtcd.variance.sad4x4x4d             = vp8_sad4x4x4d_sse3;
+        cpi->rtcd.search.diamond_search          = vp8_diamond_search_sadx4;
+    }
+
+#endif
+#if HAVE_SSSE3
+
+    if (SSSE3Enabled)
+    {
+        cpi->rtcd.variance.sad16x16x3            = vp8_sad16x16x3_ssse3;
+        cpi->rtcd.variance.sad16x8x3             = vp8_sad16x8x3_ssse3;
+    }
+
+#endif
+#endif
+}