Merge "Skip computation of distortion in vp8_pick_inter_mode if active_map is used"
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c
index 376707e..d732317 100644
--- a/vp8/common/alloccommon.c
+++ b/vp8/common/alloccommon.c
@@ -186,7 +186,7 @@
 void vp8_create_common(VP8_COMMON *oci)
 {
     vp8_machine_specific_config(oci);
-    vp8_default_coef_probs(oci);
+
     vp8_init_mbmode_probs(oci);
     vp8_default_bmode_probs(oci->fc.bmode_prob);
 
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 9615523..61c21d1 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -202,6 +202,7 @@
 
     /* 16 Y blocks, 4 U, 4 V, 1 DC 2nd order block, each with 16 entries. */
     BLOCKD block[25];
+    int fullpixel_mask;
 
     YV12_BUFFER_CONFIG pre; /* Filtered copy of previous frame reconstruction */
     YV12_BUFFER_CONFIG dst;
@@ -283,20 +284,4 @@
 extern void vp8_build_block_doffsets(MACROBLOCKD *x);
 extern void vp8_setup_block_dptrs(MACROBLOCKD *x);
 
-static void update_blockd_bmi(MACROBLOCKD *xd)
-{
-    int i;
-    int is_4x4;
-    is_4x4 = (xd->mode_info_context->mbmi.mode == SPLITMV) ||
-              (xd->mode_info_context->mbmi.mode == B_PRED);
-
-    if (is_4x4)
-    {
-        for (i = 0; i < 16; i++)
-        {
-            xd->block[i].bmi = xd->mode_info_context->bmi[i];
-        }
-    }
-}
-
 #endif  /* __INC_BLOCKD_H */
diff --git a/vp8/common/default_coef_probs.h b/vp8/common/default_coef_probs.h
new file mode 100755
index 0000000..0d19563
--- /dev/null
+++ b/vp8/common/default_coef_probs.h
@@ -0,0 +1,188 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+*/
+
+
+/* Generated file: default coefficient probability table, included by entropy.c */
+
+
+static const vp8_prob default_coef_probs [BLOCK_TYPES]
+                                         [COEF_BANDS]
+                                         [PREV_COEF_CONTEXTS]
+                                         [ENTROPY_NODES] =
+{
+    { /* Block Type ( 0 ) */
+        { /* Coeff Band ( 0 )*/
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            { 253, 136, 254, 255, 228, 219, 128, 128, 128, 128, 128 },
+            { 189, 129, 242, 255, 227, 213, 255, 219, 128, 128, 128 },
+            { 106, 126, 227, 252, 214, 209, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  98, 248, 255, 236, 226, 255, 255, 128, 128, 128 },
+            { 181, 133, 238, 254, 221, 234, 255, 154, 128, 128, 128 },
+            {  78, 134, 202, 247, 198, 180, 255, 219, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 185, 249, 255, 243, 255, 128, 128, 128, 128, 128 },
+            { 184, 150, 247, 255, 236, 224, 128, 128, 128, 128, 128 },
+            {  77, 110, 216, 255, 236, 230, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1, 101, 251, 255, 241, 255, 128, 128, 128, 128, 128 },
+            { 170, 139, 241, 252, 236, 209, 255, 255, 128, 128, 128 },
+            {  37, 116, 196, 243, 228, 255, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 204, 254, 255, 245, 255, 128, 128, 128, 128, 128 },
+            { 207, 160, 250, 255, 238, 128, 128, 128, 128, 128, 128 },
+            { 102, 103, 231, 255, 211, 171, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 152, 252, 255, 240, 255, 128, 128, 128, 128, 128 },
+            { 177, 135, 243, 255, 234, 225, 128, 128, 128, 128, 128 },
+            {  80, 129, 211, 255, 194, 224, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 246,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 255, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 1 ) */
+        { /* Coeff Band ( 0 )*/
+            { 198,  35, 237, 223, 193, 187, 162, 160, 145, 155,  62 },
+            { 131,  45, 198, 221, 172, 176, 220, 157, 252, 221,   1 },
+            {  68,  47, 146, 208, 149, 167, 221, 162, 255, 223, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1, 149, 241, 255, 221, 224, 255, 255, 128, 128, 128 },
+            { 184, 141, 234, 253, 222, 220, 255, 199, 128, 128, 128 },
+            {  81,  99, 181, 242, 176, 190, 249, 202, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1, 129, 232, 253, 214, 197, 242, 196, 255, 255, 128 },
+            {  99, 121, 210, 250, 201, 198, 255, 202, 128, 128, 128 },
+            {  23,  91, 163, 242, 170, 187, 247, 210, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 200, 246, 255, 234, 255, 128, 128, 128, 128, 128 },
+            { 109, 178, 241, 255, 231, 245, 255, 255, 128, 128, 128 },
+            {  44, 130, 201, 253, 205, 192, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1, 132, 239, 251, 219, 209, 255, 165, 128, 128, 128 },
+            {  94, 136, 225, 251, 218, 190, 255, 255, 128, 128, 128 },
+            {  22, 100, 174, 245, 186, 161, 255, 199, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 182, 249, 255, 232, 235, 128, 128, 128, 128, 128 },
+            { 124, 143, 241, 255, 227, 234, 128, 128, 128, 128, 128 },
+            {  35,  77, 181, 251, 193, 211, 255, 205, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 157, 247, 255, 236, 231, 255, 255, 128, 128, 128 },
+            { 121, 141, 235, 255, 225, 227, 255, 255, 128, 128, 128 },
+            {  45,  99, 188, 251, 195, 217, 255, 224, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 251, 255, 213, 255, 128, 128, 128, 128, 128 },
+            { 203,   1, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 137,   1, 177, 255, 224, 255, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 2 ) */
+        { /* Coeff Band ( 0 )*/
+            { 253,   9, 248, 251, 207, 208, 255, 192, 128, 128, 128 },
+            { 175,  13, 224, 243, 193, 185, 249, 198, 255, 255, 128 },
+            {  73,  17, 171, 221, 161, 179, 236, 167, 255, 234, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1,  95, 247, 253, 212, 183, 255, 255, 128, 128, 128 },
+            { 239,  90, 244, 250, 211, 209, 255, 255, 128, 128, 128 },
+            { 155,  77, 195, 248, 188, 195, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  24, 239, 251, 218, 219, 255, 205, 128, 128, 128 },
+            { 201,  51, 219, 255, 196, 186, 128, 128, 128, 128, 128 },
+            {  69,  46, 190, 239, 201, 218, 255, 228, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 191, 251, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 223, 165, 249, 255, 213, 255, 128, 128, 128, 128, 128 },
+            { 141, 124, 248, 255, 255, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1,  16, 248, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 190,  36, 230, 255, 236, 255, 128, 128, 128, 128, 128 },
+            { 149,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 226, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 247, 192, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 240, 128, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 134, 252, 255, 255, 128, 128, 128, 128, 128, 128 },
+            { 213,  62, 250, 255, 255, 128, 128, 128, 128, 128, 128 },
+            {  55,  93, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    },
+    { /* Block Type ( 3 ) */
+        { /* Coeff Band ( 0 )*/
+            { 202,  24, 213, 235, 186, 191, 220, 160, 240, 175, 255 },
+            { 126,  38, 182, 232, 169, 184, 228, 174, 255, 187, 128 },
+            {  61,  46, 138, 219, 151, 178, 240, 170, 255, 216, 128 }
+        },
+        { /* Coeff Band ( 1 )*/
+            {   1, 112, 230, 250, 199, 191, 247, 159, 255, 255, 128 },
+            { 166, 109, 228, 252, 211, 215, 255, 174, 128, 128, 128 },
+            {  39,  77, 162, 232, 172, 180, 245, 178, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 2 )*/
+            {   1,  52, 220, 246, 198, 199, 249, 220, 255, 255, 128 },
+            { 124,  74, 191, 243, 183, 193, 250, 221, 255, 255, 128 },
+            {  24,  71, 130, 219, 154, 170, 243, 182, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 3 )*/
+            {   1, 182, 225, 249, 219, 240, 255, 224, 128, 128, 128 },
+            { 149, 150, 226, 252, 216, 205, 255, 171, 128, 128, 128 },
+            {  28, 108, 170, 242, 183, 194, 254, 223, 255, 255, 128 }
+        },
+        { /* Coeff Band ( 4 )*/
+            {   1,  81, 230, 252, 204, 203, 255, 192, 128, 128, 128 },
+            { 123, 102, 209, 247, 188, 196, 255, 233, 128, 128, 128 },
+            {  20,  95, 153, 243, 164, 173, 255, 203, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 5 )*/
+            {   1, 222, 248, 255, 216, 213, 128, 128, 128, 128, 128 },
+            { 168, 175, 246, 252, 235, 205, 255, 255, 128, 128, 128 },
+            {  47, 116, 215, 255, 211, 212, 255, 255, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 6 )*/
+            {   1, 121, 236, 253, 212, 214, 255, 255, 128, 128, 128 },
+            { 141,  84, 213, 252, 201, 202, 255, 219, 128, 128, 128 },
+            {  42,  80, 160, 240, 162, 185, 255, 205, 128, 128, 128 }
+        },
+        { /* Coeff Band ( 7 )*/
+            {   1,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 244,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 },
+            { 238,   1, 255, 128, 128, 128, 128, 128, 128, 128, 128 }
+        }
+    }
+};
diff --git a/vp8/common/defaultcoefcounts.h b/vp8/common/defaultcoefcounts.h
deleted file mode 100644
index 7a1e28b..0000000
--- a/vp8/common/defaultcoefcounts.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef __DEFAULTCOEFCOUNTS_H
-#define __DEFAULTCOEFCOUNTS_H
-
-#include "entropy.h"
-
-extern const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
-                                                 [COEF_BANDS]
-                                                 [PREV_COEF_CONTEXTS]
-                                                 [MAX_ENTROPY_TOKENS];
-
-#endif //__DEFAULTCOEFCOUNTS_H
diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c
index 0eee60e..f3d5a9c 100644
--- a/vp8/common/entropy.c
+++ b/vp8/common/entropy.c
@@ -15,6 +15,7 @@
 #include "string.h"
 #include "blockd.h"
 #include "onyxc_int.h"
+#include "vpx_mem/vpx_mem.h"
 
 #define uchar unsigned char     /* typedefs can clash */
 #define uint  unsigned int
@@ -153,39 +154,15 @@
     { cat6, Pcat6, 11, 67},
     { 0, 0, 0, 0}
 };
-#include "defaultcoefcounts.h"
+
+#include "default_coef_probs.h"
 
 void vp8_default_coef_probs(VP8_COMMON *pc)
 {
-    int h = 0;
-
-    do
-    {
-        int i = 0;
-
-        do
-        {
-            int k = 0;
-
-            do
-            {
-                unsigned int branch_ct [ENTROPY_NODES] [2];
-                vp8_tree_probs_from_distribution(
-                    MAX_ENTROPY_TOKENS, vp8_coef_encodings, vp8_coef_tree,
-                    pc->fc.coef_probs[h][i][k],
-                    branch_ct,
-                    vp8_default_coef_counts[h][i][k],
-                    256, 1);
-
-            }
-            while (++k < PREV_COEF_CONTEXTS);
-        }
-        while (++i < COEF_BANDS);
-    }
-    while (++h < BLOCK_TYPES);
+    vpx_memcpy(pc->fc.coef_probs, default_coef_probs,
+                   sizeof(default_coef_probs));
 }
 
-
 void vp8_coef_tree_initialize()
 {
     init_bit_trees();
diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c
index 84cda13..064a835 100644
--- a/vp8/common/reconinter.c
+++ b/vp8/common/reconinter.c
@@ -19,10 +19,6 @@
 #include "onyxc_int.h"
 #endif
 
-static const int bbb[4] = {0, 2, 8, 10};
-
-
-
 void vp8_copy_mem16x16_c(
     unsigned char *src,
     int src_stride,
@@ -203,54 +199,109 @@
 
 
 /*encoder only*/
-void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x)
+void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x)
 {
-    int i;
+    unsigned char *uptr, *vptr;
+    unsigned char *upred_ptr = &x->predictor[256];
+    unsigned char *vpred_ptr = &x->predictor[320];
 
-    if (x->mode_info_context->mbmi.mode != SPLITMV)
+    int mv_row = x->mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->mode_info_context->mbmi.mv.as_mv.col;
+    int offset;
+    int pre_stride = x->block[16].pre_stride;
+
+    /* calc uv motion vectors: halve the MB mv, rounding ties away from zero */
+    if (mv_row < 0)
+        mv_row -= 1;
+    else
+        mv_row += 1;
+
+    if (mv_col < 0)
+        mv_col -= 1;
+    else
+        mv_col += 1;
+
+    mv_row /= 2;
+    mv_col /= 2;
+
+    mv_row &= x->fullpixel_mask;
+    mv_col &= x->fullpixel_mask;
+
+    offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
+    uptr = x->pre.u_buffer + offset;
+    vptr = x->pre.v_buffer + offset;
+
+    if ((mv_row | mv_col) & 7)
     {
-        unsigned char *uptr, *vptr;
-        unsigned char *upred_ptr = &x->predictor[256];
-        unsigned char *vpred_ptr = &x->predictor[320];
-
-        int mv_row = x->block[16].bmi.mv.as_mv.row;
-        int mv_col = x->block[16].bmi.mv.as_mv.col;
-        int offset;
-        int pre_stride = x->block[16].pre_stride;
-
-        offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
-        uptr = x->pre.u_buffer + offset;
-        vptr = x->pre.v_buffer + offset;
-
-        if ((mv_row | mv_col) & 7)
-        {
-            x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
-            x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
-        }
-        else
-        {
-            RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
-            RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
-        }
+        x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8);
+        x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8);
     }
     else
     {
-        for (i = 16; i < 24; i += 2)
-        {
-            BLOCKD *d0 = &x->block[i];
-            BLOCKD *d1 = &x->block[i+1];
+        RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8);
+        RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8);
+    }
+}
 
-            if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
-                build_inter_predictors2b(x, d0, 8);
-            else
-            {
-                vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
-                vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
-            }
+/*encoder only*/
+void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x)
+{
+    int i, j;
+
+    /* build uv mvs: (sum of 4 Y block mvs) / 8, rounding ties away from zero */
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            int yoffset = i * 8 + j * 2;
+            int uoffset = 16 + i * 2 + j;
+            int voffset = 20 + i * 2 + j;
+
+            int temp;
+
+            temp = x->block[yoffset  ].bmi.mv.as_mv.row
+                   + x->block[yoffset+1].bmi.mv.as_mv.row
+                   + x->block[yoffset+4].bmi.mv.as_mv.row
+                   + x->block[yoffset+5].bmi.mv.as_mv.row;
+
+            if (temp < 0) temp -= 4;
+            else temp += 4;
+
+            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+            temp = x->block[yoffset  ].bmi.mv.as_mv.col
+                   + x->block[yoffset+1].bmi.mv.as_mv.col
+                   + x->block[yoffset+4].bmi.mv.as_mv.col
+                   + x->block[yoffset+5].bmi.mv.as_mv.col;
+
+            if (temp < 0) temp -= 4;
+            else temp += 4;
+
+            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+            x->block[voffset].bmi.mv.as_mv.row =
+                x->block[uoffset].bmi.mv.as_mv.row ;
+            x->block[voffset].bmi.mv.as_mv.col =
+                x->block[uoffset].bmi.mv.as_mv.col ;
+        }
+    }
+
+    for (i = 16; i < 24; i += 2)
+    {
+        BLOCKD *d0 = &x->block[i];
+        BLOCKD *d1 = &x->block[i+1];
+
+        if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
+            build_inter_predictors2b(x, d0, 8);
+        else
+        {
+            vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict);
+            vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict);
         }
     }
 }
 
+
 /*encoder only*/
 void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x)
 {
@@ -302,8 +353,23 @@
         RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, dst_ystride);
     }
 
-    mv_row = x->block[16].bmi.mv.as_mv.row;
-    mv_col = x->block[16].bmi.mv.as_mv.col;
+    /* calc uv motion vectors: halve mv_row/mv_col, rounding ties away from zero */
+    if (mv_row < 0)
+        mv_row -= 1;
+    else
+        mv_row += 1;
+
+    if (mv_col < 0)
+        mv_col -= 1;
+    else
+        mv_col += 1;
+
+    mv_row /= 2;
+    mv_col /= 2;
+
+    mv_row &= x->fullpixel_mask;
+    mv_col &= x->fullpixel_mask;
+
     pre_stride >>= 1;
     offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
     uptr = x->pre.u_buffer + offset;
@@ -322,17 +388,21 @@
 
 }
 
-void vp8_build_inter4x4_predictors_mb(MACROBLOCKD *x)
+static void build_inter4x4_predictors_mb(MACROBLOCKD *x)
 {
     int i;
 
     if (x->mode_info_context->mbmi.partitioning < 3)
     {
-        for (i = 0; i < 4; i++)
-        {
-            BLOCKD *d = &x->block[bbb[i]];
-            build_inter_predictors4b(x, d, 16);
-        }
+        x->block[ 0].bmi = x->mode_info_context->bmi[ 0];
+        x->block[ 2].bmi = x->mode_info_context->bmi[ 2];
+        x->block[ 8].bmi = x->mode_info_context->bmi[ 8];
+        x->block[10].bmi = x->mode_info_context->bmi[10];
+
+        build_inter_predictors4b(x, &x->block[ 0], 16);
+        build_inter_predictors4b(x, &x->block[ 2], 16);
+        build_inter_predictors4b(x, &x->block[ 8], 16);
+        build_inter_predictors4b(x, &x->block[10], 16);
     }
     else
     {
@@ -341,6 +411,9 @@
             BLOCKD *d0 = &x->block[i];
             BLOCKD *d1 = &x->block[i+1];
 
+            x->block[i+0].bmi = x->mode_info_context->bmi[i+0];
+            x->block[i+1].bmi = x->mode_info_context->bmi[i+1];
+
             if (d0->bmi.mv.as_int == d1->bmi.mv.as_int)
                 build_inter_predictors2b(x, d0, 16);
             else
@@ -368,6 +441,49 @@
     }
 }
 
+static
+void build_4x4uvmvs(MACROBLOCKD *x)
+{
+    int i, j;
+
+    for (i = 0; i < 2; i++)
+    {
+        for (j = 0; j < 2; j++)
+        {
+            int yoffset = i * 8 + j * 2;
+            int uoffset = 16 + i * 2 + j;
+            int voffset = 20 + i * 2 + j;
+
+            int temp;
+
+            temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.row
+                 + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.row
+                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.row
+                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.row;
+
+            if (temp < 0) temp -= 4;
+            else temp += 4;
+
+            x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & x->fullpixel_mask;
+
+            temp = x->mode_info_context->bmi[yoffset + 0].mv.as_mv.col
+                 + x->mode_info_context->bmi[yoffset + 1].mv.as_mv.col
+                 + x->mode_info_context->bmi[yoffset + 4].mv.as_mv.col
+                 + x->mode_info_context->bmi[yoffset + 5].mv.as_mv.col;
+
+            if (temp < 0) temp -= 4;
+            else temp += 4;
+
+            x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & x->fullpixel_mask;
+
+            x->block[voffset].bmi.mv.as_mv.row =
+                x->block[uoffset].bmi.mv.as_mv.row ;
+            x->block[voffset].bmi.mv.as_mv.col =
+                x->block[uoffset].bmi.mv.as_mv.col ;
+        }
+    }
+}
+
 void vp8_build_inter_predictors_mb(MACROBLOCKD *x)
 {
     if (x->mode_info_context->mbmi.mode != SPLITMV)
@@ -377,89 +493,8 @@
     }
     else
     {
-        vp8_build_inter4x4_predictors_mb(x);
+        build_4x4uvmvs(x);
+        build_inter4x4_predictors_mb(x);
     }
 }
 
-void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel)
-{
-    int i, j;
-
-    if (x->mode_info_context->mbmi.mode == SPLITMV)
-    {
-        for (i = 0; i < 2; i++)
-        {
-            for (j = 0; j < 2; j++)
-            {
-                int yoffset = i * 8 + j * 2;
-                int uoffset = 16 + i * 2 + j;
-                int voffset = 20 + i * 2 + j;
-
-                int temp;
-
-                temp = x->block[yoffset  ].bmi.mv.as_mv.row
-                       + x->block[yoffset+1].bmi.mv.as_mv.row
-                       + x->block[yoffset+4].bmi.mv.as_mv.row
-                       + x->block[yoffset+5].bmi.mv.as_mv.row;
-
-                if (temp < 0) temp -= 4;
-                else temp += 4;
-
-                x->block[uoffset].bmi.mv.as_mv.row = temp / 8;
-
-                if (fullpixel)
-                    x->block[uoffset].bmi.mv.as_mv.row = (temp / 8) & 0xfffffff8;
-
-                temp = x->block[yoffset  ].bmi.mv.as_mv.col
-                       + x->block[yoffset+1].bmi.mv.as_mv.col
-                       + x->block[yoffset+4].bmi.mv.as_mv.col
-                       + x->block[yoffset+5].bmi.mv.as_mv.col;
-
-                if (temp < 0) temp -= 4;
-                else temp += 4;
-
-                x->block[uoffset].bmi.mv.as_mv.col = temp / 8;
-
-                if (fullpixel)
-                    x->block[uoffset].bmi.mv.as_mv.col = (temp / 8) & 0xfffffff8;
-
-                x->block[voffset].bmi.mv.as_mv.row = x->block[uoffset].bmi.mv.as_mv.row ;
-                x->block[voffset].bmi.mv.as_mv.col = x->block[uoffset].bmi.mv.as_mv.col ;
-            }
-        }
-    }
-    else
-    {
-        int mvrow = x->mode_info_context->mbmi.mv.as_mv.row;
-        int mvcol = x->mode_info_context->mbmi.mv.as_mv.col;
-
-        if (mvrow < 0)
-            mvrow -= 1;
-        else
-            mvrow += 1;
-
-        if (mvcol < 0)
-            mvcol -= 1;
-        else
-            mvcol += 1;
-
-        mvrow /= 2;
-        mvcol /= 2;
-
-        for (i = 0; i < 8; i++)
-        {
-            x->block[ 16 + i].bmi.mv.as_mv.row = mvrow;
-            x->block[ 16 + i].bmi.mv.as_mv.col = mvcol;
-
-            if (fullpixel)
-            {
-                x->block[ 16 + i].bmi.mv.as_mv.row = mvrow & 0xfffffff8;
-                x->block[ 16 + i].bmi.mv.as_mv.col = mvcol & 0xfffffff8;
-            }
-        }
-    }
-}
-
-
-
-
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index a68e4aa..456812e 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -22,8 +22,9 @@
 
 
 extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x);
-extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
 extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf);
-extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x);
+
+extern void vp8_build_inter16x16_predictors_mbuv(MACROBLOCKD *x);
+extern void vp8_build_inter4x4_predictors_mbuv(MACROBLOCKD *x);
 
 #endif
diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm
index 34a7e18..83d3765 100644
--- a/vp8/common/x86/idctllm_sse2.asm
+++ b/vp8/common/x86/idctllm_sse2.asm
@@ -11,7 +11,7 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;void idct_dequant_0_2x_sse2
+;void vp8_idct_dequant_0_2x_sse2
 ; (
 ;   short *qcoeff       - 0
 ;   short *dequant      - 1
@@ -21,8 +21,8 @@
 ;   int blk_stride      - 5
 ; )
 
-global sym(idct_dequant_0_2x_sse2)
-sym(idct_dequant_0_2x_sse2):
+global sym(vp8_idct_dequant_0_2x_sse2)
+sym(vp8_idct_dequant_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -97,8 +97,8 @@
     pop         rbp
     ret
 
-global sym(idct_dequant_full_2x_sse2)
-sym(idct_dequant_full_2x_sse2):
+global sym(vp8_idct_dequant_full_2x_sse2)
+sym(vp8_idct_dequant_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -353,7 +353,7 @@
     pop         rbp
     ret
 
-;void idct_dequant_dc_0_2x_sse2
+;void vp8_idct_dequant_dc_0_2x_sse2
 ; (
 ;   short *qcoeff       - 0
 ;   short *dequant      - 1
@@ -362,8 +362,8 @@
 ;   int dst_stride      - 4
 ;   short *dc           - 5
 ; )
-global sym(idct_dequant_dc_0_2x_sse2)
-sym(idct_dequant_dc_0_2x_sse2):
+global sym(vp8_idct_dequant_dc_0_2x_sse2)
+sym(vp8_idct_dequant_dc_0_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
@@ -438,8 +438,8 @@
     pop         rbp
     ret
 
-global sym(idct_dequant_dc_full_2x_sse2)
-sym(idct_dequant_dc_full_2x_sse2):
+global sym(vp8_idct_dequant_dc_full_2x_sse2)
+sym(vp8_idct_dequant_dc_full_2x_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 7
diff --git a/vp8/common/x86/loopfilter_mmx.asm b/vp8/common/x86/loopfilter_mmx.asm
index ad47284..697a5de 100644
--- a/vp8/common/x86/loopfilter_mmx.asm
+++ b/vp8/common/x86/loopfilter_mmx.asm
@@ -40,7 +40,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         movsxd      rcx, dword ptr arg(5) ;count
-next8_h:
+.next8_h:
         mov         rdx, arg(3) ;limit
         movq        mm7, [rdx]
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
@@ -211,7 +211,7 @@
         add         rsi,8
         neg         rax
         dec         rcx
-        jnz         next8_h
+        jnz         .next8_h
 
     add rsp, 32
     pop rsp
@@ -255,7 +255,7 @@
         lea         rsi,        [rsi + rax*4 - 4]
 
         movsxd      rcx,        dword ptr arg(5) ;count
-next8_v:
+.next8_v:
         mov         rdi,        rsi           ; rdi points to row +1 for indirect addressing
         add         rdi,        rax
 
@@ -581,7 +581,7 @@
 
         lea         rsi,        [rsi+rax*8]
         dec         rcx
-        jnz         next8_v
+        jnz         .next8_v
 
     add rsp, 64
     pop rsp
@@ -622,7 +622,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         movsxd      rcx, dword ptr arg(5) ;count
-next8_mbh:
+.next8_mbh:
         mov         rdx, arg(3) ;limit
         movq        mm7, [rdx]
         mov         rdi, rsi              ; rdi points to row +1 for indirect addressing
@@ -898,7 +898,7 @@
         neg         rax
         add         rsi,8
         dec         rcx
-        jnz         next8_mbh
+        jnz         .next8_mbh
 
     add rsp, 32
     pop rsp
@@ -942,7 +942,7 @@
         lea         rsi,        [rsi + rax*4 - 4]
 
         movsxd      rcx,        dword ptr arg(5) ;count
-next8_mbv:
+.next8_mbv:
         lea         rdi,        [rsi + rax]  ; rdi points to row +1 for indirect addressing
 
         ;transpose
@@ -1365,7 +1365,7 @@
         lea         rsi,        [rsi+rax*8]
         dec         rcx
 
-        jnz         next8_mbv
+        jnz         .next8_mbv
 
     add rsp, 96
     pop rsp
@@ -1398,7 +1398,7 @@
         movsxd      rax, dword ptr arg(1) ;src_pixel_step     ; destination pitch?
 
         mov         rcx, 2                ; count
-nexts8_h:
+.nexts8_h:
         mov         rdx, arg(2) ;blimit           ; get blimit
         movq        mm3, [rdx]            ;
 
@@ -1483,7 +1483,7 @@
         add         rsi,8
         neg         rax
         dec         rcx
-        jnz         nexts8_h
+        jnz         .nexts8_h
 
     ; begin epilog
     pop rdi
@@ -1520,7 +1520,7 @@
 
         lea         rsi, [rsi + rax*4- 2];  ;
         mov         rcx, 2                                      ; count
-nexts8_v:
+.nexts8_v:
 
         lea         rdi,        [rsi + rax];
         movd        mm0,        [rdi + rax * 2]                 ; xx xx xx xx 73 72 71 70
@@ -1695,7 +1695,7 @@
         lea         rsi,        [rsi+rax*8]                 ; next 8
 
         dec         rcx
-        jnz         nexts8_v
+        jnz         .nexts8_v
 
     add rsp, 32
     pop rsp
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 4efff7e..295609c 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -1395,8 +1395,8 @@
         neg         rax
 
         ; calculate mask
-        movdqu      xmm1, [rsi+2*rax]       ; p1
-        movdqu      xmm0, [rdi]             ; q1
+        movdqa      xmm1, [rsi+2*rax]       ; p1
+        movdqa      xmm0, [rdi]             ; q1
         movdqa      xmm2, xmm1
         movdqa      xmm7, xmm0
         movdqa      xmm4, xmm0
@@ -1406,8 +1406,8 @@
         pand        xmm1, [GLOBAL(tfe)]     ; set lsb of each byte to zero
         psrlw       xmm1, 1                 ; abs(p1-q1)/2
 
-        movdqu      xmm5, [rsi+rax]         ; p0
-        movdqu      xmm4, [rsi]             ; q0
+        movdqa      xmm5, [rsi+rax]         ; p0
+        movdqa      xmm4, [rsi]             ; q0
         movdqa      xmm0, xmm4              ; q0
         movdqa      xmm6, xmm5              ; p0
         psubusb     xmm5, xmm4              ; p0-=q0
@@ -1449,7 +1449,7 @@
 
         psubsb      xmm3, xmm0              ; q0-= q0 add
         pxor        xmm3, [GLOBAL(t80)]     ; unoffset
-        movdqu      [rsi], xmm3             ; write back
+        movdqa      [rsi], xmm3             ; write back
 
         ; now do +3 side
         psubsb      xmm5, [GLOBAL(t1s)]     ; +3 instead of +4
@@ -1465,7 +1465,7 @@
 
         paddsb      xmm6, xmm0              ; p0+= p0 add
         pxor        xmm6, [GLOBAL(t80)]     ; unoffset
-        movdqu      [rsi+rax], xmm6         ; write back
+        movdqa      [rsi+rax], xmm6         ; write back
 
     ; begin epilog
     pop rdi
@@ -1507,17 +1507,17 @@
         lea         rdx,        [rsi + rax*4]
         lea         rcx,        [rdx + rax]
 
-        movdqu      xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
-        movdqu      xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
-        movdqu      xmm2,       [rdi]                   ; 13 12 11 10
-        movdqu      xmm3,       [rcx]                   ; 53 52 51 50
+        movd        xmm0,       [rsi]                   ; (high 96 bits unused) 03 02 01 00
+        movd        xmm1,       [rdx]                   ; (high 96 bits unused) 43 42 41 40
+        movd        xmm2,       [rdi]                   ; 13 12 11 10
+        movd        xmm3,       [rcx]                   ; 53 52 51 50
         punpckldq   xmm0,       xmm1                    ; (high 64 bits unused) 43 42 41 40 03 02 01 00
         punpckldq   xmm2,       xmm3                    ; 53 52 51 50 13 12 11 10
 
-        movdqu      xmm4,       [rsi + rax*2]           ; 23 22 21 20
-        movdqu      xmm5,       [rdx + rax*2]           ; 63 62 61 60
-        movdqu      xmm6,       [rdi + rax*2]           ; 33 32 31 30
-        movdqu      xmm7,       [rcx + rax*2]           ; 73 72 71 70
+        movd        xmm4,       [rsi + rax*2]           ; 23 22 21 20
+        movd        xmm5,       [rdx + rax*2]           ; 63 62 61 60
+        movd        xmm6,       [rdi + rax*2]           ; 33 32 31 30
+        movd        xmm7,       [rcx + rax*2]           ; 73 72 71 70
         punpckldq   xmm4,       xmm5                    ; 63 62 61 60 23 22 21 20
         punpckldq   xmm6,       xmm7                    ; 73 72 71 70 33 32 31 30
 
@@ -1540,17 +1540,17 @@
         lea         rdx,        [rsi + rax*4]
         lea         rcx,        [rdx + rax]
 
-        movdqu      xmm4,       [rsi]                   ; 83 82 81 80
-        movdqu      xmm1,       [rdx]                   ; c3 c2 c1 c0
-        movdqu      xmm6,       [rdi]                   ; 93 92 91 90
-        movdqu      xmm3,       [rcx]                   ; d3 d2 d1 d0
+        movd        xmm4,       [rsi]                   ; 83 82 81 80
+        movd        xmm1,       [rdx]                   ; c3 c2 c1 c0
+        movd        xmm6,       [rdi]                   ; 93 92 91 90
+        movd        xmm3,       [rcx]                   ; d3 d2 d1 d0
         punpckldq   xmm4,       xmm1                    ; c3 c2 c1 c0 83 82 81 80
         punpckldq   xmm6,       xmm3                    ; d3 d2 d1 d0 93 92 91 90
 
-        movdqu      xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
-        movdqu      xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
-        movdqu      xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
-        movdqu      xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
+        movd        xmm0,       [rsi + rax*2]           ; a3 a2 a1 a0
+        movd        xmm5,       [rdx + rax*2]           ; e3 e2 e1 e0
+        movd        xmm2,       [rdi + rax*2]           ; b3 b2 b1 b0
+        movd        xmm7,       [rcx + rax*2]           ; f3 f2 f1 f0
         punpckldq   xmm0,       xmm5                    ; e3 e2 e1 e0 a3 a2 a1 a0
         punpckldq   xmm2,       xmm7                    ; f3 f2 f1 f0 b3 b2 b1 b0
 
diff --git a/vp8/common/x86/postproc_mmx.asm b/vp8/common/x86/postproc_mmx.asm
index 787e832..8112218 100644
--- a/vp8/common/x86/postproc_mmx.asm
+++ b/vp8/common/x86/postproc_mmx.asm
@@ -58,10 +58,10 @@
         movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
         pxor        mm0, mm0              ; mm0 = 00000000
 
-nextrow:
+.nextrow:
 
         xor         rdx,        rdx       ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
 
         pxor        mm7, mm7              ; mm7 = 00000000
         movq        mm6, [rbx + 32 ]      ; mm6 = kernel 2 taps
@@ -146,7 +146,7 @@
         add         rdx, 4
 
         cmp         edx, dword ptr arg(5) ;cols
-        jl          nextcol
+        jl          .nextcol
         ; done with the all cols, start the across filtering in place
         sub         rsi, rdx
         sub         rdi, rdx
@@ -156,7 +156,7 @@
         xor         rdx,    rdx
         mov         rax,    [rdi-4];
 
-acrossnextcol:
+.acrossnextcol:
         pxor        mm7, mm7              ; mm7 = 00000000
         movq        mm6, [rbx + 32 ]      ;
         movq        mm4, [rdi+rdx]        ; mm4 = p0..p7
@@ -237,7 +237,7 @@
 
         add         rdx, 4
         cmp         edx, dword ptr arg(5) ;cols
-        jl          acrossnextcol;
+        jl          .acrossnextcol;
 
         mov         DWORD PTR [rdi+rdx-4],  eax
         pop         rax
@@ -249,7 +249,7 @@
         movsxd      rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch?
 
         dec         rcx                   ; decrement count
-        jnz         nextrow               ; next row
+        jnz         .nextrow               ; next row
         pop         rbx
 
     ; begin epilog
@@ -293,7 +293,7 @@
     add         dword ptr arg(2), 8
 
     ;for(c=0; c<cols; c+=4)
-loop_col:
+.loop_col:
             mov         rsi,        arg(0)  ;s
             pxor        mm0,        mm0     ;
 
@@ -312,7 +312,7 @@
 
             mov         rcx,        15          ;
 
-loop_initvar:
+.loop_initvar:
             movd        mm1,        DWORD PTR [rdi];
             punpcklbw   mm1,        mm0     ;
 
@@ -329,10 +329,10 @@
             lea         rdi,        [rdi+rax]   ;
 
             dec         rcx
-            jne         loop_initvar
+            jne         .loop_initvar
             ;save the var and sum
             xor         rdx,        rdx
-loop_row:
+.loop_row:
             movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]
             movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]
 
@@ -438,13 +438,13 @@
             add         rdx,        1
 
             cmp         edx,        dword arg(2) ;rows
-            jl          loop_row
+            jl          .loop_row
 
 
         add         dword arg(0), 4 ; s += 4
         sub         dword arg(3), 4 ; cols -= 4
         cmp         dword arg(3), 0
-        jg          loop_col
+        jg          .loop_col
 
     add         rsp, 136
     pop         rsp
@@ -475,7 +475,7 @@
     push        rdi
     ; end prolog
 
-addnoise_loop:
+.addnoise_loop:
     call sym(rand) WRT_PLT
     mov     rcx, arg(1) ;noise
     and     rax, 0xff
@@ -492,7 +492,7 @@
             mov     rsi, arg(0) ;Pos
             xor         rax,rax
 
-addnoise_nextset:
+.addnoise_nextset:
             movq        mm1,[rsi+rax]         ; get the source
 
             psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
@@ -506,12 +506,12 @@
             add         rax,8                 ; move to the next line
 
             cmp         rax, rcx
-            jl          addnoise_nextset
+            jl          .addnoise_nextset
 
     movsxd  rax, dword arg(7) ; Pitch
     add     arg(0), rax ; Start += Pitch
     sub     dword arg(6), 1   ; Height -= 1
-    jg      addnoise_loop
+    jg      .addnoise_loop
 
     ; begin epilog
     pop rdi
diff --git a/vp8/common/x86/postproc_mmx.c b/vp8/common/x86/postproc_mmx.c
deleted file mode 100644
index 6b6321a..0000000
--- a/vp8/common/x86/postproc_mmx.c
+++ /dev/null
@@ -1,1508 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include <math.h>
-#include <stdlib.h>
-#include "vpx_scale/yv12config.h"
-#include "pragmas.h"
-
-#define VP8_FILTER_WEIGHT 128
-#define VP8_FILTER_SHIFT  7
-
-
-
-/* static constants */
-__declspec(align(16))
-const static short  Blur[48] =
-{
-
-    16, 16, 16, 16, 16, 16, 16, 16,
-    16, 16, 16, 16, 16, 16, 16, 16,
-    64, 64, 64, 64, 64, 64, 64, 64,
-    16, 16, 16, 16, 16, 16, 16, 16,
-    16, 16, 16, 16, 16, 16, 16, 16,
-    0,  0,  0,  0,  0,  0,  0,  0,
-
-};
-#define RD  __declspec(align(16)) __int64 rd  = 0x0040004000400040;
-#define R4D2 __declspec(align(16)) __int64 rd42[2] = {0x0004000400040004,0x0004000400040004};
-
-#ifndef RELOCATEABLE
-const static RD;
-const static R4D2;
-#endif
-
-
-/* external references */
-extern double vp8_gaussian(double sigma, double mu, double x);
-extern short vp8_rv[];
-extern int vp8_q2mbl(int x) ;
-
-
-
-void vp8_post_proc_down_and_across_mmx
-(
-    unsigned char *src_ptr,
-    unsigned char *dst_ptr,
-    int src_pixels_per_line,
-    int dst_pixels_per_line,
-    int rows,
-    int cols,
-    int flimit
-)
-{
-#ifdef RELOCATEABLE
-    RD
-    R4D2
-#endif
-
-    __asm
-    {
-        push        ebx
-        lea         ebx, Blur
-        movd        mm2, flimit
-        punpcklwd   mm2, mm2
-        punpckldq   mm2, mm2
-
-        mov         esi,        src_ptr
-        mov         edi,        dst_ptr
-
-        mov         ecx, DWORD PTR rows
-        mov         eax, src_pixels_per_line ;
-        destination pitch?
-        pxor        mm0, mm0              ;
-        mm0 = 00000000
-
-        nextrow:
-
-        xor         edx,        edx       ;
-
-        clear out edx for use as loop counter
-        nextcol:
-
-        pxor        mm7, mm7              ;
-
-    mm7 = 00000000
-    movq        mm6, [ebx + 32 ]      ;
-        mm6 = kernel 2 taps
-        movq        mm3, [esi]            ;
-        mm4 = r0 p0..p7
-        punpcklbw   mm3, mm0              ;
-        mm3 = p0..p3
-        movq        mm1, mm3              ;
-        mm1 = p0..p3
-        pmullw      mm3, mm6              ;
-        mm3 *= kernel 2 modifiers
-
-        movq        mm6, [ebx + 48]       ;
-        mm6 = kernel 3 taps
-        movq        mm5, [esi + eax]      ;
-        mm4 = r1 p0..p7
-        punpcklbw   mm5, mm0              ;
-        mm5 = r1 p0..p3
-        pmullw      mm6, mm5              ;
-        mm6 *= p0..p3 * kernel 3 modifiers
-        paddusw     mm3, mm6              ;
-        mm3 += mm6
-
-        ;
-        thresholding
-        movq        mm7, mm1              ;
-        mm7 = r0 p0..p3
-        psubusw     mm7, mm5              ;
-        mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     mm5, mm1              ;
-        mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     mm7, mm5              ;
-        mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [ebx + 64 ]      ;
-        mm6 = kernel 4 modifiers
-        movq        mm5, [esi + 2*eax]    ;
-        mm4 = r2 p0..p7
-        punpcklbw   mm5, mm0              ;
-        mm5 = r2 p0..p3
-        pmullw      mm6, mm5              ;
-        mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movq        mm6, mm1              ;
-        mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ;
-        mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     mm5, mm1              ;
-        mm5 = r2 p0..p3 - r2 p0..p3
-        paddusw     mm6, mm5              ;
-        mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ;
-        accumulate thresholds
-
-
-        neg         eax
-        movq        mm6, [ebx ]           ;
-        kernel 0 taps
-        movq        mm5, [esi+2*eax]      ;
-        mm4 = r-2 p0..p7
-        punpcklbw   mm5, mm0              ;
-        mm5 = r-2 p0..p3
-        pmullw      mm6, mm5              ;
-        mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movq        mm6, mm1              ;
-        mm6 = r0 p0..p3
-        psubusw     mm6, mm5              ;
-        mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm5, mm1              ;
-        mm5 = r-2 p0..p3 - p0..p3
-        paddusw     mm6, mm5              ;
-        mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ;
-        accumulate thresholds
-
-        movq        mm6, [ebx + 16]       ;
-        kernel 1 taps
-        movq        mm4, [esi+eax]        ;
-        mm4 = r-1 p0..p7
-        punpcklbw   mm4, mm0              ;
-        mm4 = r-1 p0..p3
-        pmullw      mm6, mm4              ;
-        mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movq        mm6, mm1              ;
-        mm6 = r0 p0..p3
-        psubusw     mm6, mm4              ;
-        mm6 = p0..p3 - r-2 p0..p3
-        psubusw     mm4, mm1              ;
-        mm5 = r-1 p0..p3 - p0..p3
-        paddusw     mm6, mm4              ;
-        mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ;
-        accumulate thresholds
-
-
-        paddusw     mm3, rd               ;
-        mm3 += round value
-        psraw       mm3, VP8_FILTER_SHIFT     ;
-        mm3 /= 128
-
-        pand        mm1, mm7              ;
-        mm1 select vals > thresh from source
-        pandn       mm7, mm3              ;
-        mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ;
-        combination
-
-        packuswb    mm1, mm0              ;
-        pack to bytes
-
-        movd        [edi], mm1            ;
-        neg         eax                   ;
-        pitch is positive
-
-
-        add         esi, 4
-        add         edi, 4
-        add         edx, 4
-
-        cmp         edx, cols
-        jl          nextcol
-        // done with the all cols, start the across filtering in place
-        sub         esi, edx
-        sub         edi, edx
-
-
-        push        eax
-        xor         edx,    edx
-        mov         eax,    [edi-4];
-
-        acrossnextcol:
-        pxor        mm7, mm7              ;
-        mm7 = 00000000
-        movq        mm6, [ebx + 32 ]      ;
-        movq        mm4, [edi+edx]        ;
-        mm4 = p0..p7
-        movq        mm3, mm4              ;
-        mm3 = p0..p7
-        punpcklbw   mm3, mm0              ;
-        mm3 = p0..p3
-        movq        mm1, mm3              ;
-        mm1 = p0..p3
-        pmullw      mm3, mm6              ;
-        mm3 *= kernel 2 modifiers
-
-        movq        mm6, [ebx + 48]
-        psrlq       mm4, 8                ;
-        mm4 = p1..p7
-        movq        mm5, mm4              ;
-        mm5 = p1..p7
-        punpcklbw   mm5, mm0              ;
-        mm5 = p1..p4
-        pmullw      mm6, mm5              ;
-        mm6 *= p1..p4 * kernel 3 modifiers
-        paddusw     mm3, mm6              ;
-        mm3 += mm6
-
-        ;
-        thresholding
-        movq        mm7, mm1              ;
-        mm7 = p0..p3
-        psubusw     mm7, mm5              ;
-        mm7 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     mm7, mm5              ;
-        mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm7, mm2
-
-        movq        mm6, [ebx + 64 ]
-        psrlq       mm4, 8                ;
-        mm4 = p2..p7
-        movq        mm5, mm4              ;
-        mm5 = p2..p7
-        punpcklbw   mm5, mm0              ;
-        mm5 = p2..p5
-        pmullw      mm6, mm5              ;
-        mm5 *= kernel 4 modifiers
-        paddusw     mm3, mm6              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movq        mm6, mm1              ;
-        mm6 = p0..p3
-        psubusw     mm6, mm5              ;
-        mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ;
-        mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ;
-        accumulate thresholds
-
-
-        movq        mm6, [ebx ]
-        movq        mm4, [edi+edx-2]      ;
-        mm4 = p-2..p5
-        movq        mm5, mm4              ;
-        mm5 = p-2..p5
-        punpcklbw   mm5, mm0              ;
-        mm5 = p-2..p1
-        pmullw      mm6, mm5              ;
-        mm5 *= kernel 0 modifiers
-        paddusw     mm3, mm6              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movq        mm6, mm1              ;
-        mm6 = p0..p3
-        psubusw     mm6, mm5              ;
-        mm6 = p0..p3 - p1..p4
-        psubusw     mm5, mm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm5              ;
-        mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ;
-        accumulate thresholds
-
-        movq        mm6, [ebx + 16]
-        psrlq       mm4, 8                ;
-        mm4 = p-1..p5
-        punpcklbw   mm4, mm0              ;
-        mm4 = p-1..p2
-        pmullw      mm6, mm4              ;
-        mm4 *= kernel 1 modifiers.
-        paddusw     mm3, mm6              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movq        mm6, mm1              ;
-        mm6 = p0..p3
-        psubusw     mm6, mm4              ;
-        mm6 = p0..p3 - p1..p4
-        psubusw     mm4, mm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     mm6, mm4              ;
-        mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     mm6, mm2
-        por         mm7, mm6              ;
-        accumulate thresholds
-
-        paddusw     mm3, rd               ;
-        mm3 += round value
-        psraw       mm3, VP8_FILTER_SHIFT     ;
-        mm3 /= 128
-
-        pand        mm1, mm7              ;
-        mm1 select vals > thresh from source
-        pandn       mm7, mm3              ;
-        mm7 select vals < thresh from blurred result
-        paddusw     mm1, mm7              ;
-        combination
-
-        packuswb    mm1, mm0              ;
-        pack to bytes
-        mov         DWORD PTR [edi+edx-4],  eax   ;
-        store previous four bytes
-        movd        eax,    mm1
-
-        add         edx, 4
-        cmp         edx, cols
-        jl          acrossnextcol;
-
-        mov         DWORD PTR [edi+edx-4],  eax
-        pop         eax
-
-        // done with this rwo
-        add         esi, eax               ;
-        next line
-        mov         eax, dst_pixels_per_line ;
-        destination pitch?
-        add         edi, eax               ;
-        next destination
-        mov         eax, src_pixels_per_line ;
-        destination pitch?
-
-        dec         ecx                   ;
-        decrement count
-        jnz         nextrow               ;
-        next row
-        pop         ebx
-
-    }
-}
-
-
-
-void vp8_post_proc_down_and_across_xmm
-(
-    unsigned char *src_ptr,
-    unsigned char *dst_ptr,
-    int src_pixels_per_line,
-    int dst_pixels_per_line,
-    int rows,
-    int cols,
-    int flimit
-)
-{
-#ifdef RELOCATEABLE
-    R4D2
-#endif
-
-    __asm
-    {
-        movd        xmm2,       flimit
-        punpcklwd   xmm2,       xmm2
-        punpckldq   xmm2,       xmm2
-        punpcklqdq  xmm2,       xmm2
-
-        mov         esi,        src_ptr
-        mov         edi,        dst_ptr
-
-        mov         ecx,        DWORD PTR rows
-        mov         eax,        src_pixels_per_line ;
-        destination pitch?
-        pxor        xmm0,       xmm0              ;
-        mm0 = 00000000
-
-        nextrow:
-
-        xor         edx,        edx       ;
-
-        clear out edx for use as loop counter
-        nextcol:
-        movq        xmm3,       QWORD PTR [esi]         ;
-
-        mm4 = r0 p0..p7
-        punpcklbw   xmm3,       xmm0                    ;
-        mm3 = p0..p3
-        movdqa      xmm1,       xmm3                    ;
-        mm1 = p0..p3
-        psllw       xmm3,       2                       ;
-
-        movq        xmm5,       QWORD PTR [esi + eax]   ;
-        mm4 = r1 p0..p7
-        punpcklbw   xmm5,       xmm0                    ;
-        mm5 = r1 p0..p3
-        paddusw     xmm3,       xmm5                    ;
-        mm3 += mm6
-
-        ;
-        thresholding
-        movdqa      xmm7,       xmm1                    ;
-        mm7 = r0 p0..p3
-        psubusw     xmm7,       xmm5                    ;
-        mm7 = r0 p0..p3 - r1 p0..p3
-        psubusw     xmm5,       xmm1                    ;
-        mm5 = r1 p0..p3 - r0 p0..p3
-        paddusw     xmm7,       xmm5                    ;
-        mm7 = abs(r0 p0..p3 - r1 p0..p3)
-        pcmpgtw     xmm7,       xmm2
-
-        movq        xmm5,       QWORD PTR [esi + 2*eax] ;
-        mm4 = r2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ;
-        mm5 = r2 p0..p3
-        paddusw     xmm3,       xmm5                    ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movdqa      xmm6,       xmm1                    ;
-        mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ;
-        mm6 = r0 p0..p3 - r2 p0..p3
-        psubusw     xmm5,       xmm1                    ;
-        mm5 = r2 p0..p3 - r2 p0..p3
-        paddusw     xmm6,       xmm5                    ;
-        mm6 = abs(r0 p0..p3 - r2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ;
-        accumulate thresholds
-
-
-        neg         eax
-        movq        xmm5,       QWORD PTR [esi+2*eax]   ;
-        mm4 = r-2 p0..p7
-        punpcklbw   xmm5,       xmm0                    ;
-        mm5 = r-2 p0..p3
-        paddusw     xmm3,       xmm5                    ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movdqa      xmm6,       xmm1                    ;
-        mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm5                    ;
-        mm6 = p0..p3 - r-2 p0..p3
-        psubusw     xmm5,       xmm1                    ;
-        mm5 = r-2 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm5                    ;
-        mm6 = abs(r0 p0..p3 - r-2 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ;
-        accumulate thresholds
-
-        movq        xmm4,       QWORD PTR [esi+eax]     ;
-        mm4 = r-1 p0..p7
-        punpcklbw   xmm4,       xmm0                    ;
-        mm4 = r-1 p0..p3
-        paddusw     xmm3,       xmm4                    ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movdqa      xmm6,       xmm1                    ;
-        mm6 = r0 p0..p3
-        psubusw     xmm6,       xmm4                    ;
-        mm6 = p0..p3 - r-2 p0..p3
-        psubusw     xmm4,       xmm1                    ;
-        mm5 = r-1 p0..p3 - p0..p3
-        paddusw     xmm6,       xmm4                    ;
-        mm6 = abs(r0 p0..p3 - r-1 p0..p3)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6                    ;
-        accumulate thresholds
-
-
-        paddusw     xmm3,       rd42                    ;
-        mm3 += round value
-        psraw       xmm3,       3                       ;
-        mm3 /= 8
-
-        pand        xmm1,       xmm7                    ;
-        mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3                    ;
-        mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7                    ;
-        combination
-
-        packuswb    xmm1,       xmm0                    ;
-        pack to bytes
-        movq        QWORD PTR [edi], xmm1             ;
-
-        neg         eax                   ;
-        pitch is positive
-        add         esi,        8
-        add         edi,        8
-
-        add         edx,        8
-        cmp         edx,        cols
-
-        jl          nextcol
-
-        // done with the all cols, start the across filtering in place
-        sub         esi,        edx
-        sub         edi,        edx
-
-        xor         edx,        edx
-        movq        mm0,        QWORD PTR [edi-8];
-
-        acrossnextcol:
-        movq        xmm7,       QWORD PTR [edi +edx -2]
-        movd        xmm4,       DWORD PTR [edi +edx +6]
-
-        pslldq      xmm4,       8
-        por         xmm4,       xmm7
-
-        movdqa      xmm3,       xmm4
-        psrldq      xmm3,       2
-        punpcklbw   xmm3,       xmm0              ;
-        mm3 = p0..p3
-        movdqa      xmm1,       xmm3              ;
-        mm1 = p0..p3
-        psllw       xmm3,       2
-
-
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       3
-        punpcklbw   xmm5,       xmm0              ;
-        mm5 = p1..p4
-        paddusw     xmm3,       xmm5              ;
-        mm3 += mm6
-
-        ;
-        thresholding
-        movdqa      xmm7,       xmm1              ;
-        mm7 = p0..p3
-        psubusw     xmm7,       xmm5              ;
-        mm7 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     xmm7,       xmm5              ;
-        mm7 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm7,       xmm2
-
-        movdqa      xmm5,       xmm4
-        psrldq      xmm5,       4
-        punpcklbw   xmm5,       xmm0              ;
-        mm5 = p2..p5
-        paddusw     xmm3,       xmm5              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movdqa      xmm6,       xmm1              ;
-        mm6 = p0..p3
-        psubusw     xmm6,       xmm5              ;
-        mm6 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     xmm6,       xmm5              ;
-        mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ;
-        accumulate thresholds
-
-
-        movdqa      xmm5,       xmm4              ;
-        mm5 = p-2..p5
-        punpcklbw   xmm5,       xmm0              ;
-        mm5 = p-2..p1
-        paddusw     xmm3,       xmm5              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movdqa      xmm6,       xmm1              ;
-        mm6 = p0..p3
-        psubusw     xmm6,       xmm5              ;
-        mm6 = p0..p3 - p1..p4
-        psubusw     xmm5,       xmm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     xmm6,       xmm5              ;
-        mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ;
-        accumulate thresholds
-
-        psrldq      xmm4,       1                   ;
-        mm4 = p-1..p5
-        punpcklbw   xmm4,       xmm0              ;
-        mm4 = p-1..p2
-        paddusw     xmm3,       xmm4              ;
-        mm3 += mm5
-
-        ;
-        thresholding
-        movdqa      xmm6,       xmm1              ;
-        mm6 = p0..p3
-        psubusw     xmm6,       xmm4              ;
-        mm6 = p0..p3 - p1..p4
-        psubusw     xmm4,       xmm1              ;
-        mm5 = p1..p4 - p0..p3
-        paddusw     xmm6,       xmm4              ;
-        mm6 = abs(p0..p3 - p1..p4)
-        pcmpgtw     xmm6,       xmm2
-        por         xmm7,       xmm6              ;
-        accumulate thresholds
-
-        paddusw     xmm3,       rd42              ;
-        mm3 += round value
-        psraw       xmm3,       3                 ;
-        mm3 /= 8
-
-        pand        xmm1,       xmm7              ;
-        mm1 select vals > thresh from source
-        pandn       xmm7,       xmm3              ;
-        mm7 select vals < thresh from blurred result
-        paddusw     xmm1,       xmm7              ;
-        combination
-
-        packuswb    xmm1,       xmm0              ;
-        pack to bytes
-        movq        QWORD PTR [edi+edx-8],  mm0   ;
-        store previous four bytes
-        movdq2q     mm0,        xmm1
-
-        add         edx,        8
-        cmp         edx,        cols
-        jl          acrossnextcol;
-
-        // last 8 pixels
-        movq        QWORD PTR [edi+edx-8],  mm0
-
-        // done with this rwo
-        add         esi, eax               ;
-        next line
-        mov         eax, dst_pixels_per_line ;
-        destination pitch?
-        add         edi, eax               ;
-        next destination
-        mov         eax, src_pixels_per_line ;
-        destination pitch?
-
-        dec         ecx                   ;
-        decrement count
-        jnz         nextrow               ;
-        next row
-    }
-}
-
-
-void vp8_mbpost_proc_down_mmx(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
-    int c, i;
-    __declspec(align(16))
-    int flimit2[2];
-    __declspec(align(16))
-    unsigned char d[16][8];
-
-    flimit = vp8_q2mbl(flimit);
-
-    for (i = 0; i < 2; i++)
-        flimit2[i] = flimit;
-
-    rows += 8;
-
-    for (c = 0; c < cols; c += 4)
-    {
-        unsigned char *s = &dst[c];
-
-        __asm
-        {
-            mov         esi,        s           ;
-            pxor        mm0,        mm0     ;
-
-            mov         eax,        pitch       ;
-            neg         eax                                     // eax = -pitch
-
-            lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
-            neg         eax
-
-
-            pxor        mm5,        mm5
-            pxor        mm6,        mm6     ;
-
-            pxor        mm7,        mm7     ;
-            mov         edi,        esi
-
-            mov         ecx,        15          ;
-
-            loop_initvar:
-            movd        mm1,        DWORD PTR [edi];
-            punpcklbw   mm1,        mm0     ;
-
-            paddw       mm5,        mm1     ;
-            pmullw      mm1,        mm1     ;
-
-            movq        mm2,        mm1     ;
-            punpcklwd   mm1,        mm0     ;
-
-            punpckhwd   mm2,        mm0     ;
-            paddd       mm6,        mm1     ;
-
-            paddd       mm7,        mm2     ;
-            lea         edi,        [edi+eax]   ;
-
-            dec         ecx
-            jne         loop_initvar
-            //save the var and sum
-            xor         edx,        edx
-            loop_row:
-            movd        mm1,        DWORD PTR [esi]     // [s-pitch*8]
-            movd        mm2,        DWORD PTR [edi]     // [s+pitch*7]
-
-            punpcklbw   mm1,        mm0
-            punpcklbw   mm2,        mm0
-
-            paddw       mm5,        mm2
-            psubw       mm5,        mm1
-
-            pmullw      mm2,        mm2
-            movq        mm4,        mm2
-
-            punpcklwd   mm2,        mm0
-            punpckhwd   mm4,        mm0
-
-            paddd       mm6,        mm2
-            paddd       mm7,        mm4
-
-            pmullw      mm1,        mm1
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm0
-            psubd       mm6,        mm1
-
-            punpckhwd   mm2,        mm0
-            psubd       mm7,        mm2
-
-
-            movq        mm3,        mm6
-            pslld       mm3,        4
-
-            psubd       mm3,        mm6
-            movq        mm1,        mm5
-
-            movq        mm4,        mm5
-            pmullw      mm1,        mm1
-
-            pmulhw      mm4,        mm4
-            movq        mm2,        mm1
-
-            punpcklwd   mm1,        mm4
-            punpckhwd   mm2,        mm4
-
-            movq        mm4,        mm7
-            pslld       mm4,        4
-
-            psubd       mm4,        mm7
-
-            psubd       mm3,        mm1
-            psubd       mm4,        mm2
-
-            psubd       mm3,        flimit2
-            psubd       mm4,        flimit2
-
-            psrad       mm3,        31
-            psrad       mm4,        31
-
-            packssdw    mm3,        mm4
-            packsswb    mm3,        mm0
-
-            movd        mm1,        DWORD PTR [esi+eax*8]
-
-            movq        mm2,        mm1
-            punpcklbw   mm1,        mm0
-
-            paddw       mm1,        mm5
-            mov         ecx,        edx
-
-            and         ecx,        127
-            movq        mm4,        vp8_rv[ecx*2]
-
-            paddw       mm1,        mm4
-            //paddw     xmm1,       eight8s
-            psraw       mm1,        4
-
-            packuswb    mm1,        mm0
-            pand        mm1,        mm3
-
-            pandn       mm3,        mm2
-            por         mm1,        mm3
-
-            and         ecx,        15
-            movd        DWORD PTR  d[ecx*4], mm1
-
-            mov         ecx,        edx
-            sub         ecx,        8
-
-            and         ecx,        15
-            movd        mm1,        DWORD PTR d[ecx*4]
-
-            movd        [esi],      mm1
-            lea         esi,        [esi+eax]
-
-            lea         edi,        [edi+eax]
-            add         edx,        1
-
-            cmp         edx,        rows
-            jl          loop_row
-
-        }
-
-    }
-}
-
-void vp8_mbpost_proc_down_xmm(unsigned char *dst, int pitch, int rows, int cols, int flimit)
-{
-    int c, i;
-    __declspec(align(16))
-    int flimit4[4];
-    __declspec(align(16))
-    unsigned char d[16][8];
-
-    flimit = vp8_q2mbl(flimit);
-
-    for (i = 0; i < 4; i++)
-        flimit4[i] = flimit;
-
-    rows += 8;
-
-    for (c = 0; c < cols; c += 8)
-    {
-        unsigned char *s = &dst[c];
-
-        __asm
-        {
-            mov         esi,        s           ;
-            pxor        xmm0,       xmm0        ;
-
-            mov         eax,        pitch       ;
-            neg         eax                                     // eax = -pitch
-
-            lea         esi,        [esi + eax*8];              // edi = s[-pitch*8]
-            neg         eax
-
-
-            pxor        xmm5,       xmm5
-            pxor        xmm6,       xmm6        ;
-
-            pxor        xmm7,       xmm7        ;
-            mov         edi,        esi
-
-            mov         ecx,        15          ;
-
-            loop_initvar:
-            movq        xmm1,       QWORD PTR [edi];
-            punpcklbw   xmm1,       xmm0        ;
-
-            paddw       xmm5,       xmm1        ;
-            pmullw      xmm1,       xmm1        ;
-
-            movdqa      xmm2,       xmm1        ;
-            punpcklwd   xmm1,       xmm0        ;
-
-            punpckhwd   xmm2,       xmm0        ;
-            paddd       xmm6,       xmm1        ;
-
-            paddd       xmm7,       xmm2        ;
-            lea         edi,        [edi+eax]   ;
-
-            dec         ecx
-            jne         loop_initvar
-            //save the var and sum
-            xor         edx,        edx
-            loop_row:
-            movq        xmm1,       QWORD PTR [esi]     // [s-pitch*8]
-            movq        xmm2,       QWORD PTR [edi]     // [s+pitch*7]
-
-            punpcklbw   xmm1,       xmm0
-            punpcklbw   xmm2,       xmm0
-
-            paddw       xmm5,       xmm2
-            psubw       xmm5,       xmm1
-
-            pmullw      xmm2,       xmm2
-            movdqa      xmm4,       xmm2
-
-            punpcklwd   xmm2,       xmm0
-            punpckhwd   xmm4,       xmm0
-
-            paddd       xmm6,       xmm2
-            paddd       xmm7,       xmm4
-
-            pmullw      xmm1,       xmm1
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm0
-            psubd       xmm6,       xmm1
-
-            punpckhwd   xmm2,       xmm0
-            psubd       xmm7,       xmm2
-
-
-            movdqa      xmm3,       xmm6
-            pslld       xmm3,       4
-
-            psubd       xmm3,       xmm6
-            movdqa      xmm1,       xmm5
-
-            movdqa      xmm4,       xmm5
-            pmullw      xmm1,       xmm1
-
-            pmulhw      xmm4,       xmm4
-            movdqa      xmm2,       xmm1
-
-            punpcklwd   xmm1,       xmm4
-            punpckhwd   xmm2,       xmm4
-
-            movdqa      xmm4,       xmm7
-            pslld       xmm4,       4
-
-            psubd       xmm4,       xmm7
-
-            psubd       xmm3,       xmm1
-            psubd       xmm4,       xmm2
-
-            psubd       xmm3,       flimit4
-            psubd       xmm4,       flimit4
-
-            psrad       xmm3,       31
-            psrad       xmm4,       31
-
-            packssdw    xmm3,       xmm4
-            packsswb    xmm3,       xmm0
-
-            movq        xmm1,       QWORD PTR [esi+eax*8]
-
-            movq        xmm2,       xmm1
-            punpcklbw   xmm1,       xmm0
-
-            paddw       xmm1,       xmm5
-            mov         ecx,        edx
-
-            and         ecx,        127
-            movdqu      xmm4,       vp8_rv[ecx*2]
-
-            paddw       xmm1,       xmm4
-            //paddw     xmm1,       eight8s
-            psraw       xmm1,       4
-
-            packuswb    xmm1,       xmm0
-            pand        xmm1,       xmm3
-
-            pandn       xmm3,       xmm2
-            por         xmm1,       xmm3
-
-            and         ecx,        15
-            movq        QWORD PTR  d[ecx*8], xmm1
-
-            mov         ecx,        edx
-            sub         ecx,        8
-
-            and         ecx,        15
-            movq        mm0,        d[ecx*8]
-
-            movq        [esi],      mm0
-            lea         esi,        [esi+eax]
-
-            lea         edi,        [edi+eax]
-            add         edx,        1
-
-            cmp         edx,        rows
-            jl          loop_row
-
-        }
-
-    }
-}
-#if 0
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_wmt
- *
- *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
- *                                  noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
-    unsigned int i;
-
-    __declspec(align(16)) unsigned char blackclamp[16];
-    __declspec(align(16)) unsigned char whiteclamp[16];
-    __declspec(align(16)) unsigned char bothclamp[16];
-    char char_dist[300];
-    char Rand[2048];
-    double sigma;
-//    return;
-    __asm emms
-    sigma = a + .5 + .6 * (63 - q) / 63.0;
-
-    // set up a lookup table of 256 entries that matches
-    // a gaussian distribution with sigma determined by q.
-    //
-    {
-        double i;
-        int next, j;
-
-        next = 0;
-
-        for (i = -32; i < 32; i++)
-        {
-            double g = 256 * vp8_gaussian(sigma, 0, 1.0 * i);
-            int a = (int)(g + .5);
-
-            if (a)
-            {
-                for (j = 0; j < a; j++)
-                {
-                    char_dist[next+j] = (char) i;
-                }
-
-                next = next + j;
-            }
-
-        }
-
-        for (next = next; next < 256; next++)
-            char_dist[next] = 0;
-
-    }
-
-    for (i = 0; i < 2048; i++)
-    {
-        Rand[i] = char_dist[rand() & 0xff];
-    }
-
-    for (i = 0; i < 16; i++)
-    {
-        blackclamp[i] = -char_dist[0];
-        whiteclamp[i] = -char_dist[0];
-        bothclamp[i] = -2 * char_dist[0];
-    }
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = Rand + (rand() & 0xff);
-
-        __asm
-        {
-            mov ecx, [Width]
-            mov esi, Pos
-            mov edi, Ref
-            xor         eax, eax
-
-            nextset:
-            movdqu      xmm1, [esi+eax]        // get the source
-
-            psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, bothclamp
-            psubusb     xmm1, whiteclamp
-
-            movdqu      xmm2, [edi+eax]        // get the noise for this line
-            paddb       xmm1, xmm2             // add it in
-            movdqu      [esi+eax], xmm1        // store the result
-
-            add         eax, 16                // move to the next line
-
-            cmp         eax, ecx
-            jl          nextset
-
-
-        }
-
-    }
-}
-#endif
-__declspec(align(16))
-static const int four8s[4] = { 8, 8, 8, 8};
-void vp8_mbpost_proc_across_ip_xmm(unsigned char *src, int pitch, int rows, int cols, int flimit)
-{
-    int r, i;
-    __declspec(align(16))
-    int flimit4[4];
-    unsigned char *s = src;
-    int sumsq;
-    int sum;
-
-
-    flimit = vp8_q2mbl(flimit);
-    flimit4[0] =
-        flimit4[1] =
-            flimit4[2] =
-                flimit4[3] = flimit;
-
-    for (r = 0; r < rows; r++)
-    {
-
-
-        sumsq = 0;
-        sum = 0;
-
-        for (i = -8; i <= 6; i++)
-        {
-            sumsq += s[i] * s[i];
-            sum   += s[i];
-        }
-
-        __asm
-        {
-            mov         eax,    sumsq
-            movd        xmm7,   eax
-
-            mov         eax,    sum
-            movd        xmm6,   eax
-
-            mov         esi,    s
-            xor         ecx,    ecx
-
-            mov         edx,    cols
-            add         edx,    8
-            pxor        mm0,    mm0
-            pxor        mm1,    mm1
-
-            pxor        xmm0,   xmm0
-            nextcol4:
-
-            movd        xmm1,   DWORD PTR [esi+ecx-8]   // -8 -7 -6 -5
-            movd        xmm2,   DWORD PTR [esi+ecx+7]   // +7 +8 +9 +10
-
-            punpcklbw   xmm1,   xmm0                    // expanding
-            punpcklbw   xmm2,   xmm0                    // expanding
-
-            punpcklwd   xmm1,   xmm0                    // expanding to dwords
-            punpcklwd   xmm2,   xmm0                    // expanding to dwords
-
-            psubd       xmm2,   xmm1                    // 7--8   8--7   9--6 10--5
-            paddd       xmm1,   xmm1                    // -8*2   -7*2   -6*2 -5*2
-
-            paddd       xmm1,   xmm2                    // 7+-8   8+-7   9+-6 10+-5
-            pmaddwd     xmm1,   xmm2                    // squared of 7+-8   8+-7   9+-6 10+-5
-
-            paddd       xmm6,   xmm2
-            paddd       xmm7,   xmm1
-
-            pshufd      xmm6,   xmm6,   0               // duplicate the last ones
-            pshufd      xmm7,   xmm7,   0               // duplicate the last ones
-
-            psrldq      xmm1,       4                   // 8--7   9--6 10--5  0000
-            psrldq      xmm2,       4                   // 8--7   9--6 10--5  0000
-
-            pshufd      xmm3,   xmm1,   3               // 0000  8--7   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   3               // 0000  8--7   8--7   8--7 squared
-
-            paddd       xmm6,   xmm4
-            paddd       xmm7,   xmm3
-
-            pshufd      xmm3,   xmm1,   01011111b       // 0000  0000   9--6   9--6 squared
-            pshufd      xmm4,   xmm2,   01011111b       // 0000  0000   9--6   9--6 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            pshufd      xmm3,   xmm1,   10111111b       // 0000  0000   8--7   8--7 squared
-            pshufd      xmm4,   xmm2,   10111111b       // 0000  0000   8--7   8--7 squared
-
-            paddd       xmm7,   xmm3
-            paddd       xmm6,   xmm4
-
-            movdqa      xmm3,   xmm6
-            pmaddwd     xmm3,   xmm3
-
-            movdqa      xmm5,   xmm7
-            pslld       xmm5,   4
-
-            psubd       xmm5,   xmm7
-            psubd       xmm5,   xmm3
-
-            psubd       xmm5,   flimit4
-            psrad       xmm5,   31
-
-            packssdw    xmm5,   xmm0
-            packsswb    xmm5,   xmm0
-
-            movd        xmm1,   DWORD PTR [esi+ecx]
-            movq        xmm2,   xmm1
-
-            punpcklbw   xmm1,   xmm0
-            punpcklwd   xmm1,   xmm0
-
-            paddd       xmm1,   xmm6
-            paddd       xmm1,   four8s
-
-            psrad       xmm1,   4
-            packssdw    xmm1,   xmm0
-
-            packuswb    xmm1,   xmm0
-            pand        xmm1,   xmm5
-
-            pandn       xmm5,   xmm2
-            por         xmm5,   xmm1
-
-            movd        [esi+ecx-8],  mm0
-            movq        mm0,    mm1
-
-            movdq2q     mm1,    xmm5
-            psrldq      xmm7,   12
-
-            psrldq      xmm6,   12
-            add         ecx,    4
-
-            cmp         ecx,    edx
-            jl          nextcol4
-
-        }
-        s += pitch;
-    }
-}
-
-#if 0
-
-/****************************************************************************
- *
- *  ROUTINE       : plane_add_noise_mmx
- *
- *  INPUTS        : unsigned char *Start    starting address of buffer to add gaussian
- *                                  noise to
- *                  unsigned int Width    width of plane
- *                  unsigned int Height   height of plane
- *                  int  Pitch    distance between subsequent lines of frame
- *                  int  q        quantizer used to determine amount of noise
- *                                  to add
- *
- *  OUTPUTS       : None.
- *
- *  RETURNS       : void.
- *
- *  FUNCTION      : adds gaussian noise to a plane of pixels
- *
- *  SPECIAL NOTES : None.
- *
- ****************************************************************************/
-void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
-    unsigned int i;
-    int Pitch4 = Pitch * 4;
-    const int noise_amount = 2;
-    const int noise_adder = 2 * noise_amount + 1;
-
-    __declspec(align(16)) unsigned char blackclamp[16];
-    __declspec(align(16)) unsigned char whiteclamp[16];
-    __declspec(align(16)) unsigned char bothclamp[16];
-
-    char char_dist[300];
-    char Rand[2048];
-
-    double sigma;
-    __asm emms
-    sigma = a + .5 + .6 * (63 - q) / 63.0;
-
-    // set up a lookup table of 256 entries that matches
-    // a gaussian distribution with sigma determined by q.
-    //
-    {
-        double i, sum = 0;
-        int next, j;
-
-        next = 0;
-
-        for (i = -32; i < 32; i++)
-        {
-            int a = (int)(.5 + 256 * vp8_gaussian(sigma, 0, i));
-
-            if (a)
-            {
-                for (j = 0; j < a; j++)
-                {
-                    char_dist[next+j] = (char) i;
-                }
-
-                next = next + j;
-            }
-
-        }
-
-        for (next = next; next < 256; next++)
-            char_dist[next] = 0;
-
-    }
-
-    for (i = 0; i < 2048; i++)
-    {
-        Rand[i] = char_dist[rand() & 0xff];
-    }
-
-    for (i = 0; i < 16; i++)
-    {
-        blackclamp[i] = -char_dist[0];
-        whiteclamp[i] = -char_dist[0];
-        bothclamp[i] = -2 * char_dist[0];
-    }
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = Rand + (rand() & 0xff);
-
-        __asm
-        {
-            mov ecx, [Width]
-            mov esi, Pos
-            mov edi, Ref
-            xor         eax, eax
-
-            nextset:
-            movq        mm1, [esi+eax]        // get the source
-
-            psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
-            paddusb     mm1, bothclamp
-            psubusb     mm1, whiteclamp
-
-            movq        mm2, [edi+eax]        // get the noise for this line
-            paddb       mm1, mm2             // add it in
-            movq        [esi+eax], mm1        // store the result
-
-            add         eax, 8                // move to the next line
-
-            cmp         eax, ecx
-            jl          nextset
-
-
-        }
-
-    }
-}
-#else
-extern char an[8][64][3072];
-extern int cd[8][64];
-
-void vp8_plane_add_noise_mmx(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
-    unsigned int i;
-    __declspec(align(16)) unsigned char blackclamp[16];
-    __declspec(align(16)) unsigned char whiteclamp[16];
-    __declspec(align(16)) unsigned char bothclamp[16];
-
-
-    __asm emms
-
-    for (i = 0; i < 16; i++)
-    {
-        blackclamp[i] = -cd[a][q];
-        whiteclamp[i] = -cd[a][q];
-        bothclamp[i] = -2 * cd[a][q];
-    }
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char  *Ref = an[a][q] + (rand() & 0xff);
-
-        __asm
-        {
-            mov ecx, [Width]
-            mov esi, Pos
-            mov edi, Ref
-            xor         eax, eax
-
-            nextset:
-            movq        mm1, [esi+eax]        // get the source
-
-            psubusb     mm1, blackclamp       // clamp both sides so we don't outrange adding noise
-            paddusb     mm1, bothclamp
-            psubusb     mm1, whiteclamp
-
-            movq        mm2, [edi+eax]        // get the noise for this line
-            paddb       mm1, mm2             // add it in
-            movq        [esi+eax], mm1        // store the result
-
-            add         eax, 8                // move to the next line
-
-            cmp         eax, ecx
-            jl          nextset
-        }
-    }
-}
-
-
-void vp8_plane_add_noise_wmt(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a)
-{
-    unsigned int i;
-
-    __declspec(align(16)) unsigned char blackclamp[16];
-    __declspec(align(16)) unsigned char whiteclamp[16];
-    __declspec(align(16)) unsigned char bothclamp[16];
-
-    __asm emms
-
-    for (i = 0; i < 16; i++)
-    {
-        blackclamp[i] = -cd[a][q];
-        whiteclamp[i] = -cd[a][q];
-        bothclamp[i] = -2 * cd[a][q];
-    }
-
-    for (i = 0; i < Height; i++)
-    {
-        unsigned char *Pos = Start + i * Pitch;
-        char *Ref = an[a][q] + (rand() & 0xff);
-
-        __asm
-        {
-            mov ecx,    [Width]
-            mov esi,    Pos
-            mov edi,    Ref
-            xor         eax, eax
-
-            nextset:
-            movdqu      xmm1, [esi+eax]        // get the source
-
-            psubusb     xmm1, blackclamp       // clamp both sides so we don't outrange adding noise
-            paddusb     xmm1, bothclamp
-            psubusb     xmm1, whiteclamp
-
-            movdqu      xmm2, [edi+eax]        // get the noise for this line
-            paddb       xmm1, xmm2             // add it in
-            movdqu      [esi+eax], xmm1        // store the result
-
-            add         eax, 16                // move to the next line
-
-            cmp         eax, ecx
-            jl          nextset
-        }
-    }
-}
-
-#endif
diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm
index 06d51ec..1f219ca 100644
--- a/vp8/common/x86/postproc_sse2.asm
+++ b/vp8/common/x86/postproc_sse2.asm
@@ -57,10 +57,10 @@
         movsxd      rax,        DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch?
         pxor        xmm0,       xmm0              ; mm0 = 00000000
 
-nextrow:
+.nextrow:
 
         xor         rdx,        rdx       ; clear out rdx for use as loop counter
-nextcol:
+.nextcol:
         movq        xmm3,       QWORD PTR [rsi]         ; mm4 = r0 p0..p7
         punpcklbw   xmm3,       xmm0                    ; mm3 = p0..p3
         movdqa      xmm1,       xmm3                    ; mm1 = p0..p3
@@ -133,7 +133,7 @@
         add         rdx,        8
         cmp         edx,        dword arg(5) ;cols
 
-        jl          nextcol
+        jl          .nextcol
 
         ; done with the all cols, start the across filtering in place
         sub         rsi,        rdx
@@ -142,7 +142,7 @@
         xor         rdx,        rdx
         movq        mm0,        QWORD PTR [rdi-8];
 
-acrossnextcol:
+.acrossnextcol:
         movq        xmm7,       QWORD PTR [rdi +rdx -2]
         movd        xmm4,       DWORD PTR [rdi +rdx +6]
 
@@ -219,7 +219,7 @@
 
         add         rdx,        8
         cmp         edx,        dword arg(5) ;cols
-        jl          acrossnextcol;
+        jl          .acrossnextcol;
 
         ; last 8 pixels
         movq        QWORD PTR [rdi+rdx-8],  mm0
@@ -231,7 +231,7 @@
         mov         eax, dword arg(2) ;src_pixels_per_line ; destination pitch?
 
         dec         rcx                   ; decrement count
-        jnz         nextrow               ; next row
+        jnz         .nextrow              ; next row
 
 %if ABI_IS_32BIT=1 && CONFIG_PIC=1
     add rsp,16
@@ -282,7 +282,7 @@
     add         dword arg(2), 8
 
     ;for(c=0; c<cols; c+=8)
-loop_col:
+.loop_col:
             mov         rsi,        arg(0) ; s
             pxor        xmm0,       xmm0        ;
 
@@ -301,7 +301,7 @@
 
             mov         rcx,        15          ;
 
-loop_initvar:
+.loop_initvar:
             movq        xmm1,       QWORD PTR [rdi];
             punpcklbw   xmm1,       xmm0        ;
 
@@ -318,10 +318,10 @@
             lea         rdi,        [rdi+rax]   ;
 
             dec         rcx
-            jne         loop_initvar
+            jne         .loop_initvar
             ;save the var and sum
             xor         rdx,        rdx
-loop_row:
+.loop_row:
             movq        xmm1,       QWORD PTR [rsi]     ; [s-pitch*8]
             movq        xmm2,       QWORD PTR [rdi]     ; [s+pitch*7]
 
@@ -428,12 +428,12 @@
             add         rdx,        1
 
             cmp         edx,        dword arg(2) ;rows
-            jl          loop_row
+            jl          .loop_row
 
         add         dword arg(0), 8 ; s += 8
         sub         dword arg(3), 8 ; cols -= 8
         cmp         dword arg(3), 0
-        jg          loop_col
+        jg          .loop_col
 
     add         rsp, 128+16
     pop         rsp
@@ -475,13 +475,13 @@
 
 
     ;for(r=0;r<rows;r++)
-ip_row_loop:
+.ip_row_loop:
 
         xor         rdx,    rdx ;sumsq=0;
         xor         rcx,    rcx ;sum=0;
         mov         rsi,    arg(0); s
         mov         rdi,    -8
-ip_var_loop:
+.ip_var_loop:
         ;for(i=-8;i<=6;i++)
         ;{
         ;    sumsq += s[i]*s[i];
@@ -493,7 +493,7 @@
         add         edx, eax
         add         rdi, 1
         cmp         rdi, 6
-        jle         ip_var_loop
+        jle         .ip_var_loop
 
 
             ;mov         rax,    sumsq
@@ -513,7 +513,7 @@
             pxor        mm1,    mm1
 
             pxor        xmm0,   xmm0
-nextcol4:
+.nextcol4:
 
             movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
             movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10
@@ -600,7 +600,7 @@
             add         rcx,    4
 
             cmp         rcx,    rdx
-            jl          nextcol4
+            jl          .nextcol4
 
         ;s+=pitch;
         movsxd rax, dword arg(1)
@@ -608,7 +608,7 @@
 
         sub dword arg(2), 1 ;rows-=1
         cmp dword arg(2), 0
-        jg ip_row_loop
+        jg .ip_row_loop
 
     add         rsp, 16
     pop         rsp
@@ -640,7 +640,7 @@
     push        rdi
     ; end prolog
 
-addnoise_loop:
+.addnoise_loop:
     call sym(rand) WRT_PLT
     mov     rcx, arg(1) ;noise
     and     rax, 0xff
@@ -657,7 +657,7 @@
             mov     rsi, arg(0) ;Pos
             xor         rax,rax
 
-addnoise_nextset:
+.addnoise_nextset:
             movdqu      xmm1,[rsi+rax]         ; get the source
 
             psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
@@ -671,12 +671,12 @@
             add         rax,16                 ; move to the next line
 
             cmp         rax, rcx
-            jl          addnoise_nextset
+            jl          .addnoise_nextset
 
     movsxd  rax, dword arg(7) ; Pitch
     add     arg(0), rax ; Start += Pitch
     sub     dword arg(6), 1   ; Height -= 1
-    jg      addnoise_loop
+    jg      .addnoise_loop
 
     ; begin epilog
     pop rdi
diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm
index 0e23116..f54cc4e 100644
--- a/vp8/common/x86/recon_sse2.asm
+++ b/vp8/common/x86/recon_sse2.asm
@@ -503,7 +503,7 @@
     mov         rdi,        arg(0) ;dst;
     movsxd      rcx,        dword ptr arg(1) ;dst_stride
 
-vp8_intra_pred_uv_tm_%1_loop:
+.vp8_intra_pred_uv_tm_%1_loop:
     movd        xmm3,       [rsi]
     movd        xmm5,       [rsi+rax]
 %ifidn %1, sse2
@@ -525,7 +525,7 @@
     lea         rsi,        [rsi+rax*2]
     lea         rdi,        [rdi+rcx*2]
     dec         edx
-    jnz vp8_intra_pred_uv_tm_%1_loop
+    jnz .vp8_intra_pred_uv_tm_%1_loop
 
     ; begin epilog
     pop         rdi
@@ -615,7 +615,7 @@
 %endif
     dec         rsi
 %ifidn %1, mmx2
-vp8_intra_pred_uv_ho_%1_loop:
+.vp8_intra_pred_uv_ho_%1_loop:
     movd        mm0,        [rsi]
     movd        mm1,        [rsi+rax]
     punpcklbw   mm0,        mm0
@@ -627,7 +627,7 @@
     lea         rsi,        [rsi+rax*2]
     lea         rdi,        [rdi+rcx*2]
     dec         edx
-    jnz vp8_intra_pred_uv_ho_%1_loop
+    jnz .vp8_intra_pred_uv_ho_%1_loop
 %else
     movd        xmm0,       [rsi]
     movd        xmm3,       [rsi+rax]
diff --git a/vp8/common/x86/subpixel_mmx.asm b/vp8/common/x86/subpixel_mmx.asm
index 9004b52..e68d950 100644
--- a/vp8/common/x86/subpixel_mmx.asm
+++ b/vp8/common/x86/subpixel_mmx.asm
@@ -50,7 +50,7 @@
         movsxd      rax,    dword ptr arg(5) ;output_width      ; destination pitch?
         pxor        mm0,    mm0              ; mm0 = 00000000
 
-nextrow:
+.nextrow:
         movq        mm3,    [rsi-2]          ; mm3 = p-2..p5
         movq        mm4,    mm3              ; mm4 = p-2..p5
         psrlq       mm3,    8                ; mm3 = p-1..p5
@@ -102,7 +102,7 @@
 %endif
 
         dec         rcx                      ; decrement count
-        jnz         nextrow                  ; next row
+        jnz         .nextrow                 ; next row
 
     ; begin epilog
     pop rdi
@@ -152,7 +152,7 @@
         pxor        mm0, mm0              ; mm0 = 00000000
 
 
-nextrow_cv:
+.nextrow_cv:
         movq        mm3, [rsi+rdx]        ; mm3 = p0..p8  = row -1
         pmullw      mm3, mm1              ; mm3 *= kernel 1 modifiers.
 
@@ -190,7 +190,7 @@
         ; avoidable!!!.
         lea         rdi,  [rdi+rax] ;
         dec         rcx                   ; decrement count
-        jnz         nextrow_cv             ; next row
+        jnz         .nextrow_cv           ; next row
 
         pop         rbx
 
@@ -282,7 +282,7 @@
         packuswb    mm7,        mm4                 ;
 
         add         rsi,        rdx                 ; next line
-next_row_8x8:
+.next_row_8x8:
         movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movq        mm4,        mm3                 ; make a copy of current line
 
@@ -349,7 +349,7 @@
         add         rdi,        r8                  ;dst_pitch
 %endif
         cmp         rdi,        rcx                 ;
-        jne         next_row_8x8
+        jne         .next_row_8x8
 
     ; begin epilog
     pop rdi
@@ -437,7 +437,7 @@
         packuswb    mm7,        mm4                 ;
 
         add         rsi,        rdx                 ; next line
-next_row_8x4:
+.next_row_8x4:
         movq        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movq        mm4,        mm3                 ; make a copy of current line
 
@@ -504,7 +504,7 @@
         add         rdi,        r8
 %endif
         cmp         rdi,        rcx                 ;
-        jne         next_row_8x4
+        jne         .next_row_8x4
 
     ; begin epilog
     pop rdi
@@ -579,7 +579,7 @@
         packuswb    mm7,        mm0                 ;
 
         add         rsi,        rdx                 ; next line
-next_row_4x4:
+.next_row_4x4:
         movd        mm3,        [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
 
@@ -622,7 +622,7 @@
 %endif
 
         cmp         rdi,        rcx                 ;
-        jne         next_row_4x4
+        jne         .next_row_4x4
 
     ; begin epilog
     pop rdi
diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm
index 83e3b14..b62b5c6 100644
--- a/vp8/common/x86/subpixel_sse2.asm
+++ b/vp8/common/x86/subpixel_sse2.asm
@@ -55,7 +55,7 @@
 %endif
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d8_h6_rowloop:
+.filter_block1d8_h6_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -124,7 +124,7 @@
 %endif
         dec         rcx
 
-        jnz         filter_block1d8_h6_rowloop                ; next row
+        jnz         .filter_block1d8_h6_rowloop                ; next row
 
     ; begin epilog
     pop rdi
@@ -176,7 +176,7 @@
 
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d16_h6_sse2_rowloop:
+.filter_block1d16_h6_sse2_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -301,7 +301,7 @@
 %endif
 
         dec         rcx
-        jnz         filter_block1d16_h6_sse2_rowloop                ; next row
+        jnz         .filter_block1d16_h6_sse2_rowloop                ; next row
 
     ; begin epilog
     pop rdi
@@ -356,7 +356,7 @@
         movsxd      r8,         dword ptr arg(2) ; dst_ptich
 %endif
 
-vp8_filter_block1d8_v6_sse2_loop:
+.vp8_filter_block1d8_v6_sse2_loop:
         movdqa      xmm1,       XMMWORD PTR [rsi]
         pmullw      xmm1,       [rax]
 
@@ -396,7 +396,7 @@
         add         rdi,        r8
 %endif
         dec         rcx         ; decrement count
-        jnz         vp8_filter_block1d8_v6_sse2_loop               ; next row
+        jnz         .vp8_filter_block1d8_v6_sse2_loop               ; next row
 
     ; begin epilog
     pop rdi
@@ -448,7 +448,7 @@
         movsxd      r8,         dword ptr arg(2) ; dst_ptich
 %endif
 
-vp8_filter_block1d16_v6_sse2_loop:
+.vp8_filter_block1d16_v6_sse2_loop:
 ; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
         movdqa      xmm1,       XMMWORD PTR [rsi + rdx]       ; line 2
         movdqa      xmm2,       XMMWORD PTR [rsi + rdx + 16]
@@ -511,7 +511,7 @@
         add         rdi,        r8
 %endif
         dec         rcx         ; decrement count
-        jnz         vp8_filter_block1d16_v6_sse2_loop               ; next row
+        jnz         .vp8_filter_block1d16_v6_sse2_loop              ; next row
 
     ; begin epilog
     pop rdi
@@ -556,7 +556,7 @@
 %endif
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d8_h6_only_rowloop:
+.filter_block1d8_h6_only_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -624,7 +624,7 @@
 %endif
         dec         rcx
 
-        jnz         filter_block1d8_h6_only_rowloop                ; next row
+        jnz         .filter_block1d8_h6_only_rowloop               ; next row
 
     ; begin epilog
     pop rdi
@@ -670,7 +670,7 @@
 
         pxor        xmm0,       xmm0                        ; clear xmm0 for unpack
 
-filter_block1d16_h6_only_sse2_rowloop:
+.filter_block1d16_h6_only_sse2_rowloop:
         movq        xmm3,       MMWORD PTR [rsi - 2]
         movq        xmm1,       MMWORD PTR [rsi + 6]
 
@@ -789,7 +789,7 @@
 %endif
 
         dec         rcx
-        jnz         filter_block1d16_h6_only_sse2_rowloop                ; next row
+        jnz         .filter_block1d16_h6_only_sse2_rowloop               ; next row
 
     ; begin epilog
     pop rdi
@@ -837,7 +837,7 @@
         movsxd      r8,         dword ptr arg(3) ; dst_ptich
 %endif
 
-vp8_filter_block1d8_v6_only_sse2_loop:
+.vp8_filter_block1d8_v6_only_sse2_loop:
         movq        xmm1,       MMWORD PTR [rsi]
         movq        xmm2,       MMWORD PTR [rsi + rdx]
         movq        xmm3,       MMWORD PTR [rsi + rdx * 2]
@@ -883,7 +883,7 @@
         add         rdi,        r8
 %endif
         dec         rcx         ; decrement count
-        jnz         vp8_filter_block1d8_v6_only_sse2_loop               ; next row
+        jnz         .vp8_filter_block1d8_v6_only_sse2_loop              ; next row
 
     ; begin epilog
     pop rdi
@@ -924,7 +924,7 @@
         movsxd      r8,         dword ptr arg(4) ;output_width            ; Pitch for Source
 %endif
 
-unpack_block1d16_h6_sse2_rowloop:
+.unpack_block1d16_h6_sse2_rowloop:
         movq        xmm1,       MMWORD PTR [rsi]            ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
         movq        xmm3,       MMWORD PTR [rsi+8]          ; make copy of xmm1
 
@@ -941,7 +941,7 @@
         add         rdi,        r8
 %endif
         dec         rcx
-        jnz         unpack_block1d16_h6_sse2_rowloop                ; next row
+        jnz         .unpack_block1d16_h6_sse2_rowloop               ; next row
 
     ; begin epilog
     pop rdi
@@ -980,7 +980,7 @@
         movsxd      rax,        dword ptr arg(2) ;xoffset
 
         cmp         rax,        0      ;skip first_pass filter if xoffset=0
-        je          b16x16_sp_only
+        je          .b16x16_sp_only
 
         shl         rax,        5
         add         rax,        rcx    ;HFilter
@@ -995,7 +995,7 @@
         movsxd      rax,        dword ptr arg(3) ;yoffset
 
         cmp         rax,        0      ;skip second_pass filter if yoffset=0
-        je          b16x16_fp_only
+        je          .b16x16_fp_only
 
         shl         rax,        5
         add         rax,        rcx    ;VFilter
@@ -1041,7 +1041,7 @@
         packuswb    xmm7,       xmm4
 
         add         rsi,        rdx                 ; next line
-next_row:
+.next_row:
         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movdqa      xmm4,       xmm3                 ; make a copy of current line
 
@@ -1104,11 +1104,11 @@
 %endif
 
         cmp         rdi,        rcx
-        jne         next_row
+        jne         .next_row
 
-        jmp         done
+        jmp         .done
 
-b16x16_sp_only:
+.b16x16_sp_only:
         movsxd      rax,        dword ptr arg(3) ;yoffset
         shl         rax,        5
         add         rax,        rcx    ;VFilter
@@ -1130,7 +1130,7 @@
         movdqu      xmm7,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
 
         add         rsi,        rax                 ; next line
-next_row_spo:
+.next_row_spo:
         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
 
         movdqa      xmm5,       xmm7
@@ -1164,17 +1164,17 @@
         add         rsi,        rax                 ; next line
         add         rdi,        rdx                 ;dst_pitch
         cmp         rdi,        rcx
-        jne         next_row_spo
+        jne         .next_row_spo
 
-        jmp         done
+        jmp         .done
 
-b16x16_fp_only:
+.b16x16_fp_only:
         lea         rcx,        [rdi+rdx*8]
         lea         rcx,        [rcx+rdx*8]
         movsxd      rax,        dword ptr arg(1) ;src_pixels_per_line
         pxor        xmm0,       xmm0
 
-next_row_fpo:
+.next_row_fpo:
         movdqu      xmm3,       [rsi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
         movdqa      xmm4,       xmm3                 ; make a copy of current line
 
@@ -1208,9 +1208,9 @@
         add         rsi,        rax                 ; next line
         add         rdi,        rdx                 ; dst_pitch
         cmp         rdi,        rcx
-        jne         next_row_fpo
+        jne         .next_row_fpo
 
-done:
+.done:
     ; begin epilog
     pop rdi
     pop rsi
@@ -1318,7 +1318,7 @@
 
         movdqa      xmm7,       xmm3
         add         rsp,        16                 ; next line
-next_row8x8:
+.next_row8x8:
         movdqa      xmm3,       XMMWORD PTR [rsp]               ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
         movdqa      xmm4,       xmm3                 ; make a copy of current line
         psrldq      xmm4,       1
@@ -1352,7 +1352,7 @@
         add         rdi,        rdx
 
         cmp         rdi,        rcx
-        jne         next_row8x8
+        jne         .next_row8x8
 
     ;add rsp, 144
     pop rsp
diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm
index 1ddbc54..6bca82b 100644
--- a/vp8/common/x86/subpixel_ssse3.asm
+++ b/vp8/common/x86/subpixel_ssse3.asm
@@ -70,7 +70,7 @@
 
     sub         rdi, rdx
 ;xmm3 free
-filter_block1d8_h6_rowloop_ssse3:
+.filter_block1d8_h6_rowloop_ssse3:
     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
     movq        xmm2,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
@@ -102,7 +102,7 @@
     packuswb    xmm0,   xmm0
 
     movq        MMWORD Ptr [rdi], xmm0
-    jnz         filter_block1d8_h6_rowloop_ssse3
+    jnz         .filter_block1d8_h6_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -129,7 +129,7 @@
 
     sub         rdi, rdx
 
-filter_block1d8_h4_rowloop_ssse3:
+.filter_block1d8_h4_rowloop_ssse3:
     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
     movq        xmm1,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
@@ -158,7 +158,7 @@
 
     movq        MMWORD Ptr [rdi], xmm0
 
-    jnz         filter_block1d8_h4_rowloop_ssse3
+    jnz         .filter_block1d8_h4_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -207,7 +207,7 @@
     movsxd      rcx, dword ptr arg(4)           ;output_height
     movsxd      rdx, dword ptr arg(3)           ;output_pitch
 
-filter_block1d16_h6_rowloop_ssse3:
+.filter_block1d16_h6_rowloop_ssse3:
     movq        xmm0,   MMWORD PTR [rsi - 2]    ; -2 -1  0  1  2  3  4  5
 
     movq        xmm3,   MMWORD PTR [rsi + 3]    ;  3  4  5  6  7  8  9 10
@@ -264,7 +264,7 @@
 
     lea         rdi,    [rdi + rdx]
     dec         rcx
-    jnz         filter_block1d16_h6_rowloop_ssse3
+    jnz         .filter_block1d16_h6_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -304,7 +304,7 @@
     movdqa      xmm7, [GLOBAL(rd)]
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d4_h4_ssse3
+    je          .vp8_filter_block1d4_h4_ssse3
 
     movdqa      xmm4, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
@@ -318,7 +318,7 @@
     movsxd      rdx, dword ptr arg(3)   ;output_pitch
 
 ;xmm3 free
-filter_block1d4_h6_rowloop_ssse3:
+.filter_block1d4_h6_rowloop_ssse3:
     movdqu      xmm0,   XMMWORD PTR [rsi - 2]
 
     movdqa      xmm1, xmm0
@@ -346,7 +346,7 @@
 
     add         rdi, rdx
     dec         rcx
-    jnz         filter_block1d4_h6_rowloop_ssse3
+    jnz         .filter_block1d4_h6_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -356,7 +356,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d4_h4_ssse3:
+.vp8_filter_block1d4_h4_ssse3:
     movdqa      xmm5, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm6, XMMWORD PTR [rax+128]     ;k1_k3
     movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
@@ -369,7 +369,7 @@
 
     movsxd      rdx, dword ptr arg(3)   ;output_pitch
 
-filter_block1d4_h4_rowloop_ssse3:
+.filter_block1d4_h4_rowloop_ssse3:
     movdqu      xmm1,   XMMWORD PTR [rsi - 2]
 
     movdqa      xmm2, xmm1
@@ -391,7 +391,7 @@
 
     add         rdi, rdx
     dec         rcx
-    jnz         filter_block1d4_h4_rowloop_ssse3
+    jnz         .filter_block1d4_h4_rowloop_ssse3
 
     ; begin epilog
     pop rdi
@@ -432,7 +432,7 @@
     add         rax, rdx
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d16_v4_ssse3
+    je          .vp8_filter_block1d16_v4_ssse3
 
     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
@@ -450,7 +450,7 @@
     add         rax, rdx
 
 
-vp8_filter_block1d16_v6_ssse3_loop:
+.vp8_filter_block1d16_v6_ssse3_loop:
     movq        xmm1, MMWORD PTR [rsi]                  ;A
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
@@ -508,7 +508,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d16_v6_ssse3_loop
+    jnz         .vp8_filter_block1d16_v6_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -519,7 +519,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d16_v4_ssse3:
+.vp8_filter_block1d16_v4_ssse3:
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
 
@@ -534,7 +534,7 @@
     movsxd      rcx, DWORD PTR arg(4)   ;output_height
     add         rax, rdx
 
-vp8_filter_block1d16_v4_ssse3_loop:
+.vp8_filter_block1d16_v4_ssse3_loop:
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
@@ -581,7 +581,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d16_v4_ssse3_loop
+    jnz         .vp8_filter_block1d16_v4_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -627,7 +627,7 @@
     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d8_v4_ssse3
+    je          .vp8_filter_block1d8_v4_ssse3
 
     movdqa      xmm5, XMMWORD PTR [rax]         ;k0_k5
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
@@ -638,7 +638,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d8_v6_ssse3_loop:
+.vp8_filter_block1d8_v6_ssse3_loop:
     movq        xmm1, MMWORD PTR [rsi]                  ;A
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
@@ -673,7 +673,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d8_v6_ssse3_loop
+    jnz         .vp8_filter_block1d8_v6_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -684,7 +684,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d8_v4_ssse3:
+.vp8_filter_block1d8_v4_ssse3:
     movdqa      xmm6, XMMWORD PTR [rax+256]     ;k2_k4
     movdqa      xmm7, XMMWORD PTR [rax+128]     ;k1_k3
     movdqa      xmm5, [GLOBAL(rd)]
@@ -694,7 +694,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d8_v4_ssse3_loop:
+.vp8_filter_block1d8_v4_ssse3_loop:
     movq        xmm2, MMWORD PTR [rsi + rdx]            ;B
     movq        xmm3, MMWORD PTR [rsi + rdx * 2]        ;C
     movq        xmm4, MMWORD PTR [rax + rdx * 2]        ;D
@@ -722,7 +722,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d8_v4_ssse3_loop
+    jnz         .vp8_filter_block1d8_v4_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -766,7 +766,7 @@
     movsxd      rcx, DWORD PTR arg(4)   ;[output_height]
 
     cmp         esi, DWORD PTR [rax]
-    je          vp8_filter_block1d4_v4_ssse3
+    je          .vp8_filter_block1d4_v4_ssse3
 
     movq        mm5, MMWORD PTR [rax]         ;k0_k5
     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
@@ -777,7 +777,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d4_v6_ssse3_loop:
+.vp8_filter_block1d4_v6_ssse3_loop:
     movd        mm1, DWORD PTR [rsi]                  ;A
     movd        mm2, DWORD PTR [rsi + rdx]            ;B
     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
@@ -813,7 +813,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d4_v6_ssse3_loop
+    jnz         .vp8_filter_block1d4_v6_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -823,7 +823,7 @@
     pop         rbp
     ret
 
-vp8_filter_block1d4_v4_ssse3:
+.vp8_filter_block1d4_v4_ssse3:
     movq        mm6, MMWORD PTR [rax+256]     ;k2_k4
     movq        mm7, MMWORD PTR [rax+128]     ;k1_k3
     movq        mm5, MMWORD PTR [GLOBAL(rd)]
@@ -833,7 +833,7 @@
     mov         rax, rsi
     add         rax, rdx
 
-vp8_filter_block1d4_v4_ssse3_loop:
+.vp8_filter_block1d4_v4_ssse3_loop:
     movd        mm2, DWORD PTR [rsi + rdx]            ;B
     movd        mm3, DWORD PTR [rsi + rdx * 2]        ;C
     movd        mm4, DWORD PTR [rax + rdx * 2]        ;D
@@ -861,7 +861,7 @@
     add         rdi,        r8
 %endif
     dec         rcx
-    jnz         vp8_filter_block1d4_v4_ssse3_loop
+    jnz         .vp8_filter_block1d4_v4_ssse3_loop
 
     ; begin epilog
     pop rdi
@@ -895,7 +895,7 @@
         movsxd      rax,        dword ptr arg(2)    ; xoffset
 
         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          b16x16_sp_only
+        je          .b16x16_sp_only
 
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; HFilter
@@ -909,7 +909,7 @@
         movsxd      rax,        dword ptr arg(3)    ; yoffset
 
         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          b16x16_fp_only
+        je          .b16x16_fp_only
 
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -996,9 +996,9 @@
         cmp         rdi,        rcx
         jne         .next_row
 
-        jmp         done
+        jmp         .done
 
-b16x16_sp_only:
+.b16x16_sp_only:
         movsxd      rax,        dword ptr arg(3)    ; yoffset
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -1018,7 +1018,7 @@
         movq        xmm2,       [rsi + 8]           ; load row 0
 
         lea         rsi,        [rsi + rax]         ; next line
-.next_row:
+.next_row_sp:
         movq        xmm3,       [rsi]               ; load row + 1
         movq        xmm5,       [rsi + 8]           ; load row + 1
 
@@ -1062,16 +1062,16 @@
         lea         rdi,        [rdi + 2*rdx]
 
         cmp         rdi,        rcx
-        jne         .next_row
+        jne         .next_row_sp
 
-        jmp         done
+        jmp         .done
 
-b16x16_fp_only:
+.b16x16_fp_only:
         lea         rcx,        [rdi+rdx*8]
         lea         rcx,        [rcx+rdx*8]
         movsxd      rax,        dword ptr arg(1)    ; src_pixels_per_line
 
-.next_row:
+.next_row_fp:
         movq        xmm2,       [rsi]               ; 00 01 02 03 04 05 06 07
         movq        xmm4,       [rsi+1]             ; 01 02 03 04 05 06 07 08
 
@@ -1122,9 +1122,9 @@
 
         cmp         rdi,        rcx
 
-        jne         .next_row
+        jne         .next_row_fp
 
-done:
+.done:
     ; begin epilog
     pop         rdi
     pop         rsi
@@ -1191,7 +1191,7 @@
 
         movsxd      rax,        dword ptr arg(2)    ; xoffset
         cmp         rax,        0                   ; skip first_pass filter if xoffset=0
-        je          b8x8_sp_only
+        je          .b8x8_sp_only
 
         shl         rax,        4
         add         rax,        rcx                 ; HFilter
@@ -1203,7 +1203,7 @@
 
         movsxd      rax,        dword ptr arg(3)    ; yoffset
         cmp         rax,        0                   ; skip second_pass filter if yoffset=0
-        je          b8x8_fp_only
+        je          .b8x8_fp_only
 
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -1260,9 +1260,9 @@
         cmp         rdi,        rcx
         jne         .next_row
 
-        jmp         done8x8
+        jmp         .done8x8
 
-b8x8_sp_only:
+.b8x8_sp_only:
         movsxd      rax,        dword ptr arg(3)    ; yoffset
         shl         rax,        4
         lea         rax,        [rax + rcx]         ; VFilter
@@ -1364,12 +1364,12 @@
         movq        [rdi+rdx],  xmm1
         lea         rsp,        [rsp + 144]
 
-        jmp         done8x8
+        jmp         .done8x8
 
-b8x8_fp_only:
+.b8x8_fp_only:
         lea         rcx,        [rdi+rdx*8]
 
-.next_row:
+.next_row_fp:
         movdqa      xmm1,       XMMWORD PTR [rsp]
         movdqa      xmm3,       XMMWORD PTR [rsp+16]
 
@@ -1430,11 +1430,11 @@
         lea         rdi,        [rdi + 2*rdx]
         cmp         rdi,        rcx
 
-        jne         .next_row
+        jne         .next_row_fp
 
         lea         rsp,        [rsp + 16]
 
-done8x8:
+.done8x8:
     ;add rsp, 144
     pop         rsp
     ; begin epilog
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index ddb0970..1c11b0b 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -264,8 +264,10 @@
         for (i = 0; i < 16; i++)
         {
             BLOCKD *b = &xd->block[i];
+            int b_mode = xd->mode_info_context->bmi[i].as_mode;
+
             RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict)
-                          (b, b->bmi.as_mode, b->predictor);
+                          (b, b_mode, b->predictor);
 
             if (xd->eobs[i] > 1)
             {
@@ -410,8 +412,6 @@
         }
 #endif
 
-        update_blockd_bmi(xd);
-
         xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
         xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
         xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
@@ -436,14 +436,6 @@
             xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
         }
 
-        vp8_build_uvmvs(xd, pc->full_pixel);
-
-        /*
-        if(pc->current_video_frame==0 &&mb_col==1 && mb_row==0)
-        pbi->debugoutput =1;
-        else
-        pbi->debugoutput =0;
-        */
         decode_macroblock(pbi, xd, mb_row * pc->mb_cols  + mb_col);
 
         /* check if the boolean decoder has suffered an error */
@@ -685,6 +677,11 @@
     xd->mode_info_context->mbmi.mode = DC_PRED;
     xd->mode_info_stride = pc->mode_info_stride;
     xd->corrupted = 0; /* init without corruption */
+
+    xd->fullpixel_mask = 0xffffffff;
+    if(pc->full_pixel)
+        xd->fullpixel_mask = 0xfffffff8;
+
 }
 
 int vp8_decode_frame(VP8D_COMP *pbi)
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index fdde04a..a8bd087 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -28,7 +28,6 @@
 
 extern void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd);
 extern void clamp_mvs(MACROBLOCKD *xd);
-extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define RTCD_VTABLE(x) (&(pbi)->common.rtcd.x)
@@ -83,6 +82,11 @@
         {
             mbd->block[j].dequant = xd->block[j].dequant;
         }
+
+        mbd->fullpixel_mask = 0xffffffff;
+        if(pc->full_pixel)
+            mbd->fullpixel_mask = 0xfffffff8;
+
     }
 
     for (i=0; i< pc->mb_rows; i++)
@@ -212,8 +216,9 @@
         for (i = 0; i < 16; i++)
         {
             BLOCKD *b = &xd->block[i];
+            int b_mode = xd->mode_info_context->bmi[i].as_mode;
 
-            vp8mt_predict_intra4x4(pbi, xd, b->bmi.as_mode, b->predictor, mb_row, mb_col, i);
+            vp8mt_predict_intra4x4(pbi, xd, b_mode, b->predictor, mb_row, mb_col, i);
 
             if (xd->eobs[i] > 1)
             {
@@ -313,8 +318,6 @@
                             }
                         }
 
-                        update_blockd_bmi(xd);
-
                         /* Distance of MB to the various image edges.
                          * These are specified to 8th pel as they are always
                          * compared to values that are in 1/8th pel units.
@@ -378,7 +381,6 @@
                             xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
                         }
 
-                        vp8_build_uvmvs(xd, pc->full_pixel);
                         decode_macroblock(pbi, xd, mb_row, mb_col);
 
                         /* check if the boolean decoder has suffered an error */
@@ -819,8 +821,6 @@
                     }
                 }
 
-                update_blockd_bmi(xd);
-
                 /* Distance of MB to the various image edges.
                  * These are specified to 8th pel as they are always compared to
                  * values that are in 1/8th pel units.
@@ -879,7 +879,6 @@
                     xd->corrupted |= pc->yv12_fb[ref_fb_idx].corrupted;
                 }
 
-                vp8_build_uvmvs(xd, pc->full_pixel);
                 decode_macroblock(pbi, xd, mb_row, mb_col);
 
                 /* check if the boolean decoder has suffered an error */
diff --git a/vp8/decoder/x86/idct_blk_sse2.c b/vp8/decoder/x86/idct_blk_sse2.c
index 4c88db4..3a48068 100644
--- a/vp8/decoder/x86/idct_blk_sse2.c
+++ b/vp8/decoder/x86/idct_blk_sse2.c
@@ -12,17 +12,17 @@
 #include "vp8/common/idct.h"
 #include "vp8/decoder/dequantize.h"
 
-void idct_dequant_dc_0_2x_sse2
+void vp8_idct_dequant_dc_0_2x_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int dst_stride, short *dc);
-void idct_dequant_dc_full_2x_sse2
+void vp8_idct_dequant_dc_full_2x_sse2
             (short *q, short *dq, unsigned char *pre,
              unsigned char *dst, int dst_stride, short *dc);
 
-void idct_dequant_0_2x_sse2
+void vp8_idct_dequant_0_2x_sse2
             (short *q, short *dq ,unsigned char *pre,
              unsigned char *dst, int dst_stride, int blk_stride);
-void idct_dequant_full_2x_sse2
+void vp8_idct_dequant_full_2x_sse2
             (short *q, short *dq ,unsigned char *pre,
              unsigned char *dst, int dst_stride, int blk_stride);
 
@@ -35,14 +35,14 @@
     for (i = 0; i < 4; i++)
     {
         if (((short *)(eobs))[0] & 0xfefe)
-            idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
+            vp8_idct_dequant_dc_full_2x_sse2 (q, dq, pre, dst, stride, dc);
         else
-            idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
+            vp8_idct_dequant_dc_0_2x_sse2 (q, dq, pre, dst, stride, dc);
 
         if (((short *)(eobs))[1] & 0xfefe)
-            idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
+            vp8_idct_dequant_dc_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
         else
-            idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
+            vp8_idct_dequant_dc_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, dc+2);
 
         q    += 64;
         dc   += 4;
@@ -61,14 +61,14 @@
     for (i = 0; i < 4; i++)
     {
         if (((short *)(eobs))[0] & 0xfefe)
-            idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
+            vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dst, stride, 16);
         else
-            idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
+            vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dst, stride, 16);
 
         if (((short *)(eobs))[1] & 0xfefe)
-            idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
+            vp8_idct_dequant_full_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
         else
-            idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
+            vp8_idct_dequant_0_2x_sse2 (q+32, dq, pre+8, dst+8, stride, 16);
 
         q    += 64;
         pre  += 64;
@@ -82,33 +82,33 @@
              unsigned char *dstu, unsigned char *dstv, int stride, char *eobs)
 {
     if (((short *)(eobs))[0] & 0xfefe)
-        idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
+        vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
     else
-        idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
+        vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
 
     q    += 32;
     pre  += 32;
     dstu += stride*4;
 
     if (((short *)(eobs))[1] & 0xfefe)
-        idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
+        vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstu, stride, 8);
     else
-        idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
+        vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstu, stride, 8);
 
     q    += 32;
     pre  += 32;
 
     if (((short *)(eobs))[2] & 0xfefe)
-        idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
+        vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
     else
-        idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+        vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
 
     q    += 32;
     pre  += 32;
     dstv += stride*4;
 
     if (((short *)(eobs))[3] & 0xfefe)
-        idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
+        vp8_idct_dequant_full_2x_sse2 (q, dq, pre, dstv, stride, 8);
     else
-        idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
+        vp8_idct_dequant_0_2x_sse2 (q, dq, pre, dstv, stride, 8);
 }
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index b3c2439..cea8e12 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -22,7 +22,8 @@
 #include "vpx/vpx_encoder.h"
 #include "vpx_mem/vpx_mem.h"
 #include "bitstream.h"
-#include "vp8/common/defaultcoefcounts.h"
+
+#include "defaultcoefcounts.h"
 
 const int vp8cx_base_skip_false_prob[128] =
 {
@@ -1199,7 +1200,7 @@
             if (cpi->common.frame_type == KEY_FRAME)
             {
                 /* Reset to default probabilities at key frames */
-                sum_probs_over_prev_coef_context(vp8_default_coef_counts[i][j],
+                sum_probs_over_prev_coef_context(default_coef_counts[i][j],
                                                  prev_coef_count_sum);
             }
             else
diff --git a/vp8/common/defaultcoefcounts.c b/vp8/encoder/defaultcoefcounts.h
similarity index 96%
rename from vp8/common/defaultcoefcounts.c
rename to vp8/encoder/defaultcoefcounts.h
index b0e2e70..2c0f3dd 100644
--- a/vp8/common/defaultcoefcounts.c
+++ b/vp8/encoder/defaultcoefcounts.h
@@ -8,14 +8,12 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "defaultcoefcounts.h"
-
 /* Generated file, included by entropy.c */
 
-const unsigned int vp8_default_coef_counts[BLOCK_TYPES]
-                                          [COEF_BANDS]
-                                          [PREV_COEF_CONTEXTS]
-                                          [MAX_ENTROPY_TOKENS] =
+static const unsigned int default_coef_counts[BLOCK_TYPES]
+                                             [COEF_BANDS]
+                                             [PREV_COEF_CONTEXTS]
+                                             [MAX_ENTROPY_TOKENS] =
 {
 
     {
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 5d4e7e7..19b52a3 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -368,7 +368,6 @@
                    int *segment_counts,
                    int *totalrate)
 {
-    int i;
     int recon_yoffset, recon_uvoffset;
     int mb_col;
     int ref_fb_idx = cm->lst_fb_idx;
@@ -534,10 +533,6 @@
         // Increment the activity mask pointers.
         x->mb_activity_ptr++;
 
-        /* save the block info */
-        for (i = 0; i < 16; i++)
-            xd->mode_info_context->bmi[i] = xd->block[i].bmi;
-
         // adjust to the next column of macroblocks
         x->src.y_buffer += 16;
         x->src.u_buffer += 8;
@@ -665,6 +660,9 @@
                                         + vp8_cost_one(cpi->prob_gf_coded);
     }
 
+    xd->fullpixel_mask = 0xffffffff;
+    if(cm->full_pixel)
+        xd->fullpixel_mask = 0xfffffff8;
 }
 
 void vp8_encode_frame(VP8_COMP *cpi)
@@ -1257,7 +1255,11 @@
                     cpi->zbin_mode_boost = MV_ZBIN_BOOST;
             }
         }
-        vp8_update_zbin_extra(cpi, x);
+
+        /* The fast quantizer doesn't use zbin_extra, so only update it
+         * when the regular quantizer is in use. */
+        if (cpi->sf.improved_quant)
+            vp8_update_zbin_extra(cpi, x);
     }
 
     cpi->count_mb_ref_frame_usage[xd->mode_info_context->mbmi.ref_frame] ++;
@@ -1281,8 +1283,6 @@
     {
         int ref_fb_idx;
 
-        vp8_build_uvmvs(xd, cpi->common.full_pixel);
-
         if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
             ref_fb_idx = cpi->common.lst_fb_idx;
         else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 3ed16b6..50985be 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -615,15 +615,3 @@
     RECON_INVOKE(&rtcd->common->recon, recon_mby)
         (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }
-
-
-void vp8_encode_inter16x16uvrd(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
-{
-    vp8_build_inter_predictors_mbuv(&x->e_mbd);
-    ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
-
-    vp8_transform_mbuv(x);
-
-    vp8_quantize_mbuv(x);
-
-}
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 47fc72d..f2cf00b 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -99,7 +99,7 @@
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
 void vp8_transform_intra_mby(MACROBLOCK *x);
-void vp8_encode_inter16x16uvrd(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+
 void vp8_optimize_mby(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_optimize_mbuv(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
 void vp8_encode_inter16x16y(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x);
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 9cdc1e5..8559142 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -1473,7 +1473,6 @@
 
     int i;
     double boost_score = 0.0;
-    double fwd_boost_score = 0.0;
     double mv_ratio_accumulator = 0.0;
     double decay_accumulator = 1.0;
     double this_frame_mv_in_out = 0.0;
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 9906105..a14843a 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -94,16 +94,15 @@
 #if !(CONFIG_REALTIME_ONLY)
     cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
 #endif
+#if CONFIG_INTERNAL_STATS
+    cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_c;
+    cpi->rtcd.variance.ssimpf_16x16          = vp8_ssim_parms_16x16_c;
+#endif
 #endif
 
     // Pure C:
     vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame;
 
-#if CONFIG_INTERNAL_STATS
-    cpi->rtcd.variance.ssimpf_8x8            = ssim_parms_8x8_c;
-    cpi->rtcd.variance.ssimpf                = ssim_parms_c;
-#endif
-
 #if ARCH_X86 || ARCH_X86_64
     vp8_arch_x86_encoder_init(cpi);
 #endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index ff9a641..35e187e 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -3237,16 +3237,17 @@
     // Test code for segmentation of gf/arf (0,0)
     //segmentation_test_function((VP8_PTR) cpi);
 
-#if CONFIG_REALTIME_ONLY
-    if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
+    if (cpi->compressor_speed == 2)
     {
-        if(cpi->force_next_frame_intra)
+        if(cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
         {
-            cm->frame_type = KEY_FRAME;  /* delayed intra frame */
+            if(cpi->force_next_frame_intra)
+            {
+                cm->frame_type = KEY_FRAME;  /* delayed intra frame */
+            }
         }
+        cpi->force_next_frame_intra = 0;
     }
-    cpi->force_next_frame_intra = 0;
-#endif
 
     // For an alt ref frame in 2 pass we skip the call to the second pass function that sets the target bandwidth
 #if !(CONFIG_REALTIME_ONLY)
@@ -3775,15 +3776,15 @@
         // (assuming that we didn't)!
         if (cpi->pass != 2 && cpi->oxcf.auto_key && cm->frame_type != KEY_FRAME)
         {
+            int key_frame_decision = decide_key_frame(cpi);
 
-#if CONFIG_REALTIME_ONLY
+            if (cpi->compressor_speed == 2)
             {
                 /* we don't do re-encoding in realtime mode
                  * if key frame is decided than we force it on next frame */
-                cpi->force_next_frame_intra = decide_key_frame(cpi);
+                cpi->force_next_frame_intra = key_frame_decision;
             }
-#else
-            if (decide_key_frame(cpi))
+            else if (key_frame_decision)
             {
                 // Reset all our sizing numbers and recode
                 cm->frame_type = KEY_FRAME;
@@ -3820,7 +3821,6 @@
 
                 continue;
             }
-#endif
         }
 
         vp8_clear_system_state();
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index f75f6cb..aead2fb 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -609,9 +609,8 @@
     int *lf_ref_frame_sign_bias;
     int *lf_ref_frame;
 
-#if CONFIG_REALTIME_ONLY
     int force_next_frame_intra; /* force next frame to intra when kf_auto says so */
-#endif
+
     int droppable;
 } VP8_COMP;
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 992df71..124cfe5 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -440,14 +440,23 @@
 
     unsigned int sse1 = 0;
     unsigned int sse2 = 0;
-    int mv_row;
-    int mv_col;
+    int mv_row = x->e_mbd.mode_info_context->mbmi.mv.as_mv.row;
+    int mv_col = x->e_mbd.mode_info_context->mbmi.mv.as_mv.col;
     int offset;
     int pre_stride = x->e_mbd.block[16].pre_stride;
 
-    vp8_build_uvmvs(&x->e_mbd, 0);
-    mv_row = x->e_mbd.block[16].bmi.mv.as_mv.row;
-    mv_col = x->e_mbd.block[16].bmi.mv.as_mv.col;
+    if (mv_row < 0)
+        mv_row -= 1;
+    else
+        mv_row += 1;
+
+    if (mv_col < 0)
+        mv_col -= 1;
+    else
+        mv_col += 1;
+
+    mv_row /= 2;
+    mv_col /= 2;
 
     offset = (mv_row >> 3) * pre_stride + (mv_col >> 3);
     uptr = x->e_mbd.pre.u_buffer + offset;
@@ -786,11 +795,31 @@
 }
 
 
-static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int fullpixel)
+static int rd_inter16x16_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+                            int *distortion, int fullpixel)
 {
-    vp8_build_uvmvs(&x->e_mbd, fullpixel);
-    vp8_encode_inter16x16uvrd(IF_RTCD(&cpi->rtcd), x);
+    vp8_build_inter16x16_predictors_mbuv(&x->e_mbd);
+    ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
+        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
 
+    vp8_transform_mbuv(x);
+    vp8_quantize_mbuv(x);
+
+    *rate       = rd_cost_mbuv(x);
+    *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
+
+    return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
+}
+
+static int rd_inter4x4_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate,
+                          int *distortion, int fullpixel)
+{
+    vp8_build_inter4x4_predictors_mbuv(&x->e_mbd);
+    ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff,
+        x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
+
+    vp8_transform_mbuv(x);
+    vp8_quantize_mbuv(x);
 
     *rate       = rd_cost_mbuv(x);
     *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
@@ -1956,7 +1985,7 @@
             if (tmp_rd < best_yrd)
             {
                 // Now work out UV cost and add it in
-                vp8_rd_inter_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
+                rd_inter4x4_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
                 rate2 += rate_uv;
                 distortion2 += distortion_uv;
             }
@@ -2207,7 +2236,7 @@
             distortion2 += distortion;
 
             // UV cost and distortion
-            vp8_rd_inter_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
+            rd_inter16x16_uv(cpi, x, &rate_uv, &distortion_uv, cpi->common.full_pixel);
             rate2 += rate_uv;
             distortion2 += distortion_uv;
             break;
@@ -2385,13 +2414,13 @@
     if (best_mbmode.mode == B_PRED)
     {
         for (i = 0; i < 16; i++)
-          x->e_mbd.block[i].bmi.as_mode = best_bmodes[i].as_mode;
+            xd->mode_info_context->bmi[i].as_mode = best_bmodes[i].as_mode;
     }
 
     if (best_mbmode.mode == SPLITMV)
     {
         for (i = 0; i < 16; i++)
-            x->e_mbd.block[i].bmi.mv.as_int = best_bmodes[i].mv.as_int;
+            xd->mode_info_context->bmi[i].mv.as_int = best_bmodes[i].mv.as_int;
 
         vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO));
 
@@ -2401,6 +2430,8 @@
 
     rd_update_mvcount(cpi, x, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
 
+
+
 }
 
 void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c
index fea756f..d0f8e49 100644
--- a/vp8/encoder/ssim.c
+++ b/vp8/encoder/ssim.c
@@ -9,18 +9,9 @@
  */
 
 
-#include "vpx_scale/yv12config.h"
-#include "math.h"
 #include "onyx_int.h"
 
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x)  (x)
-#else
-#define IF_RTCD(x)  NULL
-#endif
-
-
-void ssim_parms_c
+void vp8_ssim_parms_16x16_c
 (
     unsigned char *s,
     int sp,
@@ -46,7 +37,7 @@
          }
      }
 }
-void ssim_parms_8x8_c
+void vp8_ssim_parms_8x8_c
 (
     unsigned char *s,
     int sp,
@@ -107,14 +98,14 @@
             const vp8_variance_rtcd_vtable_t *rtcd)
 {
     unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
     return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 256);
 }
 static double ssim_8x8(unsigned char *s,int sp, unsigned char *r,int rp,
                 const vp8_variance_rtcd_vtable_t *rtcd)
 {
     unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0;
-    rtcd->ssimpf_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    SSIMPF_INVOKE(rtcd,8x8)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
     return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
 }
 
@@ -134,7 +125,7 @@
     c1 = cc1*16;
     c2 = cc2*16;
 
-    rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
+    SSIMPF_INVOKE(rtcd,16x16)(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr);
     ssim_n1 = (2*sum_s*sum_r+ c1);
 
     ssim_n2 =((int64_t) 2*256*sum_sxr-(int64_t) 2*sum_s*sum_r+c2);
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 5fd6d3a..d9bf669 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -320,16 +320,16 @@
 #endif
 extern prototype_get16x16prederror(vp8_variance_get4x4sse_cs);
 
-#ifndef vp8_ssimpf
-#define vp8_ssimpf ssim_parms_c
-#endif
-extern prototype_ssimpf(vp8_ssimpf)
-
 #ifndef vp8_ssimpf_8x8
-#define vp8_ssimpf_8x8 ssim_parms_8x8_c
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_c
 #endif
 extern prototype_ssimpf(vp8_ssimpf_8x8)
 
+#ifndef vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_c
+#endif
+extern prototype_ssimpf(vp8_ssimpf_16x16)
+
 typedef prototype_sad(*vp8_sad_fn_t);
 typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t);
 typedef prototype_sad_multi_same_address_1(*vp8_sad_multi1_fn_t);
@@ -394,7 +394,7 @@
 
 #if CONFIG_INTERNAL_STATS
     vp8_ssimpf_fn_t          ssimpf_8x8;
-    vp8_ssimpf_fn_t          ssimpf;
+    vp8_ssimpf_fn_t          ssimpf_16x16;
 #endif
 
 } vp8_variance_rtcd_vtable_t;
@@ -417,8 +417,10 @@
 
 #if CONFIG_RUNTIME_CPU_DETECT
 #define VARIANCE_INVOKE(ctx,fn) (ctx)->fn
+#define SSIMPF_INVOKE(ctx,fn) (ctx)->ssimpf_##fn
 #else
 #define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
+#define SSIMPF_INVOKE(ctx,fn) vp8_ssimpf_##fn
 #endif
 
 #endif
diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm
index 9946294..7ec7d60 100644
--- a/vp8/encoder/x86/encodeopt.asm
+++ b/vp8/encoder/x86/encodeopt.asm
@@ -148,7 +148,7 @@
         pcmpeqw     mm1,        mm7
         mov         rcx,        16
 
-mberror_loop_mmx:
+.mberror_loop_mmx:
         movq        mm3,       [rsi]
         movq        mm4,       [rdi]
 
@@ -186,7 +186,7 @@
         add         rdi,        32
         sub         rcx,        1
 
-        jnz         mberror_loop_mmx
+        jnz         .mberror_loop_mmx
 
         movq        mm0,        mm2
         psrlq       mm2,        32
@@ -226,7 +226,7 @@
         pcmpeqw     xmm5,       xmm6
         mov         rcx,        16
 
-mberror_loop:
+.mberror_loop:
         movdqa      xmm0,       [rsi]
         movdqa      xmm1,       [rdi]
 
@@ -249,7 +249,7 @@
         paddd       xmm4,       xmm2
 
         paddd       xmm4,       xmm0
-        jnz         mberror_loop
+        jnz         .mberror_loop
 
         movdqa      xmm0,       xmm4
         punpckldq   xmm0,       xmm6
@@ -289,7 +289,7 @@
         mov             rcx,        16
         pxor            mm7,        mm7
 
-mbuverror_loop_mmx:
+.mbuverror_loop_mmx:
 
         movq            mm1,        [rsi]
         movq            mm2,        [rdi]
@@ -313,7 +313,7 @@
         add             rdi,        16
 
         dec             rcx
-        jnz             mbuverror_loop_mmx
+        jnz             .mbuverror_loop_mmx
 
         movq            mm0,        mm7
         psrlq           mm7,        32
@@ -346,7 +346,7 @@
         mov             rcx,        16
         pxor            xmm3,       xmm3
 
-mbuverror_loop:
+.mbuverror_loop:
 
         movdqa          xmm1,       [rsi]
         movdqa          xmm2,       [rdi]
@@ -360,7 +360,7 @@
         add             rdi,        16
 
         dec             rcx
-        jnz             mbuverror_loop
+        jnz             .mbuverror_loop
 
         pxor        xmm0,           xmm0
         movdqa      xmm1,           xmm3
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 056b64c..c483933 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -137,17 +137,17 @@
     ; if (x >= zbin)
     sub         cx, WORD PTR[rdx]           ; x - zbin
     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          rq_zigzag_loop_%1           ; x < zbin
+    jl          .rq_zigzag_loop_%1           ; x < zbin
 
     movsx       edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
 
     ; downshift by quant_shift[rc]
     movsx       cx, BYTE PTR[rax + %1]      ; quant_shift_ptr[rc]
     sar         edi, cl                     ; also sets Z bit
-    je          rq_zigzag_loop_%1           ; !y
+    je          .rq_zigzag_loop_%1           ; !y
     mov         WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
     mov         rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
 %endmacro
 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
 ZIGZAG_LOOP  0
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
index 258899e..95e1c20 100644
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ b/vp8/encoder/x86/quantize_sse4.asm
@@ -140,21 +140,21 @@
     ; if (x >= zbin)
     sub         cx, WORD PTR[rdx]           ; x - zbin
     lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          rq_zigzag_loop_%1           ; x < zbin
+    jl          .rq_zigzag_loop_%1          ; x < zbin
 
     pextrw      edi, %3, %2                 ; y
 
     ; downshift by quant_shift[rc]
     pextrb      ecx, xmm5, %1               ; quant_shift[rc]
     sar         edi, cl                     ; also sets Z bit
-    je          rq_zigzag_loop_%1           ; !y
+    je          .rq_zigzag_loop_%1          ; !y
 %if ABI_IS_32BIT
     mov         WORD PTR[rsp + qcoeff + %1 *2], di
 %else
     pinsrw      %5, edi, %2                 ; qcoeff[rc]
 %endif
     mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-rq_zigzag_loop_%1:
+.rq_zigzag_loop_%1:
 %endmacro
 ; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
 ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
diff --git a/vp8/encoder/x86/sad_mmx.asm b/vp8/encoder/x86/sad_mmx.asm
index 85cb023..407b399 100644
--- a/vp8/encoder/x86/sad_mmx.asm
+++ b/vp8/encoder/x86/sad_mmx.asm
@@ -43,7 +43,7 @@
 
         pxor            mm6,        mm6
 
-x16x16sad_mmx_loop:
+.x16x16sad_mmx_loop:
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm2,        QWORD PTR [rsi+8]
@@ -83,7 +83,7 @@
         paddw           mm7,        mm1
 
         cmp             rsi,        rcx
-        jne             x16x16sad_mmx_loop
+        jne             .x16x16sad_mmx_loop
 
 
         movq            mm0,        mm7
@@ -135,7 +135,7 @@
 
         pxor            mm6,        mm6
 
-x8x16sad_mmx_loop:
+.x8x16sad_mmx_loop:
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -158,7 +158,7 @@
         paddw           mm7,        mm2
         cmp             rsi,        rcx
 
-        jne             x8x16sad_mmx_loop
+        jne             .x8x16sad_mmx_loop
 
         movq            mm0,        mm7
         punpcklwd       mm0,        mm6
@@ -205,7 +205,7 @@
 
         pxor            mm6,        mm6
 
-x8x8sad_mmx_loop:
+.x8x8sad_mmx_loop:
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -228,7 +228,7 @@
         paddw           mm7,       mm0
         cmp             rsi,        rcx
 
-        jne             x8x8sad_mmx_loop
+        jne             .x8x8sad_mmx_loop
 
         movq            mm0,        mm7
         punpcklwd       mm0,        mm6
@@ -364,7 +364,7 @@
 
         pxor            mm6,        mm6
 
-x16x8sad_mmx_loop:
+.x16x8sad_mmx_loop:
 
         movq            mm0,       [rsi]
         movq            mm1,       [rdi]
@@ -404,7 +404,7 @@
         paddw           mm7,        mm0
 
         cmp             rsi,        rcx
-        jne             x16x8sad_mmx_loop
+        jne             .x16x8sad_mmx_loop
 
         movq            mm0,        mm7
         punpcklwd       mm0,        mm6
diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm
index 1011c95..fa8e3e3 100644
--- a/vp8/encoder/x86/sad_sse2.asm
+++ b/vp8/encoder/x86/sad_sse2.asm
@@ -37,7 +37,7 @@
         lea             rcx,        [rcx+rax*8]
         pxor            xmm6,       xmm6
 
-x16x16sad_wmt_loop:
+.x16x16sad_wmt_loop:
 
         movq            xmm0,       QWORD PTR [rsi]
         movq            xmm2,       QWORD PTR [rsi+8]
@@ -68,7 +68,7 @@
         paddw           xmm6,       xmm4
 
         cmp             rsi,        rcx
-        jne             x16x16sad_wmt_loop
+        jne             .x16x16sad_wmt_loop
 
         movq            xmm0,       xmm6
         psrldq          xmm6,       8
@@ -111,11 +111,11 @@
         lea             rcx,        [rcx+rbx*8]
         pxor            mm7,        mm7
 
-x8x16sad_wmt_loop:
+.x8x16sad_wmt_loop:
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              x8x16sad_wmt_early_exit
+        jg              .x8x16sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -133,11 +133,11 @@
         paddw           mm7,        mm2
 
         cmp             rsi,        rcx
-        jne             x8x16sad_wmt_loop
+        jne             .x8x16sad_wmt_loop
 
         movq            rax,        mm7
 
-x8x16sad_wmt_early_exit:
+.x8x16sad_wmt_early_exit:
 
     ; begin epilog
     pop         rdi
@@ -172,11 +172,11 @@
         lea             rcx,        [rsi+rbx*8]
         pxor            mm7,        mm7
 
-x8x8sad_wmt_loop:
+.x8x8sad_wmt_loop:
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              x8x8sad_wmt_early_exit
+        jg              .x8x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm1,        QWORD PTR [rdi]
@@ -188,10 +188,10 @@
         paddw           mm7,        mm0
 
         cmp             rsi,        rcx
-        jne             x8x8sad_wmt_loop
+        jne             .x8x8sad_wmt_loop
 
         movq            rax,        mm7
-x8x8sad_wmt_early_exit:
+.x8x8sad_wmt_early_exit:
 
     ; begin epilog
     pop         rdi
@@ -281,11 +281,11 @@
         lea             rcx,        [rsi+rbx*8]
         pxor            mm7,        mm7
 
-x16x8sad_wmt_loop:
+.x16x8sad_wmt_loop:
 
         movq            rax,        mm7
         cmp             eax,        arg(4)
-        jg              x16x8sad_wmt_early_exit
+        jg              .x16x8sad_wmt_early_exit
 
         movq            mm0,        QWORD PTR [rsi]
         movq            mm2,        QWORD PTR [rsi+8]
@@ -315,11 +315,11 @@
         paddw           mm7,        mm4
 
         cmp             rsi,        rcx
-        jne             x16x8sad_wmt_loop
+        jne             .x16x8sad_wmt_loop
 
         movq            rax,        mm7
 
-x16x8sad_wmt_early_exit:
+.x16x8sad_wmt_early_exit:
 
     ; begin epilog
     pop         rdi
@@ -352,7 +352,7 @@
         movsxd          rdx,        dword ptr arg(3) ;dst_stride
         movsxd          rcx,        dword ptr arg(4) ;height
 
-block_copy_sse2_loopx4:
+.block_copy_sse2_loopx4:
         movdqu          xmm0,       XMMWORD PTR [rsi]
         movdqu          xmm1,       XMMWORD PTR [rsi + 16]
         movdqu          xmm2,       XMMWORD PTR [rsi + rax]
@@ -383,12 +383,12 @@
 
         sub             rcx,     4
         cmp             rcx,     4
-        jge             block_copy_sse2_loopx4
+        jge             .block_copy_sse2_loopx4
 
         cmp             rcx, 0
-        je              copy_is_done
+        je              .copy_is_done
 
-block_copy_sse2_loop:
+.block_copy_sse2_loop:
         movdqu          xmm0,       XMMWORD PTR [rsi]
         movdqu          xmm1,       XMMWORD PTR [rsi + 16]
         lea             rsi,    [rsi+rax]
@@ -398,9 +398,9 @@
         lea             rdi,    [rdi+rdx]
 
         sub             rcx,     1
-        jne             block_copy_sse2_loop
+        jne             .block_copy_sse2_loop
 
-copy_is_done:
+.copy_is_done:
     ; begin epilog
     pop rdi
     pop rsi
diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm
index 9e05521..a255097 100644
--- a/vp8/encoder/x86/sad_sse3.asm
+++ b/vp8/encoder/x86/sad_sse3.asm
@@ -647,7 +647,7 @@
 
     STACK_FRAME_CREATE_X3
 
-block_copy_sse3_loopx4:
+.block_copy_sse3_loopx4:
         lea             end_ptr,    [src_ptr+src_stride*2]
 
         movdqu          xmm0,       XMMWORD PTR [src_ptr]
@@ -676,13 +676,13 @@
 
         sub             height,     4
         cmp             height,     4
-        jge             block_copy_sse3_loopx4
+        jge             .block_copy_sse3_loopx4
 
         ;Check to see if there is more rows need to be copied.
         cmp             height, 0
-        je              copy_is_done
+        je              .copy_is_done
 
-block_copy_sse3_loop:
+.block_copy_sse3_loop:
         movdqu          xmm0,       XMMWORD PTR [src_ptr]
         movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
         lea             src_ptr,    [src_ptr+src_stride]
@@ -692,9 +692,9 @@
         lea             ref_ptr,    [ref_ptr+ref_stride]
 
         sub             height,     1
-        jne             block_copy_sse3_loop
+        jne             .block_copy_sse3_loop
 
-copy_is_done:
+.copy_is_done:
     STACK_FRAME_DESTROY_X3
 
 ;void vp8_sad16x16x4d_sse3(
diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm
index 6ecf081..95b6c89 100644
--- a/vp8/encoder/x86/sad_ssse3.asm
+++ b/vp8/encoder/x86/sad_ssse3.asm
@@ -169,30 +169,30 @@
         mov             rdx,        0xf
         and             rdx,        rdi
 
-        jmp vp8_sad16x16x3_ssse3_skiptable
-vp8_sad16x16x3_ssse3_jumptable:
-        dd vp8_sad16x16x3_ssse3_aligned_by_0  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_1  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_2  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_3  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_4  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_5  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_6  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_7  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_8  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_9  - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_10 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_11 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_12 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_13 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_14 - vp8_sad16x16x3_ssse3_do_jump
-        dd vp8_sad16x16x3_ssse3_aligned_by_15 - vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_skiptable:
+        jmp .vp8_sad16x16x3_ssse3_skiptable
+.vp8_sad16x16x3_ssse3_jumptable:
+        dd .vp8_sad16x16x3_ssse3_aligned_by_0  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_1  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_2  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_3  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_4  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_5  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_6  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_7  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_8  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_9  - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_10 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_11 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_12 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_13 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_14 - .vp8_sad16x16x3_ssse3_do_jump
+        dd .vp8_sad16x16x3_ssse3_aligned_by_15 - .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_skiptable:
 
-        call vp8_sad16x16x3_ssse3_do_jump
-vp8_sad16x16x3_ssse3_do_jump:
+        call .vp8_sad16x16x3_ssse3_do_jump
+.vp8_sad16x16x3_ssse3_do_jump:
         pop             rcx                         ; get the address of do_jump
-        mov             rax,  vp8_sad16x16x3_ssse3_jumptable - vp8_sad16x16x3_ssse3_do_jump
+        mov             rax,  .vp8_sad16x16x3_ssse3_jumptable - .vp8_sad16x16x3_ssse3_do_jump
         add             rax,  rcx  ; get the absolute address of vp8_sad16x16x3_ssse3_jumptable
 
         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
@@ -203,23 +203,23 @@
 
         jmp             rcx
 
-        PROCESS_16X16X3_OFFSET 0,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, vp8_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 0,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 1,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 2,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 3,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 4,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 5,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 6,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 7,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 8,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 9,  .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 10, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 11, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 12, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 13, .vp8_sad16x16x3_ssse3
+        PROCESS_16X16X3_OFFSET 14, .vp8_sad16x16x3_ssse3
 
-vp8_sad16x16x3_ssse3_aligned_by_15:
+.vp8_sad16x16x3_ssse3_aligned_by_15:
         PROCESS_16X2X3 1
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
@@ -229,7 +229,7 @@
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
 
-vp8_sad16x16x3_ssse3_store_off:
+.vp8_sad16x16x3_ssse3_store_off:
         mov             rdi,        arg(4) ;Results
 
         movq            xmm0,       xmm5
@@ -282,30 +282,30 @@
         mov             rdx,        0xf
         and             rdx,        rdi
 
-        jmp vp8_sad16x8x3_ssse3_skiptable
-vp8_sad16x8x3_ssse3_jumptable:
-        dd vp8_sad16x8x3_ssse3_aligned_by_0  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_1  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_2  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_3  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_4  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_5  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_6  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_7  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_8  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_9  - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_10 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_11 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_12 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_13 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_14 - vp8_sad16x8x3_ssse3_do_jump
-        dd vp8_sad16x8x3_ssse3_aligned_by_15 - vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_skiptable:
+        jmp .vp8_sad16x8x3_ssse3_skiptable
+.vp8_sad16x8x3_ssse3_jumptable:
+        dd .vp8_sad16x8x3_ssse3_aligned_by_0  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_1  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_2  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_3  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_4  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_5  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_6  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_7  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_8  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_9  - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_10 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_11 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_12 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_13 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_14 - .vp8_sad16x8x3_ssse3_do_jump
+        dd .vp8_sad16x8x3_ssse3_aligned_by_15 - .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_skiptable:
 
-        call vp8_sad16x8x3_ssse3_do_jump
-vp8_sad16x8x3_ssse3_do_jump:
+        call .vp8_sad16x8x3_ssse3_do_jump
+.vp8_sad16x8x3_ssse3_do_jump:
         pop             rcx                         ; get the address of do_jump
-        mov             rax,  vp8_sad16x8x3_ssse3_jumptable - vp8_sad16x8x3_ssse3_do_jump
+        mov             rax,  .vp8_sad16x8x3_ssse3_jumptable - .vp8_sad16x8x3_ssse3_do_jump
         add             rax,  rcx  ; get the absolute address of vp8_sad16x8x3_ssse3_jumptable
 
         movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
@@ -316,30 +316,30 @@
 
         jmp             rcx
 
-        PROCESS_16X8X3_OFFSET 0,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, vp8_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 0,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 1,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 2,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 3,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 4,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 5,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 6,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 7,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 8,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 9,  .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 10, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 11, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 12, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 13, .vp8_sad16x8x3_ssse3
+        PROCESS_16X8X3_OFFSET 14, .vp8_sad16x8x3_ssse3
 
-vp8_sad16x8x3_ssse3_aligned_by_15:
+.vp8_sad16x8x3_ssse3_aligned_by_15:
 
         PROCESS_16X2X3 1
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
         PROCESS_16X2X3 0
 
-vp8_sad16x8x3_ssse3_store_off:
+.vp8_sad16x8x3_ssse3_store_off:
         mov             rdi,        arg(4) ;Results
 
         movq            xmm0,       xmm5
diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm
index d5d267a..c6db3d1 100644
--- a/vp8/encoder/x86/ssim_opt.asm
+++ b/vp8/encoder/x86/ssim_opt.asm
@@ -44,7 +44,7 @@
         paddd           %1, xmm1
         SUM_ACROSS_Q    %1
 %endmacro
-;void ssim_parms_sse3(
+;void vp8_ssim_parms_16x16_sse2(
 ;    unsigned char *s,
 ;    int sp,
 ;    unsigned char *r,
@@ -61,8 +61,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_16x16_sse3)
-sym(vp8_ssim_parms_16x16_sse3):
+global sym(vp8_ssim_parms_16x16_sse2)
+sym(vp8_ssim_parms_16x16_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -84,7 +84,7 @@
     pxor            xmm11,xmm11  ;sum_sxr
 
     mov             rdx, 16      ;row counter
-NextRow:
+.NextRow:
 
     ;grab source and reference pixels
     movdqu          xmm5, [rsi]
@@ -107,7 +107,7 @@
     add             rdi, rax   ; next r row
 
     dec             rdx        ; counter
-    jnz NextRow
+    jnz .NextRow
 
     SUM_ACROSS_W    xmm15
     SUM_ACROSS_W    xmm14
@@ -134,7 +134,7 @@
     pop         rbp
     ret
 
-;void ssim_parms_sse3(
+;void vp8_ssim_parms_8x8_sse2(
 ;    unsigned char *s,
 ;    int sp,
 ;    unsigned char *r,
@@ -151,8 +151,8 @@
 ; or pavgb At this point this is just meant to be first pass for calculating
 ; all the parms needed for 16x16 ssim so we can play with dssim as distortion
 ; in mode selection code.
-global sym(vp8_ssim_parms_8x8_sse3)
-sym(vp8_ssim_parms_8x8_sse3):
+global sym(vp8_ssim_parms_8x8_sse2)
+sym(vp8_ssim_parms_8x8_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 9
@@ -174,7 +174,7 @@
     pxor            xmm11,xmm11  ;sum_sxr
 
     mov             rdx, 8      ;row counter
-NextRow2:
+.NextRow:
 
     ;grab source and reference pixels
     movq            xmm3, [rsi]
@@ -188,7 +188,7 @@
     add             rdi, rax   ; next r row
 
     dec             rdx        ; counter
-    jnz NextRow2
+    jnz .NextRow
 
     SUM_ACROSS_W    xmm15
     SUM_ACROSS_W    xmm14
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
index a47e1f0..4ce16ce 100644
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ b/vp8/encoder/x86/subtract_mmx.asm
@@ -93,7 +93,7 @@
             mov         rcx,            16
             pxor        mm0,            mm0
 
-submby_loop:
+.submby_loop:
 
             movq        mm1,            [rsi]
             movq        mm3,            [rax]
@@ -139,7 +139,7 @@
             lea         rsi,            [rsi+rdx]
 
             sub         rcx,            1
-            jnz         submby_loop
+            jnz         .submby_loop
 
     pop rdi
     pop rsi
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
index 95888f6..3bd1ff6 100644
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ b/vp8/encoder/x86/subtract_sse2.asm
@@ -91,7 +91,7 @@
 
             mov         rcx,            8      ; do two lines at one time
 
-submby_loop:
+.submby_loop:
             movdqa      xmm0,           XMMWORD PTR [rsi]   ; src
             movdqa      xmm1,           XMMWORD PTR [rax]   ; pred
 
@@ -133,7 +133,7 @@
             lea         rsi,            [rsi+rdx*2]
 
             sub         rcx,            1
-            jnz         submby_loop
+            jnz         .submby_loop
 
     pop rdi
     pop rsi
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
index b777ef5..b97c694 100644
--- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -71,26 +71,26 @@
 
         lea         rcx,            [rdx + 16*16*1]
         cmp         dword ptr [rsp + block_size], 8
-        jne         temporal_filter_apply_load_16
+        jne         .temporal_filter_apply_load_16
         lea         rcx,            [rdx + 8*8*1]
 
-temporal_filter_apply_load_8:
+.temporal_filter_apply_load_8:
         movq        xmm0,           [rsi]  ; first row
         lea         rsi,            [rsi + rbp] ; += stride
         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
         movq        xmm1,           [rsi]  ; second row
         lea         rsi,            [rsi + rbp] ; += stride
         punpcklbw   xmm1,           xmm7   ; src[ 8-15]
-        jmp         temporal_filter_apply_load_finished
+        jmp         .temporal_filter_apply_load_finished
 
-temporal_filter_apply_load_16:
+.temporal_filter_apply_load_16:
         movdqa      xmm0,           [rsi]  ; src (frame1)
         lea         rsi,            [rsi + rbp] ; += stride
         movdqa      xmm1,           xmm0
         punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
         punpckhbw   xmm1,           xmm7   ; src[ 8-15]
 
-temporal_filter_apply_load_finished:
+.temporal_filter_apply_load_finished:
         movdqa      xmm2,           [rdx]  ; predictor (frame2)
         movdqa      xmm3,           xmm2
         punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
@@ -176,13 +176,13 @@
         lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
 
         cmp         rdx,            rcx
-        je          temporal_filter_apply_epilog
+        je          .temporal_filter_apply_epilog
         pxor        xmm7,           xmm7   ; zero for extraction
         cmp         dword ptr [rsp + block_size], 16
-        je          temporal_filter_apply_load_16
-        jmp         temporal_filter_apply_load_8
+        je          .temporal_filter_apply_load_16
+        jmp         .temporal_filter_apply_load_8
 
-temporal_filter_apply_epilog:
+.temporal_filter_apply_epilog:
     ; begin epilog
     mov         rbp,            [rsp + rbp_backup]
     add         rsp,            stack_size
diff --git a/vp8/encoder/x86/variance_impl_mmx.asm b/vp8/encoder/x86/variance_impl_mmx.asm
index 13b76ea..2be8bbe 100644
--- a/vp8/encoder/x86/variance_impl_mmx.asm
+++ b/vp8/encoder/x86/variance_impl_mmx.asm
@@ -27,7 +27,7 @@
         mov         rcx, 16
         pxor        mm4, mm4
 
-NEXTROW:
+.NEXTROW:
         movq        mm0, [rax]
         movq        mm1, [rax+8]
         movq        mm2, [rax+16]
@@ -44,7 +44,7 @@
 
         add         rax, 32
         dec         rcx
-        ja          NEXTROW
+        ja          .NEXTROW
         movq        QWORD PTR [rsp], mm4
 
         ;return sum[0]+sum[1];
@@ -568,7 +568,7 @@
         add             rsi, r8
 %endif
 
-filter_block2d_bil4x4_var_mmx_loop:
+.filter_block2d_bil4x4_var_mmx_loop:
 
         movd            mm1,            [rsi]               ;
         movd            mm3,            [rsi+1]             ;
@@ -614,7 +614,7 @@
         add             rdi,            r9
 %endif
         sub             rcx,            1                   ;
-        jnz             filter_block2d_bil4x4_var_mmx_loop       ;
+        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
 
 
         pxor            mm3,            mm3                 ;
@@ -726,7 +726,7 @@
         add             rsi,            r8
 %endif
 
-filter_block2d_bil_var_mmx_loop:
+.filter_block2d_bil_var_mmx_loop:
 
         movq            mm1,            [rsi]               ;
         movq            mm3,            [rsi+1]             ;
@@ -807,7 +807,7 @@
         add             rdi,            r9
 %endif
         sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_mmx_loop       ;
+        jnz             .filter_block2d_bil_var_mmx_loop       ;
 
 
         pxor            mm3,            mm3                 ;
diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm
index b7a6b32..7629220 100644
--- a/vp8/encoder/x86/variance_impl_sse2.asm
+++ b/vp8/encoder/x86/variance_impl_sse2.asm
@@ -33,7 +33,7 @@
         mov         rcx, 8
         pxor        xmm4, xmm4
 
-NEXTROW:
+.NEXTROW:
         movdqa      xmm0, [rax]
         movdqa      xmm1, [rax+16]
         movdqa      xmm2, [rax+32]
@@ -50,7 +50,7 @@
 
         add         rax, 0x40
         dec         rcx
-        ja          NEXTROW
+        ja          .NEXTROW
 
         movdqa      xmm3,xmm4
         psrldq      xmm4,8
@@ -126,7 +126,7 @@
         pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
         mov         rcx,            16
 
-var16loop:
+.var16loop:
         movdqu      xmm1,           XMMWORD PTR [rsi]
         movdqu      xmm2,           XMMWORD PTR [rdi]
 
@@ -160,7 +160,7 @@
         add         rdi,            rdx
 
         sub         rcx,            1
-        jnz         var16loop
+        jnz         .var16loop
 
 
         movdqa      xmm1,           xmm6
diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm
index a582f8d..97e8b0e 100644
--- a/vp8/encoder/x86/variance_impl_ssse3.asm
+++ b/vp8/encoder/x86/variance_impl_ssse3.asm
@@ -47,7 +47,7 @@
         movsxd          rax,            dword ptr arg(5)     ; xoffset
 
         cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_ssse3_sp_only
+        je              .filter_block2d_bil_var_ssse3_sp_only
 
         shl             rax,            4                    ; point to filter coeff with xoffset
         lea             rax,            [rax + rcx]          ; HFilter
@@ -55,7 +55,7 @@
         movsxd          rdx,            dword ptr arg(6)     ; yoffset
 
         cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_ssse3_fp_only
+        je              .filter_block2d_bil_var_ssse3_fp_only
 
         shl             rdx,            4
         lea             rdx,            [rdx + rcx]          ; VFilter
@@ -88,7 +88,7 @@
         lea             rsi,            [rsi + r8]
 %endif
 
-filter_block2d_bil_var_ssse3_loop:
+.filter_block2d_bil_var_ssse3_loop:
         movdqu          xmm1,           XMMWORD PTR [rsi]
         movdqu          xmm2,           XMMWORD PTR [rsi+1]
         movdqa          xmm3,           xmm1
@@ -142,15 +142,15 @@
 %endif
 
         sub             rcx,            1
-        jnz             filter_block2d_bil_var_ssse3_loop
+        jnz             .filter_block2d_bil_var_ssse3_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_var_ssse3_sp_only:
+.filter_block2d_bil_var_ssse3_sp_only:
         movsxd          rdx,            dword ptr arg(6)     ; yoffset
 
         cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              filter_block2d_bil_var_ssse3_full_pixel
+        je              .filter_block2d_bil_var_ssse3_full_pixel
 
         shl             rdx,            4
         lea             rdx,            [rdx + rcx]          ; VFilter
@@ -169,7 +169,7 @@
 
         lea             rsi,            [rsi + rax]
 
-filter_block2d_bil_sp_only_loop:
+.filter_block2d_bil_sp_only_loop:
         movdqu          xmm3,           XMMWORD PTR [rsi]
         movdqa          xmm2,           xmm1
         movdqa          xmm0,           xmm3
@@ -209,11 +209,11 @@
 %endif
 
         sub             rcx,            1
-        jnz             filter_block2d_bil_sp_only_loop
+        jnz             .filter_block2d_bil_sp_only_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_var_ssse3_full_pixel:
+.filter_block2d_bil_var_ssse3_full_pixel:
         mov             rsi,            arg(0)               ;ref_ptr
         mov             rdi,            arg(2)               ;src_ptr
         movsxd          rcx,            dword ptr arg(4)     ;Height
@@ -221,7 +221,7 @@
         movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
         pxor            xmm0,           xmm0
 
-filter_block2d_bil_full_pixel_loop:
+.filter_block2d_bil_full_pixel_loop:
         movq            xmm1,           QWORD PTR [rsi]
         punpcklbw       xmm1,           xmm0
         movq            xmm2,           QWORD PTR [rsi+8]
@@ -244,11 +244,11 @@
         lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
         lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
         sub             rcx,            1
-        jnz             filter_block2d_bil_full_pixel_loop
+        jnz             .filter_block2d_bil_full_pixel_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_var_ssse3_fp_only:
+.filter_block2d_bil_var_ssse3_fp_only:
         mov             rsi,            arg(0)               ;ref_ptr
         mov             rdi,            arg(2)               ;src_ptr
         movsxd          rcx,            dword ptr arg(4)     ;Height
@@ -260,7 +260,7 @@
         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
 %endif
 
-filter_block2d_bil_fp_only_loop:
+.filter_block2d_bil_fp_only_loop:
         movdqu          xmm1,           XMMWORD PTR [rsi]
         movdqu          xmm2,           XMMWORD PTR [rsi+1]
         movdqa          xmm3,           xmm1
@@ -298,11 +298,11 @@
 %endif
 
         sub             rcx,            1
-        jnz             filter_block2d_bil_fp_only_loop
+        jnz             .filter_block2d_bil_fp_only_loop
 
-        jmp             filter_block2d_bil_variance
+        jmp             .filter_block2d_bil_variance
 
-filter_block2d_bil_variance:
+.filter_block2d_bil_variance:
         pxor        xmm0,           xmm0
         pxor        xmm1,           xmm1
         pxor        xmm5,           xmm5
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index af6c4d2..4b41b54 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -140,6 +140,8 @@
 extern prototype_variance(vp8_mse16x16_wmt);
 extern prototype_variance2(vp8_get8x8var_sse2);
 extern prototype_variance2(vp8_get16x16var_sse2);
+extern prototype_ssimpf(vp8_ssim_parms_8x8_sse2)
+extern prototype_ssimpf(vp8_ssim_parms_16x16_sse2)
 
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_variance_sad4x4
@@ -208,6 +210,14 @@
 #undef  vp8_variance_mse16x16
 #define vp8_variance_mse16x16 vp8_mse16x16_wmt
 
+#if ARCH_X86_64
+#undef  vp8_ssimpf_8x8
+#define vp8_ssimpf_8x8 vp8_ssim_parms_8x8_sse2
+
+#undef  vp8_ssimpf_16x16
+#define vp8_ssimpf_16x16 vp8_ssim_parms_16x16_sse2
+#endif
+
 #endif
 #endif
 
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index badb9f0..36b7b71 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -111,29 +111,6 @@
 
 #endif
 
-#if HAVE_SSSE3
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
-typedef void ssimpf
-(
-    unsigned char *s,
-    int sp,
-    unsigned char *r,
-    int rp,
-    unsigned long *sum_s,
-    unsigned long *sum_r,
-    unsigned long *sum_sq_s,
-    unsigned long *sum_sq_r,
-    unsigned long *sum_sxr
-);
-
-extern ssimpf vp8_ssim_parms_16x16_sse3;
-extern ssimpf vp8_ssim_parms_8x8_sse3;
-#endif
-#endif
-#endif
-
-
 void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 {
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -246,6 +223,13 @@
 #if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
 #endif
+
+#if CONFIG_INTERNAL_STATS
+#if ARCH_X86_64
+        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse2;
+        cpi->rtcd.variance.ssimpf_16x16          = vp8_ssim_parms_16x16_sse2;
+#endif
+#endif
     }
 #endif
 
@@ -280,14 +264,6 @@
         cpi->rtcd.variance.subpixvar16x16        = vp8_sub_pixel_variance16x16_ssse3;
 
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_ssse3;
-
-#if CONFIG_INTERNAL_STATS
-#if ARCH_X86_64
-        cpi->rtcd.variance.ssimpf_8x8            = vp8_ssim_parms_8x8_sse3;
-        cpi->rtcd.variance.ssimpf                = vp8_ssim_parms_16x16_sse3;
-#endif
-#endif
-
     }
 #endif
 
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 053ecae..9ec24d5 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -19,8 +19,6 @@
 VP8_COMMON_SRCS-yes += common/blockd.c
 VP8_COMMON_SRCS-yes += common/coefupdateprobs.h
 VP8_COMMON_SRCS-yes += common/debugmodes.c
-VP8_COMMON_SRCS-yes += common/defaultcoefcounts.h
-VP8_COMMON_SRCS-yes += common/defaultcoefcounts.c
 VP8_COMMON_SRCS-yes += common/entropy.c
 VP8_COMMON_SRCS-yes += common/entropymode.c
 VP8_COMMON_SRCS-yes += common/entropymv.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index d46d99d..b71a54a 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -34,6 +34,7 @@
 #INCLUDES += encoder
 
 VP8_CX_SRCS-yes += encoder/asm_enc_offsets.c
+VP8_CX_SRCS-yes += encoder/defaultcoefcounts.h
 VP8_CX_SRCS-yes += encoder/bitstream.c
 VP8_CX_SRCS-yes += encoder/boolhuff.c
 VP8_CX_SRCS-yes += encoder/dct.c