Merge "postproc: Add mode and reference frame visualizers."
diff --git a/AUTHORS b/AUTHORS
index 6708d5a..110e5e1 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,6 +1,7 @@
 # This file is automatically generated from the git commit history
 # by tools/gen_authors.sh.
 
+Aaron Watry <awatry@gmail.com>
 Adrian Grange <agrange@google.com>
 Alex Converse <alex.converse@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
@@ -20,6 +21,7 @@
 Justin Lebar <justin.lebar@gmail.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
+Martin Ettl <ettl.martin78@googlemail.com>
 Michael Kohler <michaelkohler@live.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
diff --git a/vp8/common/arm/arm_systemdependent.c b/vp8/common/arm/arm_systemdependent.c
index fe62fae..83921f8 100644
--- a/vp8/common/arm/arm_systemdependent.c
+++ b/vp8/common/arm/arm_systemdependent.c
@@ -105,6 +105,8 @@
         rtcd->recon.recon       = vp8_recon_b_neon;
         rtcd->recon.recon2      = vp8_recon2b_neon;
         rtcd->recon.recon4      = vp8_recon4b_neon;
+        rtcd->recon.recon_mb    = vp8_recon_mb_neon;
+
     }
 #endif
 
diff --git a/vp8/common/arm/neon/recon_neon.c b/vp8/common/arm/neon/recon_neon.c
new file mode 100644
index 0000000..3b2df4c
--- /dev/null
+++ b/vp8/common/arm/neon/recon_neon.c
@@ -0,0 +1,29 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "vpx_ports/config.h"
+#include "recon.h"
+#include "blockd.h"
+
+extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
+
+void vp8_recon_mb_neon(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+{
+    unsigned char *pred_ptr = &x->predictor[0];
+    short *diff_ptr = &x->diff[0];
+    unsigned char *dst_ptr = x->dst.y_buffer;
+    unsigned char *udst_ptr = x->dst.u_buffer;
+    unsigned char *vdst_ptr = x->dst.v_buffer;
+    int ystride = x->dst.y_stride;
+    /* rtcd and x->dst.uv_stride are unused: the NEON routine takes only the luma stride. */
+
+    vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
+}
diff --git a/vp8/common/arm/recon_arm.c b/vp8/common/arm/recon_arm.c
deleted file mode 100644
index 218898b..0000000
--- a/vp8/common/arm/recon_arm.c
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "recon.h"
-#include "blockd.h"
-
-extern void vp8_recon16x16mb_neon(unsigned char *pred_ptr, short *diff_ptr, unsigned char *dst_ptr, int ystride, unsigned char *udst_ptr, unsigned char *vdst_ptr);
-
-/*
-void vp8_recon16x16mby(MACROBLOCKD *x)
-{
-    int i;
-    for(i=0;i<16;i+=4)
-    {
-        //vp8_recon4b(&x->block[i]);
-        BLOCKD *b = &x->block[i];
-        vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    }
-}
-*/
-void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    BLOCKD *b = &x->block[0];
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
-    //b = &x->block[4];
-    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
-    //b = &x->block[8];
-    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
-    //b = &x->block[12];
-    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-
-#if HAVE_ARMV7
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    unsigned char *pred_ptr = &x->predictor[0];
-    short *diff_ptr = &x->diff[0];
-    unsigned char *dst_ptr = x->dst.y_buffer;
-    unsigned char *udst_ptr = x->dst.u_buffer;
-    unsigned char *vdst_ptr = x->dst.v_buffer;
-    int ystride = x->dst.y_stride;
-    //int uv_stride = x->dst.uv_stride;
-
-    vp8_recon16x16mb_neon(pred_ptr, diff_ptr, dst_ptr, ystride, udst_ptr, vdst_ptr);
-}
-
-#else
-/*
-void vp8_recon16x16mb(MACROBLOCKD *x)
-{
-    int i;
-
-    for(i=0;i<16;i+=4)
-    {
-//      vp8_recon4b(&x->block[i]);
-        BLOCKD *b = &x->block[i];
-        vp8_recon4b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-
-    }
-    for(i=16;i<24;i+=2)
-    {
-//      vp8_recon2b(&x->block[i]);
-        BLOCKD *b = &x->block[i];
-        vp8_recon2b (b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    }
-}
-*/
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    BLOCKD *b = &x->block[0];
-
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b += 4;
-    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b += 4;
-
-    //b = &x->block[16];
-
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b++;
-    b++;
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b++;
-    b++;
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    b++;
-    b++;
-    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-}
-#endif
diff --git a/vp8/common/arm/recon_arm.h b/vp8/common/arm/recon_arm.h
index c30f6dc..bf32e62 100644
--- a/vp8/common/arm/recon_arm.h
+++ b/vp8/common/arm/recon_arm.h
@@ -51,6 +51,8 @@
 extern prototype_copy_block(vp8_copy_mem8x4_neon);
 extern prototype_copy_block(vp8_copy_mem16x16_neon);
 
+extern prototype_recon_macroblock(vp8_recon_mb_neon);
+
 #if !CONFIG_RUNTIME_CPU_DETECT
 #undef  vp8_recon_recon
 #define vp8_recon_recon vp8_recon_b_neon
@@ -69,6 +71,9 @@
 
 #undef  vp8_recon_copy16x16
 #define vp8_recon_copy16x16 vp8_copy_mem16x16_neon
+
+#undef  vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp8_recon_mb_neon
 #endif
 #endif
 
diff --git a/vp8/common/arm/reconintra4x4_arm.c b/vp8/common/arm/reconintra4x4_arm.c
deleted file mode 100644
index 8d968d7..0000000
--- a/vp8/common/arm/reconintra4x4_arm.c
+++ /dev/null
@@ -1,409 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_ports/config.h"
-#include "recon.h"
-#include "vpx_mem/vpx_mem.h"
-#include "reconintra.h"
-
-void vp8_predict_intra4x4(BLOCKD *x,
-                          int b_mode,
-                          unsigned char *predictor)
-{
-    int i, r, c;
-
-    unsigned char *Above = *(x->base_dst) + x->dst - x->dst_stride;
-    unsigned char Left[4];
-    unsigned char top_left = Above[-1];
-
-    Left[0] = (*(x->base_dst))[x->dst - 1];
-    Left[1] = (*(x->base_dst))[x->dst - 1 + x->dst_stride];
-    Left[2] = (*(x->base_dst))[x->dst - 1 + 2 * x->dst_stride];
-    Left[3] = (*(x->base_dst))[x->dst - 1 + 3 * x->dst_stride];
-
-    switch (b_mode)
-    {
-    case B_DC_PRED:
-    {
-        int expected_dc = 0;
-
-        for (i = 0; i < 4; i++)
-        {
-            expected_dc += Above[i];
-            expected_dc += Left[i];
-        }
-
-        expected_dc = (expected_dc + 4) >> 3;
-
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-                predictor[c] = expected_dc;
-            }
-
-            predictor += 16;
-        }
-    }
-    break;
-    case B_TM_PRED:
-    {
-        // prediction similar to true_motion prediction
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-                int pred = Above[c] - top_left + Left[r];
-
-                if (pred < 0)
-                    pred = 0;
-
-                if (pred > 255)
-                    pred = 255;
-
-                predictor[c] = pred;
-            }
-
-            predictor += 16;
-        }
-    }
-    break;
-
-    case B_VE_PRED:
-    {
-
-        unsigned int ap[4];
-        ap[0] = (top_left  + 2 * Above[0] + Above[1] + 2) >> 2;
-        ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2;
-        ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2;
-        ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2;
-
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-
-                predictor[c] = ap[c];
-            }
-
-            predictor += 16;
-        }
-
-    }
-    break;
-
-
-    case B_HE_PRED:
-    {
-
-        unsigned int lp[4];
-        lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2;
-        lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2;
-        lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2;
-        lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2;
-
-        for (r = 0; r < 4; r++)
-        {
-            for (c = 0; c < 4; c++)
-            {
-                predictor[c] = lp[r];
-            }
-
-            predictor += 16;
-        }
-    }
-    break;
-    case B_LD_PRED:
-    {
-        unsigned char *ptr = Above;
-        predictor[0 * 16 + 0] = (ptr[0] + ptr[1] * 2 + ptr[2] + 2) >> 2;
-        predictor[0 * 16 + 1] =
-            predictor[1 * 16 + 0] = (ptr[1] + ptr[2] * 2 + ptr[3] + 2) >> 2;
-        predictor[0 * 16 + 2] =
-            predictor[1 * 16 + 1] =
-                predictor[2 * 16 + 0] = (ptr[2] + ptr[3] * 2 + ptr[4] + 2) >> 2;
-        predictor[0 * 16 + 3] =
-            predictor[1 * 16 + 2] =
-                predictor[2 * 16 + 1] =
-                    predictor[3 * 16 + 0] = (ptr[3] + ptr[4] * 2 + ptr[5] + 2) >> 2;
-        predictor[1 * 16 + 3] =
-            predictor[2 * 16 + 2] =
-                predictor[3 * 16 + 1] = (ptr[4] + ptr[5] * 2 + ptr[6] + 2) >> 2;
-        predictor[2 * 16 + 3] =
-            predictor[3 * 16 + 2] = (ptr[5] + ptr[6] * 2 + ptr[7] + 2) >> 2;
-        predictor[3 * 16 + 3] = (ptr[6] + ptr[7] * 2 + ptr[7] + 2) >> 2;
-
-    }
-    break;
-    case B_RD_PRED:
-    {
-
-        unsigned char pp[9];
-
-        pp[0] = Left[3];
-        pp[1] = Left[2];
-        pp[2] = Left[1];
-        pp[3] = Left[0];
-        pp[4] = top_left;
-        pp[5] = Above[0];
-        pp[6] = Above[1];
-        pp[7] = Above[2];
-        pp[8] = Above[3];
-
-        predictor[3 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        predictor[3 * 16 + 1] =
-            predictor[2 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        predictor[3 * 16 + 2] =
-            predictor[2 * 16 + 1] =
-                predictor[1 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        predictor[3 * 16 + 3] =
-            predictor[2 * 16 + 2] =
-                predictor[1 * 16 + 1] =
-                    predictor[0 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        predictor[2 * 16 + 3] =
-            predictor[1 * 16 + 2] =
-                predictor[0 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        predictor[1 * 16 + 3] =
-            predictor[0 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-        predictor[0 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-
-    }
-    break;
-    case B_VR_PRED:
-    {
-
-        unsigned char pp[9];
-
-        pp[0] = Left[3];
-        pp[1] = Left[2];
-        pp[2] = Left[1];
-        pp[3] = Left[0];
-        pp[4] = top_left;
-        pp[5] = Above[0];
-        pp[6] = Above[1];
-        pp[7] = Above[2];
-        pp[8] = Above[3];
-
-
-        predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        predictor[2 * 16 + 0] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        predictor[3 * 16 + 1] =
-            predictor[1 * 16 + 0] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        predictor[2 * 16 + 1] =
-            predictor[0 * 16 + 0] = (pp[4] + pp[5] + 1) >> 1;
-        predictor[3 * 16 + 2] =
-            predictor[1 * 16 + 1] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        predictor[2 * 16 + 2] =
-            predictor[0 * 16 + 1] = (pp[5] + pp[6] + 1) >> 1;
-        predictor[3 * 16 + 3] =
-            predictor[1 * 16 + 2] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-        predictor[2 * 16 + 3] =
-            predictor[0 * 16 + 2] = (pp[6] + pp[7] + 1) >> 1;
-        predictor[1 * 16 + 3] = (pp[6] + pp[7] * 2 + pp[8] + 2) >> 2;
-        predictor[0 * 16 + 3] = (pp[7] + pp[8] + 1) >> 1;
-
-    }
-    break;
-    case B_VL_PRED:
-    {
-
-        unsigned char *pp = Above;
-
-        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-        predictor[1 * 16 + 0] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        predictor[2 * 16 + 0] =
-            predictor[0 * 16 + 1] = (pp[1] + pp[2] + 1) >> 1;
-        predictor[1 * 16 + 1] =
-            predictor[3 * 16 + 0] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        predictor[2 * 16 + 1] =
-            predictor[0 * 16 + 2] = (pp[2] + pp[3] + 1) >> 1;
-        predictor[3 * 16 + 1] =
-            predictor[1 * 16 + 2] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        predictor[0 * 16 + 3] =
-            predictor[2 * 16 + 2] = (pp[3] + pp[4] + 1) >> 1;
-        predictor[1 * 16 + 3] =
-            predictor[3 * 16 + 2] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        predictor[2 * 16 + 3] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        predictor[3 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-    case B_HD_PRED:
-    {
-        unsigned char pp[9];
-        pp[0] = Left[3];
-        pp[1] = Left[2];
-        pp[2] = Left[1];
-        pp[3] = Left[0];
-        pp[4] = top_left;
-        pp[5] = Above[0];
-        pp[6] = Above[1];
-        pp[7] = Above[2];
-        pp[8] = Above[3];
-
-
-        predictor[3 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-        predictor[3 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        predictor[2 * 16 + 0] =
-            predictor[3 * 16 + 2] = (pp[1] + pp[2] + 1) >> 1;
-        predictor[2 * 16 + 1] =
-            predictor[3 * 16 + 3] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        predictor[2 * 16 + 2] =
-            predictor[1 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-        predictor[2 * 16 + 3] =
-            predictor[1 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[4] + 2) >> 2;
-        predictor[1 * 16 + 2] =
-            predictor[0 * 16 + 0] = (pp[3] + pp[4] + 1) >> 1;
-        predictor[1 * 16 + 3] =
-            predictor[0 * 16 + 1] = (pp[3] + pp[4] * 2 + pp[5] + 2) >> 2;
-        predictor[0 * 16 + 2] = (pp[4] + pp[5] * 2 + pp[6] + 2) >> 2;
-        predictor[0 * 16 + 3] = (pp[5] + pp[6] * 2 + pp[7] + 2) >> 2;
-    }
-    break;
-
-
-    case B_HU_PRED:
-    {
-        unsigned char *pp = Left;
-        predictor[0 * 16 + 0] = (pp[0] + pp[1] + 1) >> 1;
-        predictor[0 * 16 + 1] = (pp[0] + pp[1] * 2 + pp[2] + 2) >> 2;
-        predictor[0 * 16 + 2] =
-            predictor[1 * 16 + 0] = (pp[1] + pp[2] + 1) >> 1;
-        predictor[0 * 16 + 3] =
-            predictor[1 * 16 + 1] = (pp[1] + pp[2] * 2 + pp[3] + 2) >> 2;
-        predictor[1 * 16 + 2] =
-            predictor[2 * 16 + 0] = (pp[2] + pp[3] + 1) >> 1;
-        predictor[1 * 16 + 3] =
-            predictor[2 * 16 + 1] = (pp[2] + pp[3] * 2 + pp[3] + 2) >> 2;
-        predictor[2 * 16 + 2] =
-            predictor[2 * 16 + 3] =
-                predictor[3 * 16 + 0] =
-                    predictor[3 * 16 + 1] =
-                        predictor[3 * 16 + 2] =
-                            predictor[3 * 16 + 3] = pp[3];
-    }
-    break;
-
-
-    }
-}
-// copy 4 bytes from the above right down so that the 4x4 prediction modes using pixels above and
-// to the right prediction have filled in pixels to use.
-void vp8_intra_prediction_down_copy(MACROBLOCKD *x)
-{
-    unsigned char *above_right = *(x->block[0].base_dst) + x->block[0].dst - x->block[0].dst_stride + 16;
-
-    unsigned int *src_ptr = (unsigned int *)above_right;
-    unsigned int *dst_ptr0 = (unsigned int *)(above_right + 4 * x->block[0].dst_stride);
-    unsigned int *dst_ptr1 = (unsigned int *)(above_right + 8 * x->block[0].dst_stride);
-    unsigned int *dst_ptr2 = (unsigned int *)(above_right + 12 * x->block[0].dst_stride);
-
-    *dst_ptr0 = *src_ptr;
-    *dst_ptr1 = *src_ptr;
-    *dst_ptr2 = *src_ptr;
-}
-
-
-
-/*
-void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-
-    vp8_intra_prediction_down_copy(x);
-
-    for(i=0;i<16;i++)
-    {
-        BLOCKD *b = &x->block[i];
-
-        vp8_predict_intra4x4(b, x->block[i].bmi.mode,x->block[i].predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    }
-
-    vp8_recon_intra_mbuv(x);
-
-}
-*/
-void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
-{
-    int i;
-    BLOCKD *b = &x->block[0];
-
-    vp8_intra_prediction_down_copy(x);
-
-    {
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-        b += 1;
-
-        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
-        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
-    }
-
-    vp8_recon_intra_mbuv(rtcd, x);
-
-}
diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c
index 0ef375e..0c9b77e 100644
--- a/vp8/common/generic/systemdependent.c
+++ b/vp8/common/generic/systemdependent.c
@@ -40,9 +40,11 @@
     rtcd->recon.copy16x16   = vp8_copy_mem16x16_c;
     rtcd->recon.copy8x8     = vp8_copy_mem8x8_c;
     rtcd->recon.copy8x4     = vp8_copy_mem8x4_c;
-    rtcd->recon.recon      = vp8_recon_b_c;
+    rtcd->recon.recon       = vp8_recon_b_c;
     rtcd->recon.recon2      = vp8_recon2b_c;
-    rtcd->recon.recon4     = vp8_recon4b_c;
+    rtcd->recon.recon4      = vp8_recon4b_c;
+    rtcd->recon.recon_mb    = vp8_recon_mb_c;
+    rtcd->recon.recon_mby   = vp8_recon_mby_c;
 
     rtcd->subpix.sixtap16x16   = vp8_sixtap_predict16x16_c;
     rtcd->subpix.sixtap8x8     = vp8_sixtap_predict8x8_c;
diff --git a/vp8/common/recon.c b/vp8/common/recon.c
index 0b439e0..2cb3bc6 100644
--- a/vp8/common/recon.c
+++ b/vp8/common/recon.c
@@ -106,8 +106,24 @@
     }
 }
 
-void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+void vp8_recon_mby_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
+#if ARCH_ARM
+    BLOCKD *b = &x->block[0];
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+    //b = &x->block[4];
+    b += 4;
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+    //b = &x->block[8];
+    b += 4;
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+
+    //b = &x->block[12];
+    b += 4;
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+#else
     int i;
 
     for (i = 0; i < 16; i += 4)
@@ -116,10 +132,36 @@
 
         RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
     }
+#endif
 }
 
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
+void vp8_recon_mb_c(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x)
 {
+#if ARCH_ARM
+    BLOCKD *b = &x->block[0];
+
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b += 4;
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b += 4;
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b += 4;
+    RECON_INVOKE(rtcd, recon4)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b += 4;
+
+    //b = &x->block[16];
+
+    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b++;
+    b++;
+    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b++;
+    b++;
+    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    b++;
+    b++;
+    RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+#else
     int i;
 
     for (i = 0; i < 16; i += 4)
@@ -135,4 +177,5 @@
 
         RECON_INVOKE(rtcd, recon2)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
     }
+#endif
 }
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index e34a63c..c432c7b 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -12,12 +12,19 @@
 #ifndef __INC_RECON_H
 #define __INC_RECON_H
 
+#include "blockd.h"
+
 #define prototype_copy_block(sym) \
     void sym(unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch)
 
 #define prototype_recon_block(sym) \
     void sym(unsigned char *pred, short *diff, unsigned char *dst, int pitch);
 
+#define prototype_recon_macroblock(sym) \
+    void sym(const struct vp8_recon_rtcd_vtable *rtcd, MACROBLOCKD *x);
+
+struct vp8_recon_rtcd_vtable;
+
 #if ARCH_X86 || ARCH_X86_64
 #include "x86/recon_x86.h"
 #endif
@@ -56,9 +63,20 @@
 #endif
 extern prototype_recon_block(vp8_recon_recon4);
 
+#ifndef vp8_recon_recon_mb
+#define vp8_recon_recon_mb vp8_recon_mb_c
+#endif
+extern prototype_recon_macroblock(vp8_recon_recon_mb);
+
+#ifndef vp8_recon_recon_mby
+#define vp8_recon_recon_mby vp8_recon_mby_c
+#endif
+extern prototype_recon_macroblock(vp8_recon_recon_mby);
+
 typedef prototype_copy_block((*vp8_copy_block_fn_t));
 typedef prototype_recon_block((*vp8_recon_fn_t));
-typedef struct
+typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t));
+typedef struct vp8_recon_rtcd_vtable
 {
     vp8_copy_block_fn_t  copy16x16;
     vp8_copy_block_fn_t  copy8x8;
@@ -66,6 +84,8 @@
     vp8_recon_fn_t       recon;
     vp8_recon_fn_t       recon2;
     vp8_recon_fn_t       recon4;
+    vp8_recon_mb_fn_t    recon_mb;
+    vp8_recon_mb_fn_t    recon_mby;
 } vp8_recon_rtcd_vtable_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -74,9 +94,6 @@
 #define RECON_INVOKE(ctx,fn) vp8_recon_##fn
 #endif
 
-#include "blockd.h"
-void vp8_recon16x16mby(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
-void vp8_recon16x16mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 void vp8_recon_intra4x4mb(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *x);
 #endif
diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c
index c6e5fe7..3151351 100644
--- a/vp8/common/reconintra4x4.c
+++ b/vp8/common/reconintra4x4.c
@@ -318,6 +318,74 @@
 
     vp8_intra_prediction_down_copy(x);
 
+#if ARCH_ARM
+    {
+        BLOCKD *b = &x->block[0];
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+        b += 1;
+
+        vp8_predict_intra4x4(b, b->bmi.mode, b->predictor);
+        RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
+    }
+#else
     for (i = 0; i < 16; i++)
     {
         BLOCKD *b = &x->block[i];
@@ -325,6 +393,7 @@
         vp8_predict_intra4x4(b, x->block[i].bmi.mode, x->block[i].predictor);
         RECON_INVOKE(rtcd, recon)(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
     }
+#endif
 
     vp8_recon_intra_mbuv(rtcd, x);
 
diff --git a/vp8/encoder/arm/mcomp_arm.c b/vp8/encoder/arm/mcomp_arm.c
deleted file mode 100644
index 5635832..0000000
--- a/vp8/encoder/arm/mcomp_arm.c
+++ /dev/null
@@ -1,1380 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "mcomp.h"
-#include "vpx_mem/vpx_mem.h"
-
-#include <stdio.h>
-#include <limits.h>
-#include <math.h>
-
-#ifdef ENTROPY_STATS
-static int mv_ref_ct [31] [4] [2];
-static int mv_mode_cts [4] [2];
-#endif
-
-static int mv_bits_sadcost[256];
-
-extern unsigned int vp8_sub_pixel_variance16x16s_neon
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-);
-extern unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-);
-extern unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-);
-extern unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
-(
-    unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-);
-
-void vp8cx_init_mv_bits_sadcost()
-{
-    int i;
-
-    for (i = 0; i < 256; i++)
-    {
-        mv_bits_sadcost[i] = (int)sqrt(i * 16);
-    }
-}
-
-
-int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight)
-{
-    // MV costing is based on the distribution of vectors in the previous frame and as such will tend to
-    // over state the cost of vectors. In addition coding a new vector can have a knock on effect on the
-    // cost of subsequent vectors and the quality of prediction from NEAR and NEAREST for subsequent blocks.
-    // The "Weight" parameter allows, to a limited extent, for some account to be taken of these factors.
-    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * Weight) >> 7;
-}
-
-int vp8_mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
-{
-    //int i;
-    //return ((mvcost[0][(mv->row - ref->row)>>1] + mvcost[1][(mv->col - ref->col)>>1] + 128) * error_per_bit) >> 8;
-    //return ( (vp8_mv_bit_cost(mv,  ref, mvcost, 100) + 128) * error_per_bit) >> 8;
-
-    //i = (vp8_mv_bit_cost(mv,  ref, mvcost, 100) * error_per_bit + 128) >> 8;
-    return ((mvcost[0][(mv->row - ref->row) >> 1] + mvcost[1][(mv->col - ref->col) >> 1]) * error_per_bit + 128) >> 8;
-    //return (vp8_mv_bit_cost(mv,  ref, mvcost, 128) * error_per_bit + 128) >> 8;
-}
-
-
-static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
-{
-    // get the estimated number of bits for a motion vector, to be used for costing in SAD based
-    // motion estimation
-    return ((mvcost[0][(mv->row - ref->row) >> 1]  +  mvcost[1][(mv->col - ref->col)>> 1]) + 128) >> 8;
-}
-
-void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride)
-{
-    int Len;
-    int search_site_count = 0;
-
-
-    // Generate offsets for 4 search sites per step.
-    Len = MAX_FIRST_STEP;
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = 0;
-    search_site_count++;
-
-    while (Len > 0)
-    {
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = 0;
-        x->ss[search_site_count].mv.row = -Len;
-        x->ss[search_site_count].offset = -Len * stride;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = 0;
-        x->ss[search_site_count].mv.row = Len;
-        x->ss[search_site_count].offset = Len * stride;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = -Len;
-        x->ss[search_site_count].mv.row = 0;
-        x->ss[search_site_count].offset = -Len;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = Len;
-        x->ss[search_site_count].mv.row = 0;
-        x->ss[search_site_count].offset = Len;
-        search_site_count++;
-
-        // Contract.
-        Len /= 2;
-    }
-
-    x->ss_count = search_site_count;
-    x->searches_per_step = 4;
-}
-
-void vp8_init3smotion_compensation(MACROBLOCK *x, int stride)
-{
-    int Len;
-    int search_site_count = 0;
-
-    // Generate offsets for 8 search sites per step.
-    Len = MAX_FIRST_STEP;
-    x->ss[search_site_count].mv.col = 0;
-    x->ss[search_site_count].mv.row = 0;
-    x->ss[search_site_count].offset = 0;
-    search_site_count++;
-
-    while (Len > 0)
-    {
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = 0;
-        x->ss[search_site_count].mv.row = -Len;
-        x->ss[search_site_count].offset = -Len * stride;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = 0;
-        x->ss[search_site_count].mv.row = Len;
-        x->ss[search_site_count].offset = Len * stride;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = -Len;
-        x->ss[search_site_count].mv.row = 0;
-        x->ss[search_site_count].offset = -Len;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = Len;
-        x->ss[search_site_count].mv.row = 0;
-        x->ss[search_site_count].offset = Len;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = -Len;
-        x->ss[search_site_count].mv.row = -Len;
-        x->ss[search_site_count].offset = -Len * stride - Len;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = Len;
-        x->ss[search_site_count].mv.row = -Len;
-        x->ss[search_site_count].offset = -Len * stride + Len;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = -Len;
-        x->ss[search_site_count].mv.row = Len;
-        x->ss[search_site_count].offset = Len * stride - Len;
-        search_site_count++;
-
-        // Compute offsets for search sites.
-        x->ss[search_site_count].mv.col = Len;
-        x->ss[search_site_count].mv.row = Len;
-        x->ss[search_site_count].offset = Len * stride + Len;
-        search_site_count++;
-
-
-        // Contract.
-        Len /= 2;
-    }
-
-    x->ss_count = search_site_count;
-    x->searches_per_step = 8;
-}
-
-
-#define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector
-#define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
-#define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
-#define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
-#define MIN(x,y) (((x)<(y))?(x):(y))
-#define MAX(x,y) (((x)>(y))?(x):(y))
-
-//#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
-
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
-{
-    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
-    unsigned char *z = (*(b->base_src) + b->src);
-
-    int rr = ref_mv->row >> 1, rc = ref_mv->col >> 1;
-    int br = bestmv->row << 2, bc = bestmv->col << 2;
-    int tr = br, tc = bc;
-    unsigned int besterr = INT_MAX;
-    unsigned int left, right, up, down, diag;
-    unsigned int sse;
-    unsigned int whichdir;
-    unsigned int halfiters = 4;
-    unsigned int quarteriters = 4;
-
-    int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1));
-    int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1));
-    int minr = MAX(x->mv_row_min << 2, (ref_mv->row >> 1) - ((1 << mvlong_width) - 1));
-    int maxr = MIN(x->mv_row_max << 2, (ref_mv->row >> 1) + ((1 << mvlong_width) - 1));
-
-    // central mv
-    bestmv->row <<= 3;
-    bestmv->col <<= 3;
-
-    // calculate central point error
-    besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
-    besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
-
-    // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
-    while (--halfiters)
-    {
-        // 1/2 pel
-        CHECK_BETTER(left, tr, tc - 2);
-        CHECK_BETTER(right, tr, tc + 2);
-        CHECK_BETTER(up, tr - 2, tc);
-        CHECK_BETTER(down, tr + 2, tc);
-
-        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-        switch (whichdir)
-        {
-        case 0:
-            CHECK_BETTER(diag, tr - 2, tc - 2);
-            break;
-        case 1:
-            CHECK_BETTER(diag, tr - 2, tc + 2);
-            break;
-        case 2:
-            CHECK_BETTER(diag, tr + 2, tc - 2);
-            break;
-        case 3:
-            CHECK_BETTER(diag, tr + 2, tc + 2);
-            break;
-        }
-
-        // no reason to check the same one again.
-        if (tr == br && tc == bc)
-            break;
-
-        tr = br;
-        tc = bc;
-    }
-
-    // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
-    // 1/4 pel
-    while (--quarteriters)
-    {
-        CHECK_BETTER(left, tr, tc - 1);
-        CHECK_BETTER(right, tr, tc + 1);
-        CHECK_BETTER(up, tr - 1, tc);
-        CHECK_BETTER(down, tr + 1, tc);
-
-        whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-        switch (whichdir)
-        {
-        case 0:
-            CHECK_BETTER(diag, tr - 1, tc - 1);
-            break;
-        case 1:
-            CHECK_BETTER(diag, tr - 1, tc + 1);
-            break;
-        case 2:
-            CHECK_BETTER(diag, tr + 1, tc - 1);
-            break;
-        case 3:
-            CHECK_BETTER(diag, tr + 1, tc + 1);
-            break;
-        }
-
-        // no reason to check the same one again.
-        if (tr == br && tc == bc)
-            break;
-
-        tr = br;
-        tc = bc;
-    }
-
-    bestmv->row = br << 1;
-    bestmv->col = bc << 1;
-
-    if ((abs(bestmv->col - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs(bestmv->row - ref_mv->row) > MAX_FULL_PEL_VAL))
-        return INT_MAX;
-
-    return besterr;
-}
-#undef MVC
-#undef PRE
-#undef SP
-#undef DIST
-#undef ERR
-#undef CHECK_BETTER
-#undef MIN
-#undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
-{
-    int bestmse = INT_MAX;
-    MV startmv;
-    //MV this_mv;
-    MV this_mv;
-    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
-    unsigned char *z = (*(b->base_src) + b->src);
-    int left, right, up, down, diag;
-    unsigned int sse;
-    int whichdir ;
-
-
-    // Trap uncodable vectors
-    if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
-    {
-        bestmv->row <<= 3;
-        bestmv->col <<= 3;
-        return INT_MAX;
-    }
-
-    // central mv
-    bestmv->row <<= 3;
-    bestmv->col <<= 3;
-    startmv = *bestmv;
-
-    // calculate central point error
-    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
-    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
-
-    // go left then right and check error
-    this_mv.row = startmv.row;
-    this_mv.col = ((startmv.col - 8) | 4);
-    left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (left < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = left;
-    }
-
-    this_mv.col += 8;
-    right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
-    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (right < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = right;
-    }
-
-    // go up then down and check error
-    this_mv.col = startmv.col;
-    this_mv.row = ((startmv.row - 8) | 4);
-    up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (up < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = up;
-    }
-
-    this_mv.row += 8;
-    down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
-    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (down < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = down;
-    }
-
-
-    // now check 1 more diagonal
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-    //for(whichdir =0;whichdir<4;whichdir++)
-    //{
-    this_mv = startmv;
-
-    switch (whichdir)
-    {
-    case 0:
-        this_mv.col = (this_mv.col - 8) | 4;
-        this_mv.row = (this_mv.row - 8) | 4;
-        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-        break;
-    case 1:
-        this_mv.col += 4;
-        this_mv.row = (this_mv.row - 8) | 4;
-        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-        break;
-    case 2:
-        this_mv.col = (this_mv.col - 8) | 4;
-        this_mv.row += 4;
-        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
-        break;
-    case 3:
-        this_mv.col += 4;
-        this_mv.row += 4;
-        diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
-        break;
-    }
-
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-//  }
-
-
-    // time to check quarter pels.
-    if (bestmv->row < startmv.row)
-        y -= d->pre_stride;
-
-    if (bestmv->col < startmv.col)
-        y--;
-
-    startmv = *bestmv;
-
-
-
-    // go left then right and check error
-    this_mv.row = startmv.row;
-
-    if (startmv.col & 7)
-    {
-        this_mv.col = startmv.col - 2;
-        left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    }
-    else
-    {
-        this_mv.col = (startmv.col - 8) | 6;
-        left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
-    }
-
-    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (left < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = left;
-    }
-
-    this_mv.col += 4;
-    right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (right < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = right;
-    }
-
-    // go up then down and check error
-    this_mv.col = startmv.col;
-
-    if (startmv.row & 7)
-    {
-        this_mv.row = startmv.row - 2;
-        up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    }
-    else
-    {
-        this_mv.row = (startmv.row - 8) | 6;
-        up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
-    }
-
-    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (up < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = up;
-    }
-
-    this_mv.row += 4;
-    down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (down < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = down;
-    }
-
-
-    // now check 1 more diagonal
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-
-//  for(whichdir=0;whichdir<4;whichdir++)
-//  {
-    this_mv = startmv;
-
-    switch (whichdir)
-    {
-    case 0:
-
-        if (startmv.row & 7)
-        {
-            this_mv.row -= 2;
-
-            if (startmv.col & 7)
-            {
-                this_mv.col -= 2;
-                diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-            }
-            else
-            {
-                this_mv.col = (startmv.col - 8) | 6;
-                diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
-            }
-        }
-        else
-        {
-            this_mv.row = (startmv.row - 8) | 6;
-
-            if (startmv.col & 7)
-            {
-                this_mv.col -= 2;
-                diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
-            }
-            else
-            {
-                this_mv.col = (startmv.col - 8) | 6;
-                diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
-            }
-        }
-
-        break;
-    case 1:
-        this_mv.col += 2;
-
-        if (startmv.row & 7)
-        {
-            this_mv.row -= 2;
-            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-        }
-        else
-        {
-            this_mv.row = (startmv.row - 8) | 6;
-            diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
-        }
-
-        break;
-    case 2:
-        this_mv.row += 2;
-
-        if (startmv.col & 7)
-        {
-            this_mv.col -= 2;
-            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-        }
-        else
-        {
-            this_mv.col = (startmv.col - 8) | 6;
-            diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
-        }
-
-        break;
-    case 3:
-        this_mv.col += 2;
-        this_mv.row += 2;
-        diag = svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
-        break;
-    }
-
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-//  }
-
-    return bestmse;
-}
-
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
-{
-    int bestmse = INT_MAX;
-    MV startmv;
-    //MV this_mv;
-    MV this_mv;
-    unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
-    unsigned char *z = (*(b->base_src) + b->src);
-    int left, right, up, down, diag;
-    unsigned int sse;
-
-    // Trap uncodable vectors
-    if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL))
-    {
-        bestmv->row <<= 3;
-        bestmv->col <<= 3;
-        return INT_MAX;
-    }
-
-    // central mv
-    bestmv->row <<= 3;
-    bestmv->col <<= 3;
-    startmv = *bestmv;
-
-    // calculate central point error
-    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
-    bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
-
-    // go left then right and check error
-    this_mv.row = startmv.row;
-    this_mv.col = ((startmv.col - 8) | 4);
-    left = vp8_sub_pixel_variance16x16s_4_0_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (left < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = left;
-    }
-
-    this_mv.col += 8;
-    right = vp8_sub_pixel_variance16x16s_4_0_neon(y, d->pre_stride, z, b->src_stride, &sse);
-    right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (right < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = right;
-    }
-
-    // go up then down and check error
-    this_mv.col = startmv.col;
-    this_mv.row = ((startmv.row - 8) | 4);
-    up = vp8_sub_pixel_variance16x16s_0_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (up < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = up;
-    }
-
-    this_mv.row += 8;
-    down = vp8_sub_pixel_variance16x16s_0_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
-    down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (down < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = down;
-    }
-
-    // somewhat strangely not doing all the diagonals for half pel is slower than doing them.
-#if 0
-    // now check 1 more diagonal -
-    whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-    this_mv = startmv;
-
-    switch (whichdir)
-    {
-    case 0:
-        this_mv.col = (this_mv.col - 8) | 4;
-        this_mv.row = (this_mv.row - 8) | 4;
-        diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
-        break;
-    case 1:
-        this_mv.col += 4;
-        this_mv.row = (this_mv.row - 8) | 4;
-        diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
-        break;
-    case 2:
-        this_mv.col = (this_mv.col - 8) | 4;
-        this_mv.row += 4;
-        diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
-        break;
-    case 3:
-        this_mv.col += 4;
-        this_mv.row += 4;
-        diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
-        break;
-    }
-
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-#else
-    this_mv.col = (this_mv.col - 8) | 4;
-    this_mv.row = (this_mv.row - 8) | 4;
-    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-    this_mv.col += 8;
-    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-    this_mv.col = (this_mv.col - 8) | 4;
-    this_mv.row = startmv.row + 4;
-    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y - 1, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-    this_mv.col += 8;
-    diag = vp8_sub_pixel_variance16x16s_4_4_neon(y, d->pre_stride, z, b->src_stride, &sse);
-    diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-
-    if (diag < bestmse)
-    {
-        *bestmv = this_mv;
-        bestmse = diag;
-    }
-
-#endif
-    return bestmse;
-}
-
-#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
-#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector
-#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
-#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
-#define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
-const MV next_chkpts[6][3] =
-{
-    {{ -2, 0}, { -1, -2}, {1, -2}},
-    {{ -1, -2}, {1, -2}, {2, 0}},
-    {{1, -2}, {2, 0}, {1, 2}},
-    {{2, 0}, {1, 2}, { -1, 2}},
-    {{1, 2}, { -1, 2}, { -2, 0}},
-    {{ -1, 2}, { -2, 0}, { -1, -2}}
-};
-int vp8_hex_search
-(
-    MACROBLOCK *x,
-    BLOCK *b,
-    BLOCKD *d,
-    MV *ref_mv,
-    MV *best_mv,
-    int search_param,
-    int error_per_bit,
-    int *num00,
-    vp8_variance_fn_t vf,
-    vp8_sad_fn_t      sf,
-    int *mvsadcost[2],
-    int *mvcost[2]
-)
-{
-    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
-    MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
-    int i, j;
-    unsigned char *src = (*(b->base_src) + b->src);
-    int src_stride = b->src_stride;
-    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
-    unsigned int besterr, thiserr = 0x7fffffff;
-    int k = -1, tk;
-
-    if (bc < x->mv_col_min) bc = x->mv_col_min;
-
-    if (bc > x->mv_col_max) bc = x->mv_col_max;
-
-    if (br < x->mv_row_min) br = x->mv_row_min;
-
-    if (br > x->mv_row_max) br = x->mv_row_max;
-
-    rr >>= 1;
-    rc >>= 1;
-
-    besterr = ERR(br, bc, thiserr);
-
-    // hex search
-    //j=0
-    tr = br;
-    tc = bc;
-
-    for (i = 0; i < 6; i++)
-    {
-        int nr = tr + hex[i].row, nc = tc + hex[i].col;
-
-        if (nc < x->mv_col_min) continue;
-
-        if (nc > x->mv_col_max) continue;
-
-        if (nr < x->mv_row_min) continue;
-
-        if (nr > x->mv_row_max) continue;
-
-        //CHECK_BETTER(thiserr,nr,nc);
-        if ((thiserr = ERR(nr, nc, besterr)) < besterr)
-        {
-            besterr = thiserr;
-            br = nr;
-            bc = nc;
-            k = i;
-        }
-    }
-
-    if (tr == br && tc == bc)
-        goto cal_neighbors;
-
-    for (j = 1; j < 127; j++)
-    {
-        tr = br;
-        tc = bc;
-        tk = k;
-
-        for (i = 0; i < 3; i++)
-        {
-            int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col;
-
-            if (nc < x->mv_col_min) continue;
-
-            if (nc > x->mv_col_max) continue;
-
-            if (nr < x->mv_row_min) continue;
-
-            if (nr > x->mv_row_max) continue;
-
-            //CHECK_BETTER(thiserr,nr,nc);
-            if ((thiserr = ERR(nr, nc, besterr)) < besterr)
-            {
-                besterr = thiserr;
-                br = nr;
-                bc = nc; //k=(tk+5+i)%6;}
-                k = tk + 5 + i;
-
-                if (k >= 12) k -= 12;
-                else if (k >= 6) k -= 6;
-            }
-        }
-
-        if (tr == br && tc == bc)
-            break;
-    }
-
-    // check 8 1 away neighbors
-cal_neighbors:
-    tr = br;
-    tc = bc;
-
-    for (i = 0; i < 8; i++)
-    {
-        int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col;
-
-        if (nc < x->mv_col_min) continue;
-
-        if (nc > x->mv_col_max) continue;
-
-        if (nr < x->mv_row_min) continue;
-
-        if (nr > x->mv_row_max) continue;
-
-        CHECK_BETTER(thiserr, nr, nc);
-    }
-
-    best_mv->row = br;
-    best_mv->col = bc;
-
-    return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
-}
-#undef MVC
-#undef PRE
-#undef SP
-#undef DIST
-#undef ERR
-#undef CHECK_BETTER
-
-int vp8_diamond_search_sad
-(
-    MACROBLOCK *x,
-    BLOCK *b,
-    BLOCKD *d,
-    MV *ref_mv,
-    MV *best_mv,
-    int search_param,
-    int error_per_bit,
-    int *num00,
-    vp8_variance_fn_ptr_t *fn_ptr,
-    int *mvsadcost[2],
-    int *mvcost[2]
-)
-{
-    int i, j, step;
-
-    unsigned char *what = (*(b->base_src) + b->src);
-    int what_stride = b->src_stride;
-    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    unsigned char *best_address;
-
-    int tot_steps;
-    MV this_mv;
-
-    int bestsad = INT_MAX;
-    int best_site = 0;
-    int last_site = 0;
-
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
-    int this_row_offset;
-    int this_col_offset;
-    search_site *ss;
-
-    unsigned char *check_here;
-    int thissad;
-
-    // Work out the start point for the search
-    in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
-    best_address = in_what;
-
-    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
-    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
-    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
-    {
-        // Check the starting position
-        bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
-    }
-
-    // search_param determines the length of the initial step and hence the number of iterations
-    // 0 = initial step (MAX_FIRST_STEP) pel : 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel... etc.
-    ss = &x->ss[search_param * x->searches_per_step];
-    tot_steps = (x->ss_count / x->searches_per_step) - search_param;
-
-    i = 1;
-    best_mv->row = ref_row;
-    best_mv->col = ref_col;
-
-    *num00 = 0;
-
-    for (step = 0; step < tot_steps ; step++)
-    {
-        for (j = 0 ; j < x->searches_per_step ; j++)
-        {
-            // Trap illegal vectors
-            this_row_offset = best_mv->row + ss[i].mv.row;
-            this_col_offset = best_mv->col + ss[i].mv.col;
-
-            if ((this_col_offset > x->mv_col_min) && (this_col_offset < x->mv_col_max) &&
-            (this_row_offset > x->mv_row_min) && (this_row_offset < x->mv_row_max))
-
-            {
-                check_here = ss[i].offset + best_address;
-                thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
-                if (thissad < bestsad)
-                {
-                    this_mv.row = this_row_offset << 3;
-                    this_mv.col = this_col_offset << 3;
-                    thissad += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
-                    if (thissad < bestsad)
-                    {
-                        bestsad = thissad;
-                        best_site = i;
-                    }
-                }
-            }
-
-            i++;
-        }
-
-        if (best_site != last_site)
-        {
-            best_mv->row += ss[best_site].mv.row;
-            best_mv->col += ss[best_site].mv.col;
-            best_address += ss[best_site].offset;
-            last_site = best_site;
-        }
-        else if (best_address == in_what)
-            (*num00)++;
-    }
-
-    this_mv.row = best_mv->row << 3;
-    this_mv.col = best_mv->col << 3;
-
-    if (bestsad == INT_MAX)
-        return INT_MAX;
-
-    return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad))
-    + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-}
-
-#if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
-{
-    unsigned char *what = (*(b->base_src) + b->src);
-    int what_stride = b->src_stride;
-    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    int mv_stride = d->pre_stride;
-    unsigned char *bestaddress;
-    MV *best_mv = &d->bmi.mv.as_mv;
-    MV this_mv;
-    int bestsad = INT_MAX;
-    int r, c;
-
-    unsigned char *check_here;
-    int thissad;
-
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
-
-    int row_min = ref_row - distance;
-    int row_max = ref_row + distance;
-    int col_min = ref_col - distance;
-    int col_max = ref_col + distance;
-
-    // Work out the mid point for the search
-    in_what = *(d->base_pre) + d->pre;
-    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-    best_mv->row = ref_row;
-    best_mv->col = ref_col;
-
-    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
-    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
-    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
-    {
-        // Baseline value at the centre
-
-        //bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(vp8_mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
-    }
-
-    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
-    if (col_min < x->mv_col_min)
-        col_min = x->mv_col_min;
-
-    if (col_max > x->mv_col_max)
-        col_max = x->mv_col_max;
-
-    if (row_min < x->mv_row_min)
-        row_min = x->mv_row_min;
-
-    if (row_max > x->mv_row_max)
-        row_max = x->mv_row_max;
-
-    for (r = row_min; r < row_max ; r++)
-    {
-        this_mv.row = r << 3;
-        check_here = r * mv_stride + in_what + col_min;
-
-        for (c = col_min; c < col_max; c++)
-        {
-            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
-            this_mv.col = c << 3;
-            //thissad += (int)sqrt(vp8_mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
-            //thissad  += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
-            thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
-
-            if (thissad < bestsad)
-            {
-                bestsad = thissad;
-                best_mv->row = r;
-                best_mv->col = c;
-                bestaddress = check_here;
-            }
-
-            check_here++;
-        }
-    }
-
-    this_mv.row = best_mv->row << 3;
-    this_mv.col = best_mv->col << 3;
-
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-    else
-        return INT_MAX;
-}
-
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
-{
-    unsigned char *what = (*(b->base_src) + b->src);
-    int what_stride = b->src_stride;
-    unsigned char *in_what;
-    int in_what_stride = d->pre_stride;
-    int mv_stride = d->pre_stride;
-    unsigned char *bestaddress;
-    MV *best_mv = &d->bmi.mv.as_mv;
-    MV this_mv;
-    int bestsad = INT_MAX;
-    int r, c;
-
-    unsigned char *check_here;
-    unsigned int thissad;
-
-    int ref_row = ref_mv->row >> 3;
-    int ref_col = ref_mv->col >> 3;
-
-    int row_min = ref_row - distance;
-    int row_max = ref_row + distance;
-    int col_min = ref_col - distance;
-    int col_max = ref_col + distance;
-
-    unsigned int sad_array[3];
-
-    // Work out the mid point for the search
-    in_what = *(d->base_pre) + d->pre;
-    bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
-
-    best_mv->row = ref_row;
-    best_mv->col = ref_col;
-
-    // We need to check that the starting point for the search (as indicated by ref_mv) is within the buffer limits
-    if ((ref_col > x->mv_col_min) && (ref_col < x->mv_col_max) &&
-    (ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
-    {
-        // Baseline value at the centre
-        bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + vp8_mv_err_cost(ref_mv, ref_mv, mvsadcost, error_per_bit);
-    }
-
-    // Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
-    if (col_min < x->mv_col_min)
-        col_min = x->mv_col_min;
-
-    if (col_max > x->mv_col_max)
-        col_max = x->mv_col_max;
-
-    if (row_min < x->mv_row_min)
-        row_min = x->mv_row_min;
-
-    if (row_max > x->mv_row_max)
-        row_max = x->mv_row_max;
-
-    for (r = row_min; r < row_max ; r++)
-    {
-        this_mv.row = r << 3;
-        check_here = r * mv_stride + in_what + col_min;
-        c = col_min;
-
-        while ((c + 3) < col_max)
-        {
-            int i;
-
-            fn_ptr->sdx3f(what, what_stride, check_here , in_what_stride, sad_array);
-
-            for (i = 0; i < 3; i++)
-            {
-                thissad = sad_array[i];
-
-                if (thissad < bestsad)
-                {
-                    this_mv.col = c << 3;
-                    thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
-                    if (thissad < bestsad)
-                    {
-                        bestsad = thissad;
-                        best_mv->row = r;
-                        best_mv->col = c;
-                        bestaddress = check_here;
-                    }
-                }
-
-                check_here++;
-                c++;
-            }
-        }
-
-        while (c < col_max)
-        {
-            thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
-
-            if (thissad < bestsad)
-            {
-                this_mv.col = c << 3;
-                thissad  += vp8_mv_err_cost(&this_mv, ref_mv, mvsadcost, error_per_bit);
-
-                if (thissad < bestsad)
-                {
-                    bestsad = thissad;
-                    best_mv->row = r;
-                    best_mv->col = c;
-                    bestaddress = check_here;
-                }
-            }
-
-            check_here ++;
-            c ++;
-        }
-
-    }
-
-    this_mv.row = best_mv->row << 3;
-    this_mv.col = best_mv->col << 3;
-
-    if (bestsad < INT_MAX)
-        return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, (unsigned int *)(&thissad))
-        + vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
-    else
-        return INT_MAX;
-}
-#endif
-
-#ifdef ENTROPY_STATS
-void print_mode_context(void)
-{
-    FILE *f = fopen("modecont.c", "w");
-    int i, j;
-
-    fprintf(f, "#include \"entropy.h\"\n");
-    fprintf(f, "const int vp8_mode_contexts[6][4] =\n");
-    fprintf(f, "{\n");
-
-    for (j = 0; j < 6; j++)
-    {
-        fprintf(f, "  { // %d \n", j);
-        fprintf(f, "    ");
-
-        for (i = 0; i < 4; i++)
-        {
-            int overal_prob;
-            int this_prob;
-            int count; // = mv_ref_ct[j][i][0]+mv_ref_ct[j][i][1];
-
-            // Overall probs
-            count = mv_mode_cts[i][0] + mv_mode_cts[i][1];
-
-            if (count)
-                overal_prob = 256 * mv_mode_cts[i][0] / count;
-            else
-                overal_prob = 128;
-
-            if (overal_prob == 0)
-                overal_prob = 1;
-
-            // context probs
-            count = mv_ref_ct[j][i][0] + mv_ref_ct[j][i][1];
-
-            if (count)
-                this_prob = 256 * mv_ref_ct[j][i][0] / count;
-            else
-                this_prob = 128;
-
-            if (this_prob == 0)
-                this_prob = 1;
-
-            fprintf(f, "%5d, ", this_prob);
-            //fprintf(f,"%5d, %5d, %8d,", this_prob, overal_prob, (this_prob << 10)/overal_prob);
-            //fprintf(f,"%8d, ", (this_prob << 10)/overal_prob);
-        }
-
-        fprintf(f, "  },\n");
-    }
-
-    fprintf(f, "};\n");
-    fclose(f);
-}
-
-/* MV ref count ENTROPY_STATS stats code */
-#ifdef ENTROPY_STATS
-void init_mv_ref_counts()
-{
-    vpx_memset(mv_ref_ct, 0, sizeof(mv_ref_ct));
-    vpx_memset(mv_mode_cts, 0, sizeof(mv_mode_cts));
-}
-
-void accum_mv_refs(MB_PREDICTION_MODE m, const int ct[4])
-{
-    if (m == ZEROMV)
-    {
-        ++mv_ref_ct [ct[0]] [0] [0];
-        ++mv_mode_cts[0][0];
-    }
-    else
-    {
-        ++mv_ref_ct [ct[0]] [0] [1];
-        ++mv_mode_cts[0][1];
-
-        if (m == NEARESTMV)
-        {
-            ++mv_ref_ct [ct[1]] [1] [0];
-            ++mv_mode_cts[1][0];
-        }
-        else
-        {
-            ++mv_ref_ct [ct[1]] [1] [1];
-            ++mv_mode_cts[1][1];
-
-            if (m == NEARMV)
-            {
-                ++mv_ref_ct [ct[2]] [2] [0];
-                ++mv_mode_cts[2][0];
-            }
-            else
-            {
-                ++mv_ref_ct [ct[2]] [2] [1];
-                ++mv_mode_cts[2][1];
-
-                if (m == NEWMV)
-                {
-                    ++mv_ref_ct [ct[3]] [3] [0];
-                    ++mv_mode_cts[3][0];
-                }
-                else
-                {
-                    ++mv_ref_ct [ct[3]] [3] [1];
-                    ++mv_mode_cts[3][1];
-                }
-            }
-        }
-    }
-}
-
-#endif/* END MV ref count ENTROPY_STATS stats code */
-
-#endif
diff --git a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
index 1c1441c..0a2b71c 100644
--- a/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
+++ b/vp8/encoder/arm/neon/vp8_subpixelvariance16x16s_neon.asm
@@ -9,9 +9,9 @@
 ;
 
 
-    EXPORT  |vp8_sub_pixel_variance16x16s_4_0_neon|
-    EXPORT  |vp8_sub_pixel_variance16x16s_0_4_neon|
-    EXPORT  |vp8_sub_pixel_variance16x16s_4_4_neon|
+    EXPORT  |vp8_variance_halfpixvar16x16_h_neon|
+    EXPORT  |vp8_variance_halfpixvar16x16_v_neon|
+    EXPORT  |vp8_variance_halfpixvar16x16_hv_neon|
     EXPORT  |vp8_sub_pixel_variance16x16s_neon|
     ARM
     REQUIRE8
@@ -20,7 +20,7 @@
     AREA ||.text||, CODE, READONLY, ALIGN=2
 
 ;================================================
-;unsigned int vp8_sub_pixel_variance16x16s_4_0_neon
+;unsigned int vp8_variance_halfpixvar16x16_h_neon
 ;(
 ;    unsigned char  *src_ptr, r0
 ;    int  src_pixels_per_line,  r1
@@ -29,7 +29,7 @@
 ;    unsigned int *sse
 ;);
 ;================================================
-|vp8_sub_pixel_variance16x16s_4_0_neon| PROC
+|vp8_variance_halfpixvar16x16_h_neon| PROC
     push            {lr}
 
     mov             r12, #4                  ;loop counter
@@ -120,7 +120,7 @@
     ENDP
 
 ;================================================
-;unsigned int vp8_sub_pixel_variance16x16s_0_4_neon
+;unsigned int vp8_variance_halfpixvar16x16_v_neon
 ;(
 ;    unsigned char  *src_ptr, r0
 ;    int  src_pixels_per_line,  r1
@@ -129,7 +129,7 @@
 ;    unsigned int *sse
 ;);
 ;================================================
-|vp8_sub_pixel_variance16x16s_0_4_neon| PROC
+|vp8_variance_halfpixvar16x16_v_neon| PROC
     push            {lr}
 
     mov             r12, #4                     ;loop counter
@@ -216,7 +216,7 @@
     ENDP
 
 ;================================================
-;unsigned int vp8_sub_pixel_variance16x16s_4_4_neon
+;unsigned int vp8_variance_halfpixvar16x16_hv_neon
 ;(
 ;    unsigned char  *src_ptr, r0
 ;    int  src_pixels_per_line,  r1
@@ -225,7 +225,7 @@
 ;    unsigned int *sse
 ;);
 ;================================================
-|vp8_sub_pixel_variance16x16s_4_4_neon| PROC
+|vp8_variance_halfpixvar16x16_hv_neon| PROC
     push            {lr}
 
     vld1.u8         {d0, d1, d2, d3}, [r0], r1      ;load src data
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index fb9dd5a..0e5f62f 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -30,6 +30,9 @@
 //extern prototype_subpixvariance(vp8_sub_pixel_variance8x16_c);
 //extern prototype_subpixvariance(vp8_sub_pixel_variance16x8_c);
 extern prototype_subpixvariance(vp8_sub_pixel_variance16x16_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_h_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_v_neon);
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
 
 //extern prototype_getmbss(vp8_get_mb_ss_c);
 extern prototype_variance(vp8_mse16x16_neon);
@@ -84,6 +87,15 @@
 #undef  vp8_variance_subpixvar16x16
 #define vp8_variance_subpixvar16x16 vp8_sub_pixel_variance16x16_neon
 
+#undef  vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_variance_halfpixvar16x16_h_neon
+
+#undef  vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_variance_halfpixvar16x16_v_neon
+
+#undef  vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_variance_halfpixvar16x16_hv_neon
+
 //#undef  vp8_variance_getmbss
 //#define vp8_variance_getmbss vp8_get_mb_ss_c
 
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index a790456..1c72b90 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -113,7 +113,8 @@
 
     vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
-    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, recon_mby)
+        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 
     // make sure block modes are set the way we want them for context updates
     for (b = 0; b < 16; b++)
@@ -157,7 +158,8 @@
 
     vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
-    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, recon_mby)
+        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 
     // make sure block modes are set the way we want them for context updates
     for (b = 0; b < 16; b++)
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 99ee2b8..043eac2 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -639,7 +639,8 @@
 
     vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
-    vp8_recon16x16mb(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, recon_mb)
+        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }
 
 
@@ -656,7 +657,8 @@
 
     vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd);
 
-    vp8_recon16x16mby(IF_RTCD(&rtcd->common->recon), &x->e_mbd);
+    RECON_INVOKE(&rtcd->common->recon, recon_mby)
+        (IF_RTCD(&rtcd->common->recon), &x->e_mbd);
 }
 
 
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 607c3d2..691aee0 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -462,12 +462,11 @@
     int step_param = 3;                                       //3;          // Dont search over full range for first pass
     int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; //3;
     int n;
-    vp8_variance_fn_ptr_t v_fn_ptr;
+    vp8_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
     int new_mv_mode_penalty = 256;
 
+    // override the default variance function to use MSE
     v_fn_ptr.vf    = VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16);
-    v_fn_ptr.sdf   = cpi->fn_ptr.sdf;
-    v_fn_ptr.sdx4df = cpi->fn_ptr.sdx4df;
 
     // Set up pointers for this macro block recon buffer
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 4d60b92..8cc63f8 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -186,7 +186,7 @@
 #define MVC(r,c) (((mvcost[0][(r)-rr] + mvcost[1][(c) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
 #define PRE(r,c) (*(d->base_pre) + d->pre + ((r)>>2) * d->pre_stride + ((c)>>2)) // pointer to predictor base of a motionvector
 #define SP(x) (((x)&3)<<1) // convert motion vector component to offset for svf calc
-#define DIST(r,c) svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
+#define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function.
 #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e;
 #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost
 #define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best
@@ -195,7 +195,7 @@
 
 //#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }
 
-int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
 {
     unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col;
     unsigned char *z = (*(b->base_src) + b->src);
@@ -220,7 +220,7 @@
     bestmv->col <<= 3;
 
     // calculate central point error
-    besterr = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
     besterr += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected)
@@ -309,7 +309,7 @@
 #undef CHECK_BETTER
 #undef MIN
 #undef MAX
-int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
 {
     int bestmse = INT_MAX;
     MV startmv;
@@ -336,13 +336,13 @@
     startmv = *bestmv;
 
     // calculate central point error
-    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
     bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
-    left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
     left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
@@ -352,7 +352,7 @@
     }
 
     this_mv.col += 8;
-    right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
     right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
@@ -364,7 +364,7 @@
     // go up then down and check error
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
-    up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
     up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
@@ -374,7 +374,7 @@
     }
 
     this_mv.row += 8;
-    down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
     down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
@@ -386,10 +386,6 @@
 
     // now check 1 more diagonal
     whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
-    // whichdir must be 0-4. Therefore, one of the cases below
-    // must run through. However, because there is no default
-    // and diag is not set elsewhere, we get a compile warning
-    diag = 0;
     //for(whichdir =0;whichdir<4;whichdir++)
     //{
     this_mv = startmv;
@@ -399,22 +395,22 @@
     case 0:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 1:
         this_mv.col += 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 2:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row += 4;
-        diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 3:
         this_mv.col += 4;
         this_mv.row += 4;
-        diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
         break;
     }
 
@@ -446,12 +442,12 @@
     if (startmv.col & 7)
     {
         this_mv.col = startmv.col - 2;
-        left = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        left = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     }
     else
     {
         this_mv.col = (startmv.col - 8) | 6;
-        left = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
+        left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);
     }
 
     left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
@@ -463,7 +459,7 @@
     }
 
     this_mv.col += 4;
-    right = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
@@ -478,12 +474,12 @@
     if (startmv.row & 7)
     {
         this_mv.row = startmv.row - 2;
-        up = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        up = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     }
     else
     {
         this_mv.row = (startmv.row - 8) | 6;
-        up = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+        up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
     }
 
     up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
@@ -495,7 +491,7 @@
     }
 
     this_mv.row += 4;
-    down = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+    down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
     down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
@@ -523,12 +519,12 @@
             if (startmv.col & 7)
             {
                 this_mv.col -= 2;
-                diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+                diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
             }
             else
             {
                 this_mv.col = (startmv.col - 8) | 6;
-                diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+                diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
             }
         }
         else
@@ -538,12 +534,12 @@
             if (startmv.col & 7)
             {
                 this_mv.col -= 2;
-                diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+                diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
             }
             else
             {
                 this_mv.col = (startmv.col - 8) | 6;
-                diag = svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
+                diag = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse);
             }
         }
 
@@ -554,12 +550,12 @@
         if (startmv.row & 7)
         {
             this_mv.row -= 2;
-            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         }
         else
         {
             this_mv.row = (startmv.row - 8) | 6;
-            diag = svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
+            diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse);
         }
 
         break;
@@ -569,19 +565,19 @@
         if (startmv.col & 7)
         {
             this_mv.col -= 2;
-            diag = svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+            diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         }
         else
         {
             this_mv.col = (startmv.col - 8) | 6;
-            diag = svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
+            diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);;
         }
 
         break;
     case 3:
         this_mv.col += 2;
         this_mv.row += 2;
-        diag = svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
+        diag = vfp->svf(y, d->pre_stride,  this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse);
         break;
     }
 
@@ -598,7 +594,7 @@
     return bestmse;
 }
 
-int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
 {
     int bestmse = INT_MAX;
     MV startmv;
@@ -623,13 +619,13 @@
     startmv = *bestmv;
 
     // calculate central point error
-    bestmse = vf(y, d->pre_stride, z, b->src_stride, &sse);
+    bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse);
     bestmse += vp8_mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit);
 
     // go left then right and check error
     this_mv.row = startmv.row;
     this_mv.col = ((startmv.col - 8) | 4);
-    left = svf(y - 1, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse);
     left += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (left < bestmse)
@@ -639,7 +635,7 @@
     }
 
     this_mv.col += 8;
-    right = svf(y, d->pre_stride, 4, 0, z, b->src_stride, &sse);
+    right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse);
     right += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (right < bestmse)
@@ -651,7 +647,7 @@
     // go up then down and check error
     this_mv.col = startmv.col;
     this_mv.row = ((startmv.row - 8) | 4);
-    up = svf(y - d->pre_stride, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
     up += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (up < bestmse)
@@ -661,7 +657,7 @@
     }
 
     this_mv.row += 8;
-    down = svf(y, d->pre_stride, 0, 4, z, b->src_stride, &sse);
+    down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse);
     down += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (down < bestmse)
@@ -681,22 +677,22 @@
     case 0:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
         break;
     case 1:
         this_mv.col += 4;
         this_mv.row = (this_mv.row - 8) | 4;
-        diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
         break;
     case 2:
         this_mv.col = (this_mv.col - 8) | 4;
         this_mv.row += 4;
-        diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
         break;
     case 3:
         this_mv.col += 4;
         this_mv.row += 4;
-        diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+        diag = vfp->svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
         break;
     }
 
@@ -711,7 +707,7 @@
 #else
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = (this_mv.row - 8) | 4;
-    diag = svf(y - 1 - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
     diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
@@ -721,7 +717,7 @@
     }
 
     this_mv.col += 8;
-    diag = svf(y - d->pre_stride, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse);
     diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
@@ -732,7 +728,7 @@
 
     this_mv.col = (this_mv.col - 8) | 4;
     this_mv.row = startmv.row + 4;
-    diag = svf(y - 1, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
     diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
@@ -742,7 +738,7 @@
     }
 
     this_mv.col += 8;
-    diag = svf(y, d->pre_stride, 4, 4, z, b->src_stride, &sse);
+    diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
     diag += vp8_mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit);
 
     if (diag < bestmse)
@@ -758,10 +754,18 @@
 
 #define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
 #define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector
-#define DIST(r,c,v) sf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
+#define DIST(r,c,v) vfp->sdf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
 #define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
 #define CHECK_BETTER(v,r,c) if ((v = ERR(r,c,besterr)) < besterr) { besterr = v; br=r; bc=c; } // checks if (r,c) has better score than previous best
-
+static const MV next_chkpts[6][3] =
+{
+    {{ -2, 0}, { -1, -2}, {1, -2}},
+    {{ -1, -2}, {1, -2}, {2, 0}},
+    {{1, -2}, {2, 0}, {1, 2}},
+    {{2, 0}, {1, 2}, { -1, 2}},
+    {{1, 2}, { -1, 2}, { -2, 0}},
+    {{ -1, 2}, { -2, 0}, { -1, -2}}
+};
 int vp8_hex_search
 (
     MACROBLOCK *x,
@@ -772,44 +776,72 @@
     int search_param,
     int error_per_bit,
     int *num00,
-    vp8_variance_fn_t vf,
-    vp8_sad_fn_t      sf,
+    const vp8_variance_fn_ptr_t *vfp,
     int *mvsadcost[2],
     int *mvcost[2]
 )
 {
-    MV hex[6] = { { -2, 0}, { -1, -2}, { -1, 2}, {2, 0}, {1, 2}, {1, -2} } ;
+    MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ;
     MV neighbors[8] = { { -1, -1}, { -1, 0}, { -1, 1}, {0, -1}, {0, 1}, {1, -1}, {1, 0}, {1, 1} } ;
     int i, j;
     unsigned char *src = (*(b->base_src) + b->src);
     int src_stride = b->src_stride;
-    int rr = ref_mv->row, rc = ref_mv->col, br = rr, bc = rc, tr, tc;
+    int rr = ref_mv->row, rc = ref_mv->col, br = rr >> 3, bc = rc >> 3, tr, tc;
     unsigned int besterr, thiserr = 0x7fffffff;
+    int k = -1, tk;
 
-    if (rc < x->mv_col_min) bc = x->mv_col_min;
+    if (bc < x->mv_col_min) bc = x->mv_col_min;
 
-    if (rc > x->mv_col_max) bc = x->mv_col_max;
+    if (bc > x->mv_col_max) bc = x->mv_col_max;
 
-    if (rr < x->mv_row_min) br = x->mv_row_min;
+    if (br < x->mv_row_min) br = x->mv_row_min;
 
-    if (rr > x->mv_row_max) br = x->mv_row_max;
+    if (br > x->mv_row_max) br = x->mv_row_max;
 
     rr >>= 1;
     rc >>= 1;
-    br >>= 3;
-    bc >>= 3;
 
     besterr = ERR(br, bc, thiserr);
 
-    // hex search  jbb changed to 127 to avoid max 256 problem steping by 2.
-    for (j = 0; j < 127; j++)
+    // hex search
+    //j=0
+    tr = br;
+    tc = bc;
+
+    for (i = 0; i < 6; i++)
+    {
+        int nr = tr + hex[i].row, nc = tc + hex[i].col;
+
+        if (nc < x->mv_col_min) continue;
+
+        if (nc > x->mv_col_max) continue;
+
+        if (nr < x->mv_row_min) continue;
+
+        if (nr > x->mv_row_max) continue;
+
+        //CHECK_BETTER(thiserr,nr,nc);
+        if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+        {
+            besterr = thiserr;
+            br = nr;
+            bc = nc;
+            k = i;
+        }
+    }
+
+    if (tr == br && tc == bc)
+        goto cal_neighbors;
+
+    for (j = 1; j < 127; j++)
     {
         tr = br;
         tc = bc;
+        tk = k;
 
-        for (i = 0; i < 6; i++)
+        for (i = 0; i < 3; i++)
         {
-            int nr = tr + hex[i].row, nc = tc + hex[i].col;
+            int nr = tr + next_chkpts[tk][i].row, nc = tc + next_chkpts[tk][i].col;
 
             if (nc < x->mv_col_min) continue;
 
@@ -819,7 +851,17 @@
 
             if (nr > x->mv_row_max) continue;
 
-            CHECK_BETTER(thiserr, nr, nc);
+            //CHECK_BETTER(thiserr,nr,nc);
+            if ((thiserr = ERR(nr, nc, besterr)) < besterr)
+            {
+                besterr = thiserr;
+                br = nr;
+                bc = nc; //k=(tk+5+i)%6;}
+                k = tk + 5 + i;
+
+                if (k >= 12) k -= 12;
+                else if (k >= 6) k -= 6;
+            }
         }
 
         if (tr == br && tc == bc)
@@ -827,6 +869,7 @@
     }
 
     // check 8 1 away neighbors
+cal_neighbors:
     tr = br;
     tc = bc;
 
@@ -848,7 +891,7 @@
     best_mv->row = br;
     best_mv->col = bc;
 
-    return vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
+    return vfp->vf(src, src_stride, PRE(br, bc), d->pre_stride, &thiserr) + MVC(br, bc) ;
 }
 #undef MVC
 #undef PRE
@@ -856,6 +899,8 @@
 #undef DIST
 #undef ERR
 #undef CHECK_BETTER
+
+
 int vp8_diamond_search_sad
 (
     MACROBLOCK *x,
@@ -1127,6 +1172,7 @@
 }
 
 
+#if !(CONFIG_REALTIME_ONLY)
 int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2])
 {
     unsigned char *what = (*(b->base_src) + b->src);
@@ -1339,6 +1385,8 @@
     else
         return INT_MAX;
 }
+#endif
+
 
 #ifdef ENTROPY_STATS
 void print_mode_context(void)
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 7cc9242..181e958 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -42,14 +42,15 @@
     int search_param,
     int error_per_bit,
     int *num00,
-    vp8_variance_fn_t vf,
-    vp8_sad_fn_t sf,
+    const vp8_variance_fn_ptr_t *vf,
     int *mvsadcost[2],
     int *mvcost[2]
 
 );
 
-typedef int (fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2]);
+typedef int (fractional_mv_step_fp)
+    (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv,
+     int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]);
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively;
 extern fractional_mv_step_fp vp8_find_best_sub_pixel_step;
 extern fractional_mv_step_fp vp8_find_best_half_pixel_step;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index bd41b2c..7a78b29 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2334,11 +2334,50 @@
 
     vp8cx_create_encoder_threads(cpi);
 
-    cpi->fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
-    cpi->fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
-    cpi->fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
-    cpi->fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
-    cpi->fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
+    cpi->fn_ptr[BLOCK_16X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16);
+    cpi->fn_ptr[BLOCK_16X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16);
+    cpi->fn_ptr[BLOCK_16X16].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x16);
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_h);
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_v);
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = VARIANCE_INVOKE(&cpi->rtcd.variance, halfpixvar16x16_hv);
+    cpi->fn_ptr[BLOCK_16X16].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x3);
+    cpi->fn_ptr[BLOCK_16X16].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x16x4d);
+
+    cpi->fn_ptr[BLOCK_16X8].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
+    cpi->fn_ptr[BLOCK_16X8].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8);
+    cpi->fn_ptr[BLOCK_16X8].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8);
+    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h  = NULL; // NOTE(review): half-pixel helpers exist only for 16x16, but rdopt.c
+    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v  = NULL; // hands &cpi->fn_ptr[segmentation] to cpi->find_fractional_mv_step;
+    cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL; // confirm no step function calls svf_halfpix_* for sub-16x16 blocks.
+    cpi->fn_ptr[BLOCK_16X8].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
+    cpi->fn_ptr[BLOCK_16X8].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
+
+    cpi->fn_ptr[BLOCK_8X16].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
+    cpi->fn_ptr[BLOCK_8X16].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16);
+    cpi->fn_ptr[BLOCK_8X16].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16);
+    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_8X16].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
+    cpi->fn_ptr[BLOCK_8X16].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
+
+    cpi->fn_ptr[BLOCK_8X8].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
+    cpi->fn_ptr[BLOCK_8X8].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8);
+    cpi->fn_ptr[BLOCK_8X8].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8);
+    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_8X8].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
+    cpi->fn_ptr[BLOCK_8X8].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
+
+    cpi->fn_ptr[BLOCK_4X4].sdf            = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
+    cpi->fn_ptr[BLOCK_4X4].vf             = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4);
+    cpi->fn_ptr[BLOCK_4X4].svf            = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4);
+    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h  = NULL;
+    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;
+    cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
+    cpi->fn_ptr[BLOCK_4X4].sdx3f          = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
+    cpi->fn_ptr[BLOCK_4X4].sdx4df         = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
 
 #if !(CONFIG_REALTIME_ONLY)
     cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search);
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index eb516e9..81e32f0 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -229,6 +229,16 @@
     vp8_search_rtcd_vtable_t    search;
 } VP8_ENCODER_RTCD;
 
+enum // block-size index into the encoder's fn_ptr[] table
+{
+    BLOCK_16X8,   // 0: values 0-3 double as the split-MV `segmentation` index (rdopt.c reads cpi->fn_ptr[segmentation])
+    BLOCK_8X16,   // 1
+    BLOCK_8X8,    // 2
+    BLOCK_4X4,    // 3: keep the first four entries in this order
+    BLOCK_16X16,  // 4: entry used by the 16x16 motion searches
+    BLOCK_MAX_SEGMENTS // number of entries; sizes fn_ptr[]
+};
+
 typedef struct
 {
 
@@ -591,7 +601,7 @@
     fractional_mv_step_fp *find_fractional_mv_step;
     vp8_full_search_fn_t full_search_sad;
     vp8_diamond_search_fn_t diamond_search_sad;
-    vp8_variance_fn_ptr_t fn_ptr;
+    vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
     unsigned int time_receive_data;
     unsigned int time_compress_data;
     unsigned int time_pick_lpf;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 8821b3a..2f7dd9c 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -50,14 +50,13 @@
 extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv);
 
 
-int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, int *mvcost[2])
+int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2])
 {
     (void) b;
     (void) d;
     (void) ref_mv;
     (void) error_per_bit;
-    (void) svf;
-    (void) vf;
+    (void) vfp;
     (void) mvcost;
     bestmv->row <<= 3;
     bestmv->col <<= 3;
@@ -65,7 +64,7 @@
 }
 
 
-static int get_inter_mbpred_error(MACROBLOCK *mb, vp8_subpixvariance_fn_t svf, vp8_variance_fn_t vf, unsigned int *sse)
+static int get_inter_mbpred_error(MACROBLOCK *mb, const vp8_variance_fn_ptr_t *vfp, unsigned int *sse)
 {
 
     BLOCK *b = &mb->block[0];
@@ -81,11 +80,11 @@
 
     if (xoffset | yoffset)
     {
-        return svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
+        return vfp->svf(in_what, in_what_stride, xoffset, yoffset, what, what_stride, sse);
     }
     else
     {
-        return vf(what, what_stride, in_what, in_what_stride, sse);
+        return vfp->vf(what, what_stride, in_what, in_what_stride, sse);
     }
 
 }
@@ -719,13 +718,13 @@
 
             if (cpi->sf.search_method == HEX)
             {
-                bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+                bestsme = vp8_hex_search(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
             }
             else
             {
-                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+                bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
 
@@ -744,7 +743,7 @@
                         num00--;
                     else
                     {
-                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+                        thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv1, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
 
                         if (thissme < bestsme)
                         {
@@ -765,7 +764,7 @@
         }
 
         if (bestsme < INT_MAX)
-            cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, cpi->fn_ptr.svf, cpi->fn_ptr.vf, cpi->mb.mvcost);
+            cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost);
 
         mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
         mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -795,7 +794,7 @@
             x->e_mbd.block[0].bmi.mode = this_mode;
             x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int;
 
-            distortion2 = get_inter_mbpred_error(x, cpi->fn_ptr.svf, cpi->fn_ptr.vf, (unsigned int *)(&sse));
+            distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse));
 
             this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2);
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index dbef85b..8f406b9 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1130,6 +1130,8 @@
     int bsd = 0;
     int bestsegmentyrate = 0;
 
+    static const int segmentation_to_sseshift[4] = {3, 3, 2, 0};
+
     // FIX TO Rd error outrange bug PGW 9 june 2004
     B_PREDICTION_MODE bmodes[16] = {ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
                                     ZERO4X4, ZERO4X4, ZERO4X4, ZERO4X4,
@@ -1151,10 +1153,10 @@
         int rate = 0;
         int sbr = 0;
         int sbd = 0;
-        int UNINITIALIZED_IS_SAFE(sseshift);
+        int sseshift;
         int segmentyrate = 0;
 
-        vp8_variance_fn_ptr_t v_fn_ptr;
+        vp8_variance_fn_ptr_t *v_fn_ptr;
 
         ENTROPY_CONTEXT_PLANES t_above, t_left;
         ENTROPY_CONTEXT *ta;
@@ -1174,42 +1176,8 @@
         br = 0;
         bd = 0;
 
-        switch (segmentation)
-        {
-        case 0:
-            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x8);
-            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar16x8);
-            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8);
-            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x3);
-            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad16x8x4d);
-            sseshift = 3;
-            break;
-        case 1:
-            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x16);
-            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x16);
-            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16);
-            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x3);
-            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x16x4d);
-            sseshift = 3;
-            break;
-        case 2:
-            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var8x8);
-            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar8x8);
-            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8);
-            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x3);
-            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad8x8x4d);
-            sseshift = 2;
-            break;
-        case 3:
-            v_fn_ptr.vf    = VARIANCE_INVOKE(&cpi->rtcd.variance, var4x4);
-            v_fn_ptr.svf   = VARIANCE_INVOKE(&cpi->rtcd.variance, subpixvar4x4);
-            v_fn_ptr.sdf   = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4);
-            v_fn_ptr.sdx3f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x3);
-            v_fn_ptr.sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d);
-            sseshift = 0;
-            break;
-        }
-
+        v_fn_ptr = &cpi->fn_ptr[segmentation];
+        sseshift = segmentation_to_sseshift[segmentation];
         labels = vp8_mbsplits[segmentation];
         label_count = vp8_count_labels(labels);
 
@@ -1281,10 +1249,10 @@
                         int sadpb = x->sadperbit4;
 
                         if (cpi->sf.search_method == HEX)
-                            bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr.vf, v_fn_ptr.sdf, x->mvsadcost, mvcost);
+                            bestsme = vp8_hex_search(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
                         else
                         {
-                            bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+                            bestsme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &mode_mv[NEW4X4], step_param, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
 
                             n = num00;
                             num00 = 0;
@@ -1297,7 +1265,7 @@
                                     num00--;
                                 else
                                 {
-                                    thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, &v_fn_ptr, x->mvsadcost, mvcost);
+                                    thissme = cpi->diamond_search_sad(x, c, e, best_ref_mv, &temp_mv, step_param + n, sadpb / 2/*x->errorperbit*/, &num00, v_fn_ptr, x->mvsadcost, mvcost);
 
                                     if (thissme < bestsme)
                                     {
@@ -1312,7 +1280,7 @@
                         // Should we do a full search (best quality only)
                         if ((compressor_speed == 0) && (bestsme >> sseshift) > 4000)
                         {
-                            thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, &v_fn_ptr, x->mvcost, x->mvsadcost);
+                            thissme = cpi->full_search_sad(x, c, e, best_ref_mv, sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost);
 
                             if (thissme < bestsme)
                             {
@@ -1330,9 +1298,9 @@
                     if (bestsme < INT_MAX)
                     {
                         if (!fullpixel)
-                            cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+                            cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit / 2, v_fn_ptr, mvcost);
                         else
-                            vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr.svf, v_fn_ptr.vf, mvcost);
+                            vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], best_ref_mv, x->errorperbit, v_fn_ptr, mvcost);
                     }
                 }
 
@@ -1852,13 +1820,13 @@
 
                     if (cpi->sf.search_method == HEX)
                     {
-                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf, x->mvsadcost, x->mvcost);
+                        bestsme = vp8_hex_search(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost);
                         mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                         mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
                     }
                     else
                     {
-                        bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb < 9
+                        bestsme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb < 9
                         mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                         mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
 
@@ -1877,7 +1845,7 @@
                                 num00--;
                             else
                             {
-                                thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr, x->mvsadcost, x->mvcost); //sadpb = 9
+                                thissme = cpi->diamond_search_sad(x, b, d, &best_ref_mv, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost); //sadpb = 9
 
                                 if (thissme < bestsme)
                                 {
@@ -1914,7 +1882,7 @@
                     search_range = (search_range > cpi->sf.max_fs_radius) ? cpi->sf.max_fs_radius : search_range;
                     {
                         int sadpb = x->sadperbit16 >> 2;
-                        thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr, x->mvcost, x->mvsadcost);
+                        thissme = cpi->full_search_sad(x, b, d, &best_ref_mv, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost);
                     }
 
                     // Barrier threshold to initiating full search
@@ -1939,7 +1907,7 @@
 
                 if (bestsme < INT_MAX)
                     // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost);  // normal mvc=11
-                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, cpi->fn_ptr.svf, cpi->fn_ptr.vf, x->mvcost);
+                    cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost);
 
                 mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
                 mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 630afdb..fd5dd7e 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -234,7 +234,7 @@
             &best_ref_mv1, &d->bmi.mv.as_mv,
             step_param,
             sadpb/*x->errorperbit*/,
-            &num00, cpi->fn_ptr.vf, cpi->fn_ptr.sdf,
+            &num00, &cpi->fn_ptr[BLOCK_16X16],
             mvsadcost, mvcost);
     }
     else
@@ -245,7 +245,7 @@
             &best_ref_mv1, &d->bmi.mv.as_mv,
             step_param,
             sadpb / 2/*x->errorperbit*/,
-            &num00, &cpi->fn_ptr,
+            &num00, &cpi->fn_ptr[BLOCK_16X16],
             mvsadcost, mvcost); //sadpb < 9
 
         // Further step/diamond searches as necessary
@@ -267,7 +267,7 @@
                     &best_ref_mv1, &d->bmi.mv.as_mv,
                     step_param + n,
                     sadpb / 4/*x->errorperbit*/,
-                    &num00, &cpi->fn_ptr,
+                    &num00, &cpi->fn_ptr[BLOCK_16X16],
                     mvsadcost, mvcost); //sadpb = 9
 
                 if (thissme < bestsme)
@@ -291,8 +291,8 @@
     {
         bestsme = cpi->find_fractional_mv_step(x, b, d,
                     &d->bmi.mv.as_mv, &best_ref_mv1,
-                    x->errorperbit, cpi->fn_ptr.svf,
-                    cpi->fn_ptr.vf, cpi->mb.mvcost);
+                    x->errorperbit, &cpi->fn_ptr[BLOCK_16X16],
+                    cpi->mb.mvcost);
     }
 #endif
 
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 3c9ae98..f60038f 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -219,6 +219,21 @@
 #endif
 extern prototype_subpixvariance(vp8_variance_subpixvar16x16);
 
+#ifndef vp8_variance_halfpixvar16x16_h
+#define vp8_variance_halfpixvar16x16_h vp8_half_pixel_variance16x16_h_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar16x16_h);
+
+#ifndef vp8_variance_halfpixvar16x16_v
+#define vp8_variance_halfpixvar16x16_v vp8_half_pixel_variance16x16_v_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar16x16_v);
+
+#ifndef vp8_variance_halfpixvar16x16_hv
+#define vp8_variance_halfpixvar16x16_hv vp8_half_pixel_variance16x16_c
+#endif
+extern prototype_variance(vp8_variance_halfpixvar16x16_hv);
+
 #ifndef vp8_variance_subpixmse16x16
 #define vp8_variance_subpixmse16x16 vp8_sub_pixel_mse16x16_c
 #endif
@@ -283,6 +298,9 @@
     vp8_subpixvariance_fn_t  subpixvar8x16;
     vp8_subpixvariance_fn_t  subpixvar16x8;
     vp8_subpixvariance_fn_t  subpixvar16x16;
+    vp8_variance_fn_t        halfpixvar16x16_h;
+    vp8_variance_fn_t        halfpixvar16x16_v;
+    vp8_variance_fn_t        halfpixvar16x16_hv;
     vp8_subpixvariance_fn_t  subpixmse16x16;
 
     vp8_getmbss_fn_t         getmbss;
@@ -309,11 +327,14 @@
 
 typedef struct
 {
-    vp8_sad_fn_t  sdf;
-    vp8_sad_multi_fn_t sdx3f;
-    vp8_sad_multi_d_fn_t sdx4df;
-    vp8_variance_fn_t vf;
+    vp8_sad_fn_t            sdf;
+    vp8_variance_fn_t       vf;
     vp8_subpixvariance_fn_t svf;
+    vp8_variance_fn_t       svf_halfpix_h;
+    vp8_variance_fn_t       svf_halfpix_v;
+    vp8_variance_fn_t       svf_halfpix_hv;
+    vp8_sad_multi_fn_t      sdx3f;
+    vp8_sad_multi_d_fn_t    sdx4df;
 } vp8_variance_fn_ptr_t;
 
 #if CONFIG_RUNTIME_CPU_DETECT
@@ -322,7 +343,4 @@
 #define VARIANCE_INVOKE(ctx,fn) vp8_variance_##fn
 #endif
 
-/* TODO: Determine if this USEBILINEAR flag is necessary. */
-#define USEBILINEAR
-
 #endif
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
index 1774143..48d5bb5 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -24,7 +24,6 @@
 };
 
 
-#ifdef USEBILINEAR
 const int VP8_FILTER_WEIGHT = 128;
 const int VP8_FILTER_SHIFT  =   7;
 const int vp8_bilinear_taps[8][2] =
@@ -461,6 +460,53 @@
     return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
 }
 
+
+/* Half-pixel 16x16 variance wrappers around vp8_sub_pixel_variance16x16_c.
+ * Each wrapper pins the (xoffset, yoffset) pair so the three half-pixel
+ * defaults in variance.h (_h, _v, _hv) resolve to distinct C functions
+ * instead of all sharing the diagonal (4, 4) case.  An offset of 4 is the
+ * half-pixel position; 0 means no fractional offset on that axis.
+ */
+
+/* Horizontal half-pixel: xoffset = 4, yoffset = 0. */
+unsigned int vp8_half_pixel_variance16x16_h_c(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
+/* Vertical half-pixel: xoffset = 0, yoffset = 4. */
+unsigned int vp8_half_pixel_variance16x16_v_c(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
+/* Diagonal half-pixel: xoffset = 4, yoffset = 4 (original symbol name kept). */
+unsigned int vp8_half_pixel_variance16x16_c(
+    const unsigned char *src_ptr,
+    int  source_stride,
+    const unsigned char *ref_ptr,
+    int  recon_stride,
+    unsigned int *sse)
+{
+    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
+                                         ref_ptr, recon_stride, sse);
+}
+
+
 unsigned int vp8_sub_pixel_mse16x16_c
 (
     const unsigned char  *src_ptr,
@@ -525,4 +571,3 @@
 
     return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
 }
-#endif
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 9ab92b3..bb3f825 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -118,8 +118,6 @@
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/bilinearfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/loopfilter_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/recon_arm.c
-VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra4x4_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/reconintra_arm.c
 VP8_COMMON_SRCS-$(HAVE_ARMV6)  += common/arm/vpx_asm_offsets.c
 
@@ -163,6 +161,7 @@
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon16x16mb_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/buildintrapredictorsmby_neon$(ASM)
 VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/save_neon_reg$(ASM)
+VP8_COMMON_SRCS-$(HAVE_ARMV7)  += common/arm/neon/recon_neon.c
 
 
 #
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index d126faf..da27e08 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -19,7 +19,6 @@
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/quantize_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/picklpf_arm.c
 VP8_CX_SRCS-$(HAVE_ARMV5TE) += encoder/arm/boolhuff_arm.c
-VP8_CX_SRCS-$(HAVE_ARMV7)  += encoder/arm/mcomp_arm.c
 
 VP8_CX_SRCS_REMOVE-$(HAVE_ARMV5TE)  += encoder/boolhuff.c
 
diff --git a/vpxenc.c b/vpxenc.c
index a178782..bc69f5d 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -430,6 +430,8 @@
 
 struct EbmlGlobal
 {
+    int debug;
+
     FILE    *stream;
     uint64_t last_pts_ms;
     vpx_rational_t  framerate;
@@ -552,8 +554,10 @@
         Ebml_SerializeUnsigned(ebml, TimecodeScale, 1000000);
         Ebml_SerializeFloat(ebml, Segment_Duration,
                             ebml->last_pts_ms + frame_time);
-        Ebml_SerializeString(ebml, 0x4D80, "vpxenc" VERSION_STRING);
-        Ebml_SerializeString(ebml, 0x5741, "vpxenc" VERSION_STRING);
+        Ebml_SerializeString(ebml, 0x4D80,
+            ebml->debug ? "vpxenc" : "vpxenc" VERSION_STRING);
+        Ebml_SerializeString(ebml, 0x5741,
+            ebml->debug ? "vpxenc" : "vpxenc" VERSION_STRING);
         Ebml_EndSubElement(ebml, &startInfo);
     }
 }
@@ -740,6 +744,8 @@
 
 #include "args.h"
 
+static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
+        "Debug mode (makes output deterministic)");
 static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
         "Output filename");
 static const arg_def_t use_yv12 = ARG_DEF(NULL, "yv12", 0,
@@ -774,6 +780,7 @@
         "Output IVF (default is WebM)");
 static const arg_def_t *main_args[] =
 {
+    &debugmode,
     &outputfile, &codecarg, &passes, &pass_arg, &fpf_name, &limit, &deadline,
     &best_dl, &good_dl, &rt_dl,
     &verbosearg, &psnrarg, &use_ivf, &framerate,
@@ -1052,6 +1059,8 @@
             write_webm = 0;
         else if (arg_match(&arg, &outputfile, argi))
             out_fn = arg.val;
+        else if (arg_match(&arg, &debugmode, argi))
+            ebml.debug = 1;
         else
             argj++;
     }