Merge "[svc] Disable tiles for spatial svc case"
diff --git a/args.c b/args.c
index 9dabc9b..14b0310 100644
--- a/args.c
+++ b/args.c
@@ -14,9 +14,7 @@
 #include <limits.h>
 #include "args.h"
 
-#ifdef _MSC_VER
-#define snprintf _snprintf
-#endif
+#include "vpx_ports/msvc.h"
 
 #if defined(__GNUC__) && __GNUC__
 extern void die(const char *fmt, ...) __attribute__((noreturn));
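For context, this hunk moves the MSVC snprintf shim out of args.c and into a
shared header. A minimal sketch of what vpx_ports/msvc.h is expected to
contain (the exact contents, including the version guard, are an assumption
here):

    /* sketch of vpx_ports/msvc.h; VS2015 (_MSC_VER >= 1900) ships a
       conforming snprintf, so the define is only needed on older MSVC. */
    #ifndef VPX_PORTS_MSVC_H_
    #define VPX_PORTS_MSVC_H_
    #ifdef _MSC_VER
    #if _MSC_VER < 1900
    #define snprintf _snprintf
    #endif  /* _MSC_VER < 1900 */
    #endif  /* _MSC_VER */
    #endif  /* VPX_PORTS_MSVC_H_ */

Each tool now includes the one header instead of repeating the #ifdef locally.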
diff --git a/configure b/configure
index 310c42a..d3d6e67 100755
--- a/configure
+++ b/configure
@@ -184,6 +184,10 @@
     [ $doxy_minor -eq 5 ] && [ $doxy_patch -ge 3 ] && enable_feature doxygen
 fi
 
+# disable codecs when their source directory does not exist
+[ -d "${source_path}/vp8" ] || disable_feature vp8
+[ -d "${source_path}/vp9" ] || disable_feature vp9
+
 # install everything except the sources, by default. sources will have
 # to be enabled when doing dist builds, since that's no longer a common
 # case.
@@ -199,31 +203,16 @@
 enable_feature os_support
 enable_feature temporal_denoising
 
-[ -d "${source_path}/../include" ] && enable_feature alt_tree_layout
-for d in vp8 vp9; do
-    [ -d "${source_path}/${d}" ] && disable_feature alt_tree_layout;
-done
-
-if ! enabled alt_tree_layout; then
-# development environment
-[ -d "${source_path}/vp8" ] && CODECS="${CODECS} vp8_encoder vp8_decoder"
-[ -d "${source_path}/vp9" ] && CODECS="${CODECS} vp9_encoder vp9_decoder"
-else
-# customer environment
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] && CODECS="${CODECS} vp8_encoder"
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] && CODECS="${CODECS} vp8_decoder"
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] && CODECS="${CODECS} vp9_encoder"
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] && CODECS="${CODECS} vp9_decoder"
-[ -f "${source_path}/../include/vpx/vp8cx.h" ] || disable_feature vp8_encoder
-[ -f "${source_path}/../include/vpx/vp8dx.h" ] || disable_feature vp8_decoder
-[ -f "${source_path}/../include/vpx/vp9cx.h" ] || disable_feature vp9_encoder
-[ -f "${source_path}/../include/vpx/vp9dx.h" ] || disable_feature vp9_decoder
-
-[ -f "${source_path}/../lib/*/*mt.lib" ] && soft_enable static_msvcrt
-fi
-
-CODECS="$(echo ${CODECS} | tr ' ' '\n')"
-CODEC_FAMILIES="$(for c in ${CODECS}; do echo ${c%_*}; done | sort | uniq)"
+CODECS="
+    vp8_encoder
+    vp8_decoder
+    vp9_encoder
+    vp9_decoder
+"
+CODEC_FAMILIES="
+    vp8
+    vp9
+"
 
 ARCH_LIST="
     arm
@@ -255,7 +244,6 @@
     ${ARCH_EXT_LIST}
     vpx_ports
     stdint_h
-    alt_tree_layout
     pthread_h
     sys_mman_h
     unistd_h
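The new directory checks surface in the generated vpx_config.h rather than in
the now-static CODECS list; with vp9/ absent, the generated header would read
roughly (illustrative excerpt):

    /* excerpt of a generated vpx_config.h */
    #define CONFIG_VP8 1
    #define CONFIG_VP9 0  /* source directory missing, feature disabled */

so a codec can stay in CODECS/CODEC_FAMILIES unconditionally and still be
compiled out when its sources are not present.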
diff --git a/examples.mk b/examples.mk
index b92507a..174c71d 100644
--- a/examples.mk
+++ b/examples.mk
@@ -56,6 +56,7 @@
 vpxdec.SRCS                 += md5_utils.c md5_utils.h
 vpxdec.SRCS                 += vpx_ports/mem_ops.h
 vpxdec.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxdec.SRCS                 += vpx_ports/msvc.h
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
@@ -80,6 +81,7 @@
 vpxenc.SRCS                 += warnings.c warnings.h
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
 vpxenc.SRCS                 += vpx_ports/mem_ops_aligned.h
+vpxenc.SRCS                 += vpx_ports/msvc.h
 vpxenc.SRCS                 += vpx_ports/vpx_timer.h
 vpxenc.SRCS                 += vpxstats.c vpxstats.h
 ifeq ($(CONFIG_LIBYUV),yes)
@@ -98,6 +100,7 @@
   vp9_spatial_svc_encoder.SRCS        += tools_common.c tools_common.h
   vp9_spatial_svc_encoder.SRCS        += video_common.h
   vp9_spatial_svc_encoder.SRCS        += video_writer.h video_writer.c
+  vp9_spatial_svc_encoder.SRCS        += vpx_ports/msvc.h
   vp9_spatial_svc_encoder.SRCS        += vpxstats.c vpxstats.h
   vp9_spatial_svc_encoder.GUID        = 4A38598D-627D-4505-9C7B-D4020C84100D
   vp9_spatial_svc_encoder.DESCRIPTION = VP9 Spatial SVC Encoder
@@ -112,6 +115,7 @@
 vpx_temporal_svc_encoder.SRCS        += tools_common.c tools_common.h
 vpx_temporal_svc_encoder.SRCS        += video_common.h
 vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
+vpx_temporal_svc_encoder.SRCS        += vpx_ports/msvc.h
 vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
 vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
 EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
@@ -122,6 +126,7 @@
 simple_decoder.SRCS                += video_reader.h video_reader.c
 simple_decoder.SRCS                += vpx_ports/mem_ops.h
 simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
+simple_decoder.SRCS                += vpx_ports/msvc.h
 simple_decoder.DESCRIPTION          = Simplified decoder loop
 EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
 postproc.SRCS                      += ivfdec.h ivfdec.c
@@ -130,6 +135,7 @@
 postproc.SRCS                      += video_reader.h video_reader.c
 postproc.SRCS                      += vpx_ports/mem_ops.h
 postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
+postproc.SRCS                      += vpx_ports/msvc.h
 postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
 postproc.DESCRIPTION                = Decoder postprocessor control
 EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
@@ -140,6 +146,7 @@
 decode_to_md5.SRCS                 += video_reader.h video_reader.c
 decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
 decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
+decode_to_md5.SRCS                 += vpx_ports/msvc.h
 decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
 decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
 EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
@@ -147,6 +154,7 @@
 simple_encoder.SRCS             += tools_common.h tools_common.c
 simple_encoder.SRCS             += video_common.h
 simple_encoder.SRCS             += video_writer.h video_writer.c
+simple_encoder.SRCS             += vpx_ports/msvc.h
 simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
 simple_encoder.DESCRIPTION       = Simplified encoder loop
 EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
@@ -154,6 +162,7 @@
 vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
 vp9_lossless_encoder.SRCS       += video_common.h
 vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
+vp9_lossless_encoder.SRCS       += vpx_ports/msvc.h
 vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
 vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
 EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
@@ -161,6 +170,7 @@
 twopass_encoder.SRCS            += tools_common.h tools_common.c
 twopass_encoder.SRCS            += video_common.h
 twopass_encoder.SRCS            += video_writer.h video_writer.c
+twopass_encoder.SRCS            += vpx_ports/msvc.h
 twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
 twopass_encoder.DESCRIPTION      = Two-pass encoder loop
 EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
@@ -170,6 +180,7 @@
 decode_with_drops.SRCS          += video_reader.h video_reader.c
 decode_with_drops.SRCS          += vpx_ports/mem_ops.h
 decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
+decode_with_drops.SRCS          += vpx_ports/msvc.h
 decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
 decode_with_drops.DESCRIPTION    = Drops frames while decoding
 EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
@@ -177,6 +188,7 @@
 set_maps.SRCS                      += tools_common.h tools_common.c
 set_maps.SRCS                      += video_common.h
 set_maps.SRCS                      += video_writer.h video_writer.c
+set_maps.SRCS                      += vpx_ports/msvc.h
 set_maps.GUID                       = ECB2D24D-98B8-4015-A465-A4AF3DCC145F
 set_maps.DESCRIPTION                = Set active and ROI maps
 EXAMPLES-$(CONFIG_VP8_ENCODER)     += vp8cx_set_ref.c
@@ -184,6 +196,7 @@
 vp8cx_set_ref.SRCS                 += tools_common.h tools_common.c
 vp8cx_set_ref.SRCS                 += video_common.h
 vp8cx_set_ref.SRCS                 += video_writer.h video_writer.c
+vp8cx_set_ref.SRCS                 += vpx_ports/msvc.h
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
@@ -194,6 +207,7 @@
 vp8_multi_resolution_encoder.SRCS       += ivfenc.h ivfenc.c
 vp8_multi_resolution_encoder.SRCS       += tools_common.h tools_common.c
 vp8_multi_resolution_encoder.SRCS       += video_writer.h video_writer.c
+vp8_multi_resolution_encoder.SRCS       += vpx_ports/msvc.h
 vp8_multi_resolution_encoder.SRCS       += $(LIBYUV_SRCS)
 vp8_multi_resolution_encoder.GUID        = 04f8738e-63c8-423b-90fa-7c2703a374de
 vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 48a8006..b37d8e3 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -20,6 +20,7 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 5c0b09b..4c12bb4 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -20,6 +20,7 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
diff --git a/test/test_vectors.cc b/test/test_vectors.cc
index 07d306f..434a382 100644
--- a/test/test_vectors.cc
+++ b/test/test_vectors.cc
@@ -165,7 +165,10 @@
   "vp90-2-11-size-351x287.webm", "vp90-2-11-size-351x288.webm",
   "vp90-2-11-size-352x287.webm", "vp90-2-12-droppable_1.ivf",
   "vp90-2-12-droppable_2.ivf", "vp90-2-12-droppable_3.ivf",
+#if !CONFIG_SIZE_LIMIT || \
+    (DECODE_WIDTH_LIMIT >= 20400 && DECODE_HEIGHT_LIMIT >= 120)
   "vp90-2-13-largescaling.webm",
+#endif
   "vp90-2-14-resize-fp-tiles-1-16.webm",
   "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
   "vp90-2-14-resize-fp-tiles-1-2.webm", "vp90-2-14-resize-fp-tiles-1-4.webm",
diff --git a/test/variance_test.cc b/test/variance_test.cc
index e4e27af..e45d90f 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -21,22 +21,45 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#if CONFIG_VP8_ENCODER
-# include "./vp8_rtcd.h"
-# include "vp8/common/variance.h"
-#endif
 #if CONFIG_VP9_ENCODER
 # include "./vp9_rtcd.h"
 # include "vp9/encoder/vp9_variance.h"
-#endif
+#endif  // CONFIG_VP9_ENCODER
+#include "./vpx_dsp_rtcd.h"
 
 namespace {
 
+typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
+                                        unsigned int *sse);
+typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride);
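+
+// These codec-agnostic typedefs replace the per-codec vp8_variance_fn_t
+// and vp8_sse_fn_t removed below, so one set of tests covers the merged
+// vpx_dsp implementations across all ISA variants.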
+
 using ::std::tr1::get;
 using ::std::tr1::make_tuple;
 using ::std::tr1::tuple;
 using libvpx_test::ACMRandom;
 
+// Truncate high bit depth results by downshifting (with rounding) by:
+// 2 * (bit_depth - 8) for sse
+// (bit_depth - 8) for se
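+// Example (illustrative values): at VPX_BITS_10 an sse of 1000 becomes
+// (1000 + 8) >> 4 = 63, i.e. ROUND_POWER_OF_TWO(1000, 4).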
+static void RoundHighBitDepth(int bit_depth, int64_t *se, uint64_t *sse) {
+  switch (bit_depth) {
+    case VPX_BITS_12:
+      *sse = (*sse + 128) >> 8;
+      *se = (*se + 8) >> 4;
+      break;
+    case VPX_BITS_10:
+      *sse = (*sse + 8) >> 4;
+      *se = (*se + 2) >> 2;
+      break;
+    case VPX_BITS_8:
+    default:
+      break;
+  }
+}
+
 static unsigned int mb_ss_ref(const int16_t *src) {
   unsigned int res = 0;
   for (int i = 0; i < 256; ++i) {
@@ -50,7 +73,6 @@
                                  int ref_stride_coeff, uint32_t *sse_ptr,
                                  bool use_high_bit_depth_,
                                  vpx_bit_depth_t bit_depth) {
-#if CONFIG_VP9_HIGHBITDEPTH
   int64_t se = 0;
   uint64_t sse = 0;
   const int w = 1 << l2w;
@@ -63,32 +85,17 @@
                src[w * y * src_stride_coeff + x];
         se += diff;
         sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         diff = CONVERT_TO_SHORTPTR(ref)[w * y * ref_stride_coeff + x] -
                CONVERT_TO_SHORTPTR(src)[w * y * src_stride_coeff + x];
         se += diff;
         sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
-  if (bit_depth > VPX_BITS_8) {
-    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
-    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
-  }
-#else
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      int diff = ref[w * y * ref_stride_coeff + x] -
-                 src[w * y * src_stride_coeff + x];
-      se += diff;
-      sse += diff * diff;
-    }
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+  RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
@@ -98,7 +105,6 @@
                                         unsigned int *sse_ptr,
                                         bool use_high_bit_depth_,
                                         vpx_bit_depth_t bit_depth) {
-#if CONFIG_VP9_HIGHBITDEPTH
   int64_t se = 0;
   uint64_t sse = 0;
   const int w = 1 << l2w;
@@ -117,6 +123,7 @@
         const int diff = r - src[w * y + x];
         se += diff;
         sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
         uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
@@ -130,34 +137,11 @@
         const int diff = r - src16[w * y + x];
         se += diff;
         sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
-  if (bit_depth > VPX_BITS_8) {
-    sse = ROUND_POWER_OF_TWO(sse, 2 * (bit_depth - 8));
-    se = ROUND_POWER_OF_TWO(se, bit_depth - 8);
-  }
-#else
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      // Bilinear interpolation at a 16th pel step.
-      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
-      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
-      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
-      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
-      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-      const int r = a + (((b - a) * yoff + 8) >> 4);
-      const int diff = r - src[w * y + x];
-      se += diff;
-      sse += diff * diff;
-    }
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+  RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
@@ -229,36 +213,30 @@
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
-#if CONFIG_VP9_HIGHBITDEPTH
     if (!use_high_bit_depth_) {
       src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
       ref_ = new uint8_t[block_size_ * 2];
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
       src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
           vpx_memalign(16, block_size_ * 2 * sizeof(uint16_t))));
       ref_ = CONVERT_TO_BYTEPTR(new uint16_t[block_size_ * 2]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_ * 2));
-    ref_ = new uint8_t[block_size_ * 2];
-#endif
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
   }
 
   virtual void TearDown() {
-#if CONFIG_VP9_HIGHBITDEPTH
     if (!use_high_bit_depth_) {
       vpx_free(src_);
       delete[] ref_;
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
       vpx_free(CONVERT_TO_SHORTPTR(src_));
       delete[] CONVERT_TO_SHORTPTR(ref_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    vpx_free(src_);
-    delete[] ref_;
-#endif
     libvpx_test::ClearSystemState();
   }
 
@@ -283,27 +261,23 @@
 template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::ZeroTest() {
   for (int i = 0; i <= 255; ++i) {
-#if CONFIG_VP9_HIGHBITDEPTH
     if (!use_high_bit_depth_) {
       memset(src_, i, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
       vpx_memset16(CONVERT_TO_SHORTPTR(src_), i << (bit_depth_ - 8),
                    block_size_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    memset(src_, i, block_size_);
-#endif
     for (int j = 0; j <= 255; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
       if (!use_high_bit_depth_) {
         memset(ref_, j, block_size_);
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         vpx_memset16(CONVERT_TO_SHORTPTR(ref_), j  << (bit_depth_ - 8),
                      block_size_);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
-#else
-      memset(ref_, j, block_size_);
-#endif
       unsigned int sse;
       unsigned int var;
       ASM_REGISTER_STATE_CHECK(
@@ -317,18 +291,15 @@
 void VarianceTest<VarianceFunctionType>::RefTest() {
   for (int i = 0; i < 10; ++i) {
     for (int j = 0; j < block_size_; j++) {
-#if CONFIG_VP9_HIGHBITDEPTH
     if (!use_high_bit_depth_) {
       src_[j] = rnd_.Rand8();
       ref_[j] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
      CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-      src_[j] = rnd_.Rand8();
-      ref_[j] = rnd_.Rand8();
-#endif
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -352,18 +323,15 @@
     for (int j = 0; j < block_size_; j++) {
       int ref_ind = (j / width_) * ref_stride_coeff * width_ + j % width_;
       int src_ind = (j / width_) * src_stride_coeff * width_ + j % width_;
-#if CONFIG_VP9_HIGHBITDEPTH
       if (!use_high_bit_depth_) {
         src_[src_ind] = rnd_.Rand8();
         ref_[ref_ind] = rnd_.Rand8();
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
        CONVERT_TO_SHORTPTR(src_)[src_ind] = rnd_.Rand16() & mask_;
        CONVERT_TO_SHORTPTR(ref_)[ref_ind] = rnd_.Rand16() & mask_;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
-#else
-      src_[src_ind] = rnd_.Rand8();
-      ref_[ref_ind] = rnd_.Rand8();
-#endif
     }
     unsigned int sse1, sse2;
     unsigned int var1;
@@ -383,22 +351,18 @@
 template<typename VarianceFunctionType>
 void VarianceTest<VarianceFunctionType>::OneQuarterTest() {
   const int half = block_size_ / 2;
-#if CONFIG_VP9_HIGHBITDEPTH
   if (!use_high_bit_depth_) {
     memset(src_, 255, block_size_);
     memset(ref_, 255, half);
     memset(ref_ + half, 0, half);
+#if CONFIG_VP9_HIGHBITDEPTH
   } else {
     vpx_memset16(CONVERT_TO_SHORTPTR(src_), 255 << (bit_depth_ - 8),
                  block_size_);
     vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 255 << (bit_depth_ - 8), half);
     vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, 0, half);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
   }
-#else
-  memset(src_, 255, block_size_);
-  memset(ref_, 255, half);
-  memset(ref_ + half, 0, half);
-#endif
   unsigned int sse;
   unsigned int var;
   ASM_REGISTER_STATE_CHECK(var = variance_(src_, width_, ref_, width_, &sse));
@@ -406,7 +370,6 @@
   EXPECT_EQ(expected, var);
 }
 
-#if CONFIG_VP8_ENCODER
 template<typename MseFunctionType>
 class MseTest
     : public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > {
@@ -500,9 +463,7 @@
   const unsigned int expected = block_size_ * 255 * 255;
   EXPECT_EQ(expected, var);
 }
-#endif
 
-#if CONFIG_VP9_ENCODER
 unsigned int subpel_avg_variance_ref(const uint8_t *ref,
                                      const uint8_t *src,
                                      const uint8_t *second_pred,
@@ -511,7 +472,6 @@
                                      unsigned int *sse_ptr,
                                      bool use_high_bit_depth,
                                      vpx_bit_depth_t bit_depth) {
-#if CONFIG_VP9_HIGHBITDEPTH
   int64_t se = 0;
   uint64_t sse = 0;
   const int w = 1 << l2w;
@@ -530,6 +490,7 @@
         const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
         se += diff;
         sse += diff * diff;
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         uint16_t *ref16 = CONVERT_TO_SHORTPTR(ref);
         uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
@@ -544,34 +505,11 @@
         const int diff = ((r + sec16[w * y + x] + 1) >> 1) - src16[w * y + x];
         se += diff;
         sse += diff * diff;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
-  if (bit_depth > 8) {
-    sse = ROUND_POWER_OF_TWO(sse, 2*(bit_depth-8));
-    se = ROUND_POWER_OF_TWO(se, bit_depth-8);
-  }
-#else
-  int se = 0;
-  unsigned int sse = 0;
-  const int w = 1 << l2w;
-  const int h = 1 << l2h;
-  for (int y = 0; y < h; y++) {
-    for (int x = 0; x < w; x++) {
-      // bilinear interpolation at a 16th pel step
-      const int a1 = ref[(w + 1) * (y + 0) + x + 0];
-      const int a2 = ref[(w + 1) * (y + 0) + x + 1];
-      const int b1 = ref[(w + 1) * (y + 1) + x + 0];
-      const int b2 = ref[(w + 1) * (y + 1) + x + 1];
-      const int a = a1 + (((a2 - a1) * xoff + 8) >> 4);
-      const int b = b1 + (((b2 - b1) * xoff + 8) >> 4);
-      const int r = a + (((b - a) * yoff + 8) >> 4);
-      const int diff = ((r + second_pred[w * y + x] + 1) >> 1) - src[w * y + x];
-      se += diff;
-      sse += diff * diff;
-    }
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+  RoundHighBitDepth(bit_depth, &se, &sse);
   *sse_ptr = sse;
   return sse - (((int64_t) se * se) >> (l2w + l2h));
 }
@@ -600,11 +538,11 @@
 
     rnd_.Reset(ACMRandom::DeterministicSeed());
     block_size_ = width_ * height_;
-#if CONFIG_VP9_HIGHBITDEPTH
     if (!use_high_bit_depth_) {
       src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
       sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
       ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
       src_ = CONVERT_TO_BYTEPTR(
           reinterpret_cast<uint16_t *>(
@@ -614,33 +552,25 @@
               vpx_memalign(16, block_size_*sizeof(uint16_t))));
       ref_ = CONVERT_TO_BYTEPTR(
           new uint16_t[block_size_ + width_ + height_ + 1]);
-    }
-#else
-    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-    sec_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
-    ref_ = new uint8_t[block_size_ + width_ + height_ + 1];
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
     ASSERT_TRUE(src_ != NULL);
     ASSERT_TRUE(sec_ != NULL);
     ASSERT_TRUE(ref_ != NULL);
   }
 
   virtual void TearDown() {
-#if CONFIG_VP9_HIGHBITDEPTH
     if (!use_high_bit_depth_) {
       vpx_free(src_);
       delete[] ref_;
       vpx_free(sec_);
+#if CONFIG_VP9_HIGHBITDEPTH
     } else {
       vpx_free(CONVERT_TO_SHORTPTR(src_));
       delete[] CONVERT_TO_SHORTPTR(ref_);
       vpx_free(CONVERT_TO_SHORTPTR(sec_));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-#else
-    vpx_free(src_);
-    delete[] ref_;
-    vpx_free(sec_);
-#endif
     libvpx_test::ClearSystemState();
   }
 
@@ -664,7 +594,6 @@
 void SubpelVarianceTest<SubpelVarianceFunctionType>::RefTest() {
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
-#if CONFIG_VP9_HIGHBITDEPTH
       if (!use_high_bit_depth_) {
         for (int j = 0; j < block_size_; j++) {
           src_[j] = rnd_.Rand8();
@@ -672,6 +601,7 @@
         for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         for (int j = 0; j < block_size_; j++) {
           CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
@@ -679,15 +609,8 @@
         for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
           CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
         }
-      }
-#else
-      for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd_.Rand8();
-      }
-      for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd_.Rand8();
-      }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(var1 = subpel_variance_(ref_, width_ + 1, x, y,
@@ -710,25 +633,20 @@
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
       const int half = block_size_ / 2;
-#if CONFIG_VP9_HIGHBITDEPTH
       if (!use_high_bit_depth_) {
         memset(src_, 0, half);
         memset(src_ + half, 255, half);
         memset(ref_, 255, half);
         memset(ref_ + half, 0, half + width_ + height_ + 1);
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         vpx_memset16(CONVERT_TO_SHORTPTR(src_), mask_, half);
         vpx_memset16(CONVERT_TO_SHORTPTR(src_) + half, 0, half);
         vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 0, half);
         vpx_memset16(CONVERT_TO_SHORTPTR(ref_) + half, mask_,
                      half + width_ + height_ + 1);
-      }
-#else
-      memset(src_, 0, half);
-      memset(src_ + half, 255, half);
-      memset(ref_, 255, half);
-      memset(ref_ + half, 0, half + width_ + height_ + 1);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
@@ -742,11 +660,11 @@
   }
 }
 
+#if CONFIG_VP9_ENCODER
 template<>
 void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 16; ++x) {
     for (int y = 0; y < 16; ++y) {
-#if CONFIG_VP9_HIGHBITDEPTH
       if (!use_high_bit_depth_) {
         for (int j = 0; j < block_size_; j++) {
           src_[j] = rnd_.Rand8();
@@ -755,6 +673,7 @@
         for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
           ref_[j] = rnd_.Rand8();
         }
+#if CONFIG_VP9_HIGHBITDEPTH
       } else {
         for (int j = 0; j < block_size_; j++) {
           CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask_;
@@ -763,16 +682,8 @@
         for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
           CONVERT_TO_SHORTPTR(ref_)[j] = rnd_.Rand16() & mask_;
         }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
       }
-#else
-      for (int j = 0; j < block_size_; j++) {
-        src_[j] = rnd_.Rand8();
-        sec_[j] = rnd_.Rand8();
-      }
-      for (int j = 0; j < block_size_ + width_ + height_ + 1; j++) {
-        ref_[j] = rnd_.Rand8();
-      }
-#endif
       unsigned int sse1, sse2;
       unsigned int var1;
       ASM_REGISTER_STATE_CHECK(
@@ -788,272 +699,407 @@
     }
   }
 }
-
 #endif  // CONFIG_VP9_ENCODER
 
-// -----------------------------------------------------------------------------
-// VP8 test cases.
+typedef MseTest<Get4x4SseFunc> VpxSseTest;
+typedef MseTest<VarianceMxNFunc> VpxMseTest;
+typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
 
-namespace vp8 {
-
-#if CONFIG_VP8_ENCODER
-typedef unsigned int (*vp8_sse_fn_t)(const unsigned char *src_ptr,
-    int source_stride, const unsigned char *ref_ptr, int  ref_stride);
-
-typedef MseTest<vp8_sse_fn_t> VP8SseTest;
-typedef MseTest<vp8_variance_fn_t> VP8MseTest;
-typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
-
-TEST_P(VP8SseTest, Ref_sse) { RefTest_sse(); }
-TEST_P(VP8SseTest, Max_sse) { MaxTest_sse(); }
-TEST_P(VP8MseTest, Ref_mse) { RefTest_mse(); }
-TEST_P(VP8MseTest, Max_mse) { MaxTest_mse(); }
-TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
-TEST_P(VP8VarianceTest, Ref) { RefTest(); }
-TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
-
-const vp8_sse_fn_t get4x4sse_cs_c = vp8_get4x4sse_cs_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP8SseTest,
-    ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
-
-const vp8_variance_fn_t mse16x16_c = vp8_mse16x16_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP8MseTest,
-    ::testing::Values(make_tuple(4, 4, mse16x16_c)));
-
-const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
-const vp8_variance_fn_t variance8x8_c = vp8_variance8x8_c;
-const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c;
-const vp8_variance_fn_t variance16x8_c = vp8_variance16x8_c;
-const vp8_variance_fn_t variance16x16_c = vp8_variance16x16_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
-                      make_tuple(3, 3, variance8x8_c, 0),
-                      make_tuple(3, 4, variance8x16_c, 0),
-                      make_tuple(4, 3, variance16x8_c, 0),
-                      make_tuple(4, 4, variance16x16_c, 0)));
-
-#if HAVE_NEON
-const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon;
-INSTANTIATE_TEST_CASE_P(
-    NEON, VP8SseTest,
-    ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
-
-const vp8_variance_fn_t mse16x16_neon = vp8_mse16x16_neon;
-INSTANTIATE_TEST_CASE_P(
-    NEON, VP8MseTest,
-    ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
-
-const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon;
-const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon;
-const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon;
-const vp8_variance_fn_t variance16x16_neon = vp8_variance16x16_neon;
-INSTANTIATE_TEST_CASE_P(
-    NEON, VP8VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
-                      make_tuple(3, 4, variance8x16_neon, 0),
-                      make_tuple(4, 3, variance16x8_neon, 0),
-                      make_tuple(4, 4, variance16x16_neon, 0)));
-#endif
-
-#if HAVE_MMX
-const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
-const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
-const vp8_variance_fn_t variance8x16_mmx = vp8_variance8x16_mmx;
-const vp8_variance_fn_t variance16x8_mmx = vp8_variance16x8_mmx;
-const vp8_variance_fn_t variance16x16_mmx = vp8_variance16x16_mmx;
-INSTANTIATE_TEST_CASE_P(
-    MMX, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_mmx, 0),
-                      make_tuple(3, 3, variance8x8_mmx, 0),
-                      make_tuple(3, 4, variance8x16_mmx, 0),
-                      make_tuple(4, 3, variance16x8_mmx, 0),
-                      make_tuple(4, 4, variance16x16_mmx, 0)));
-#endif
-
-#if HAVE_SSE2
-const vp8_variance_fn_t variance4x4_wmt = vp8_variance4x4_wmt;
-const vp8_variance_fn_t variance8x8_wmt = vp8_variance8x8_wmt;
-const vp8_variance_fn_t variance8x16_wmt = vp8_variance8x16_wmt;
-const vp8_variance_fn_t variance16x8_wmt = vp8_variance16x8_wmt;
-const vp8_variance_fn_t variance16x16_wmt = vp8_variance16x16_wmt;
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP8VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_wmt, 0),
-                      make_tuple(3, 3, variance8x8_wmt, 0),
-                      make_tuple(3, 4, variance8x16_wmt, 0),
-                      make_tuple(4, 3, variance16x8_wmt, 0),
-                      make_tuple(4, 4, variance16x16_wmt, 0)));
-#endif
-#endif  // CONFIG_VP8_ENCODER
-
-}  // namespace vp8
-
-// -----------------------------------------------------------------------------
-// VP9 test cases.
-
-namespace vp9 {
-
-#if CONFIG_VP9_ENCODER
+TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
+TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
+TEST_P(VpxMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 
 INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
-                        ::testing::Values(vp9_get_mb_ss_c));
+                        ::testing::Values(vpx_get_mb_ss_c));
 
-typedef VarianceTest<vp9_variance_fn_t> VP9VarianceTest;
+const Get4x4SseFunc get4x4sse_cs_c = vpx_get4x4sse_cs_c;
+INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
+
+const VarianceMxNFunc mse16x16_c = vpx_mse16x16_c;
+const VarianceMxNFunc mse16x8_c = vpx_mse16x8_c;
+const VarianceMxNFunc mse8x16_c = vpx_mse8x16_c;
+const VarianceMxNFunc mse8x8_c = vpx_mse8x8_c;
+INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_c),
+                                          make_tuple(4, 3, mse16x8_c),
+                                          make_tuple(3, 4, mse8x16_c),
+                                          make_tuple(3, 3, mse8x8_c)));
+
+const VarianceMxNFunc variance64x64_c = vpx_variance64x64_c;
+const VarianceMxNFunc variance64x32_c = vpx_variance64x32_c;
+const VarianceMxNFunc variance32x64_c = vpx_variance32x64_c;
+const VarianceMxNFunc variance32x32_c = vpx_variance32x32_c;
+const VarianceMxNFunc variance32x16_c = vpx_variance32x16_c;
+const VarianceMxNFunc variance16x32_c = vpx_variance16x32_c;
+const VarianceMxNFunc variance16x16_c = vpx_variance16x16_c;
+const VarianceMxNFunc variance16x8_c = vpx_variance16x8_c;
+const VarianceMxNFunc variance8x16_c = vpx_variance8x16_c;
+const VarianceMxNFunc variance8x8_c = vpx_variance8x8_c;
+const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
+const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
+const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
+
+INSTANTIATE_TEST_CASE_P(
+    C, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
+                      make_tuple(6, 5, variance64x32_c, 0),
+                      make_tuple(5, 6, variance32x64_c, 0),
+                      make_tuple(5, 5, variance32x32_c, 0),
+                      make_tuple(5, 4, variance32x16_c, 0),
+                      make_tuple(4, 5, variance16x32_c, 0),
+                      make_tuple(4, 4, variance16x16_c, 0),
+                      make_tuple(4, 3, variance16x8_c, 0),
+                      make_tuple(3, 4, variance8x16_c, 0),
+                      make_tuple(3, 3, variance8x8_c, 0),
+                      make_tuple(3, 2, variance8x4_c, 0),
+                      make_tuple(2, 3, variance4x8_c, 0),
+                      make_tuple(2, 2, variance4x4_c, 0)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
+typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+
+TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
+TEST_P(VpxHBDVarianceTest, Zero) { ZeroTest(); }
+TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
+TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+
+/* TODO(debargha): This test does not support the highbd version
+const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
+const VarianceMxNFunc highbd_12_mse16x8_c = vpx_highbd_12_mse16x8_c;
+const VarianceMxNFunc highbd_12_mse8x16_c = vpx_highbd_12_mse8x16_c;
+const VarianceMxNFunc highbd_12_mse8x8_c = vpx_highbd_12_mse8x8_c;
+
+const VarianceMxNFunc highbd_10_mse16x16_c = vpx_highbd_10_mse16x16_c;
+const VarianceMxNFunc highbd_10_mse16x8_c = vpx_highbd_10_mse16x8_c;
+const VarianceMxNFunc highbd_10_mse8x16_c = vpx_highbd_10_mse8x16_c;
+const VarianceMxNFunc highbd_10_mse8x8_c = vpx_highbd_10_mse8x8_c;
+
+const VarianceMxNFunc highbd_8_mse16x16_c = vpx_highbd_8_mse16x16_c;
+const VarianceMxNFunc highbd_8_mse16x8_c = vpx_highbd_8_mse16x8_c;
+const VarianceMxNFunc highbd_8_mse8x16_c = vpx_highbd_8_mse8x16_c;
+const VarianceMxNFunc highbd_8_mse8x8_c = vpx_highbd_8_mse8x8_c;
+
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_c),
+                                        make_tuple(4, 4, highbd_12_mse16x8_c),
+                                        make_tuple(4, 4, highbd_12_mse8x16_c),
+                                        make_tuple(4, 4, highbd_12_mse8x8_c),
+                                        make_tuple(4, 4, highbd_10_mse16x16_c),
+                                        make_tuple(4, 4, highbd_10_mse16x8_c),
+                                        make_tuple(4, 4, highbd_10_mse8x16_c),
+                                        make_tuple(4, 4, highbd_10_mse8x8_c),
+                                        make_tuple(4, 4, highbd_8_mse16x16_c),
+                                        make_tuple(4, 4, highbd_8_mse16x8_c),
+                                        make_tuple(4, 4, highbd_8_mse8x16_c),
+                                        make_tuple(4, 4, highbd_8_mse8x8_c)));
+*/
+
+const VarianceMxNFunc highbd_12_variance64x64_c = vpx_highbd_12_variance64x64_c;
+const VarianceMxNFunc highbd_12_variance64x32_c = vpx_highbd_12_variance64x32_c;
+const VarianceMxNFunc highbd_12_variance32x64_c = vpx_highbd_12_variance32x64_c;
+const VarianceMxNFunc highbd_12_variance32x32_c = vpx_highbd_12_variance32x32_c;
+const VarianceMxNFunc highbd_12_variance32x16_c = vpx_highbd_12_variance32x16_c;
+const VarianceMxNFunc highbd_12_variance16x32_c = vpx_highbd_12_variance16x32_c;
+const VarianceMxNFunc highbd_12_variance16x16_c = vpx_highbd_12_variance16x16_c;
+const VarianceMxNFunc highbd_12_variance16x8_c = vpx_highbd_12_variance16x8_c;
+const VarianceMxNFunc highbd_12_variance8x16_c = vpx_highbd_12_variance8x16_c;
+const VarianceMxNFunc highbd_12_variance8x8_c = vpx_highbd_12_variance8x8_c;
+const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
+const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
+const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
+
+const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
+const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
+const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
+const VarianceMxNFunc highbd_10_variance32x32_c = vpx_highbd_10_variance32x32_c;
+const VarianceMxNFunc highbd_10_variance32x16_c = vpx_highbd_10_variance32x16_c;
+const VarianceMxNFunc highbd_10_variance16x32_c = vpx_highbd_10_variance16x32_c;
+const VarianceMxNFunc highbd_10_variance16x16_c = vpx_highbd_10_variance16x16_c;
+const VarianceMxNFunc highbd_10_variance16x8_c = vpx_highbd_10_variance16x8_c;
+const VarianceMxNFunc highbd_10_variance8x16_c = vpx_highbd_10_variance8x16_c;
+const VarianceMxNFunc highbd_10_variance8x8_c = vpx_highbd_10_variance8x8_c;
+const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
+const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
+const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
+
+const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
+const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
+const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
+const VarianceMxNFunc highbd_8_variance32x32_c = vpx_highbd_8_variance32x32_c;
+const VarianceMxNFunc highbd_8_variance32x16_c = vpx_highbd_8_variance32x16_c;
+const VarianceMxNFunc highbd_8_variance16x32_c = vpx_highbd_8_variance16x32_c;
+const VarianceMxNFunc highbd_8_variance16x16_c = vpx_highbd_8_variance16x16_c;
+const VarianceMxNFunc highbd_8_variance16x8_c = vpx_highbd_8_variance16x8_c;
+const VarianceMxNFunc highbd_8_variance8x16_c = vpx_highbd_8_variance8x16_c;
+const VarianceMxNFunc highbd_8_variance8x8_c = vpx_highbd_8_variance8x8_c;
+const VarianceMxNFunc highbd_8_variance8x4_c = vpx_highbd_8_variance8x4_c;
+const VarianceMxNFunc highbd_8_variance4x8_c = vpx_highbd_8_variance4x8_c;
+const VarianceMxNFunc highbd_8_variance4x4_c = vpx_highbd_8_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_c, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_c, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_c, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_c, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_c, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_c, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_c, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_c, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_c, 12),
+                      make_tuple(3, 3, highbd_12_variance8x8_c, 12),
+                      make_tuple(3, 2, highbd_12_variance8x4_c, 12),
+                      make_tuple(2, 3, highbd_12_variance4x8_c, 12),
+                      make_tuple(2, 2, highbd_12_variance4x4_c, 12),
+                      make_tuple(6, 6, highbd_10_variance64x64_c, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_c, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_c, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_c, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_c, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_c, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_c, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_c, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_c, 10),
+                      make_tuple(3, 3, highbd_10_variance8x8_c, 10),
+                      make_tuple(3, 2, highbd_10_variance8x4_c, 10),
+                      make_tuple(2, 3, highbd_10_variance4x8_c, 10),
+                      make_tuple(2, 2, highbd_10_variance4x4_c, 10),
+                      make_tuple(6, 6, highbd_8_variance64x64_c, 8),
+                      make_tuple(6, 5, highbd_8_variance64x32_c, 8),
+                      make_tuple(5, 6, highbd_8_variance32x64_c, 8),
+                      make_tuple(5, 5, highbd_8_variance32x32_c, 8),
+                      make_tuple(5, 4, highbd_8_variance32x16_c, 8),
+                      make_tuple(4, 5, highbd_8_variance16x32_c, 8),
+                      make_tuple(4, 4, highbd_8_variance16x16_c, 8),
+                      make_tuple(4, 3, highbd_8_variance16x8_c, 8),
+                      make_tuple(3, 4, highbd_8_variance8x16_c, 8),
+                      make_tuple(3, 3, highbd_8_variance8x8_c, 8),
+                      make_tuple(3, 2, highbd_8_variance8x4_c, 8),
+                      make_tuple(2, 3, highbd_8_variance4x8_c, 8),
+                      make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if HAVE_MMX
+const VarianceMxNFunc mse16x16_mmx = vpx_mse16x16_mmx;
+INSTANTIATE_TEST_CASE_P(MMX, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_mmx)));
+
+INSTANTIATE_TEST_CASE_P(MMX, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_mmx));
+
+const VarianceMxNFunc variance16x16_mmx = vpx_variance16x16_mmx;
+const VarianceMxNFunc variance16x8_mmx = vpx_variance16x8_mmx;
+const VarianceMxNFunc variance8x16_mmx = vpx_variance8x16_mmx;
+const VarianceMxNFunc variance8x8_mmx = vpx_variance8x8_mmx;
+const VarianceMxNFunc variance4x4_mmx = vpx_variance4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_mmx, 0),
+                      make_tuple(4, 3, variance16x8_mmx, 0),
+                      make_tuple(3, 4, variance8x16_mmx, 0),
+                      make_tuple(3, 3, variance8x8_mmx, 0),
+                      make_tuple(2, 2, variance4x4_mmx, 0)));
+#endif  // HAVE_MMX
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
+                        ::testing::Values(vpx_get_mb_ss_sse2));
+
+const VarianceMxNFunc mse16x16_sse2 = vpx_mse16x16_sse2;
+const VarianceMxNFunc mse16x8_sse2 = vpx_mse16x8_sse2;
+const VarianceMxNFunc mse8x16_sse2 = vpx_mse8x16_sse2;
+const VarianceMxNFunc mse8x8_sse2 = vpx_mse8x8_sse2;
+INSTANTIATE_TEST_CASE_P(SSE2, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_sse2),
+                                          make_tuple(4, 3, mse16x8_sse2),
+                                          make_tuple(3, 4, mse8x16_sse2),
+                                          make_tuple(3, 3, mse8x8_sse2)));
+
+const VarianceMxNFunc variance64x64_sse2 = vpx_variance64x64_sse2;
+const VarianceMxNFunc variance64x32_sse2 = vpx_variance64x32_sse2;
+const VarianceMxNFunc variance32x64_sse2 = vpx_variance32x64_sse2;
+const VarianceMxNFunc variance32x32_sse2 = vpx_variance32x32_sse2;
+const VarianceMxNFunc variance32x16_sse2 = vpx_variance32x16_sse2;
+const VarianceMxNFunc variance16x32_sse2 = vpx_variance16x32_sse2;
+const VarianceMxNFunc variance16x16_sse2 = vpx_variance16x16_sse2;
+const VarianceMxNFunc variance16x8_sse2 = vpx_variance16x8_sse2;
+const VarianceMxNFunc variance8x16_sse2 = vpx_variance8x16_sse2;
+const VarianceMxNFunc variance8x8_sse2 = vpx_variance8x8_sse2;
+const VarianceMxNFunc variance8x4_sse2 = vpx_variance8x4_sse2;
+const VarianceMxNFunc variance4x8_sse2 = vpx_variance4x8_sse2;
+const VarianceMxNFunc variance4x4_sse2 = vpx_variance4x4_sse2;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_sse2, 0),
+                      make_tuple(6, 5, variance64x32_sse2, 0),
+                      make_tuple(5, 6, variance32x64_sse2, 0),
+                      make_tuple(5, 5, variance32x32_sse2, 0),
+                      make_tuple(5, 4, variance32x16_sse2, 0),
+                      make_tuple(4, 5, variance16x32_sse2, 0),
+                      make_tuple(4, 4, variance16x16_sse2, 0),
+                      make_tuple(4, 3, variance16x8_sse2, 0),
+                      make_tuple(3, 4, variance8x16_sse2, 0),
+                      make_tuple(3, 3, variance8x8_sse2, 0),
+                      make_tuple(3, 2, variance8x4_sse2, 0),
+                      make_tuple(2, 3, variance4x8_sse2, 0),
+                      make_tuple(2, 2, variance4x4_sse2, 0)));
+#if CONFIG_VP9_HIGHBITDEPTH
+/* TODO(debargha): This test does not support the highbd version
+const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
+const VarianceMxNFunc highbd_12_mse16x8_sse2 = vpx_highbd_12_mse16x8_sse2;
+const VarianceMxNFunc highbd_12_mse8x16_sse2 = vpx_highbd_12_mse8x16_sse2;
+const VarianceMxNFunc highbd_12_mse8x8_sse2 = vpx_highbd_12_mse8x8_sse2;
+
+const VarianceMxNFunc highbd_10_mse16x16_sse2 = vpx_highbd_10_mse16x16_sse2;
+const VarianceMxNFunc highbd_10_mse16x8_sse2 = vpx_highbd_10_mse16x8_sse2;
+const VarianceMxNFunc highbd_10_mse8x16_sse2 = vpx_highbd_10_mse8x16_sse2;
+const VarianceMxNFunc highbd_10_mse8x8_sse2 = vpx_highbd_10_mse8x8_sse2;
+
+const VarianceMxNFunc highbd_8_mse16x16_sse2 = vpx_highbd_8_mse16x16_sse2;
+const VarianceMxNFunc highbd_8_mse16x8_sse2 = vpx_highbd_8_mse16x8_sse2;
+const VarianceMxNFunc highbd_8_mse8x16_sse2 = vpx_highbd_8_mse8x16_sse2;
+const VarianceMxNFunc highbd_8_mse8x8_sse2 = vpx_highbd_8_mse8x8_sse2;
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDMseTest, ::testing::Values(make_tuple(4, 4, highbd_12_mse16x16_sse2),
+                                           make_tuple(4, 3, highbd_12_mse16x8_sse2),
+                                           make_tuple(3, 4, highbd_12_mse8x16_sse2),
+                                           make_tuple(3, 3, highbd_12_mse8x8_sse2),
+                                           make_tuple(4, 4, highbd_10_mse16x16_sse2),
+                                           make_tuple(4, 3, highbd_10_mse16x8_sse2),
+                                           make_tuple(3, 4, highbd_10_mse8x16_sse2),
+                                           make_tuple(3, 3, highbd_10_mse8x8_sse2),
+                                           make_tuple(4, 4, highbd_8_mse16x16_sse2),
+                                           make_tuple(4, 3, highbd_8_mse16x8_sse2),
+                                           make_tuple(3, 4, highbd_8_mse8x16_sse2),
+                                           make_tuple(3, 3, highbd_8_mse8x8_sse2)));
+*/
+
+const VarianceMxNFunc highbd_12_variance64x64_sse2 =
+    vpx_highbd_12_variance64x64_sse2;
+const VarianceMxNFunc highbd_12_variance64x32_sse2 =
+    vpx_highbd_12_variance64x32_sse2;
+const VarianceMxNFunc highbd_12_variance32x64_sse2 =
+    vpx_highbd_12_variance32x64_sse2;
+const VarianceMxNFunc highbd_12_variance32x32_sse2 =
+    vpx_highbd_12_variance32x32_sse2;
+const VarianceMxNFunc highbd_12_variance32x16_sse2 =
+    vpx_highbd_12_variance32x16_sse2;
+const VarianceMxNFunc highbd_12_variance16x32_sse2 =
+    vpx_highbd_12_variance16x32_sse2;
+const VarianceMxNFunc highbd_12_variance16x16_sse2 =
+    vpx_highbd_12_variance16x16_sse2;
+const VarianceMxNFunc highbd_12_variance16x8_sse2 =
+    vpx_highbd_12_variance16x8_sse2;
+const VarianceMxNFunc highbd_12_variance8x16_sse2 =
+    vpx_highbd_12_variance8x16_sse2;
+const VarianceMxNFunc highbd_12_variance8x8_sse2 =
+    vpx_highbd_12_variance8x8_sse2;
+const VarianceMxNFunc highbd_10_variance64x64_sse2 =
+    vpx_highbd_10_variance64x64_sse2;
+const VarianceMxNFunc highbd_10_variance64x32_sse2 =
+    vpx_highbd_10_variance64x32_sse2;
+const VarianceMxNFunc highbd_10_variance32x64_sse2 =
+    vpx_highbd_10_variance32x64_sse2;
+const VarianceMxNFunc highbd_10_variance32x32_sse2 =
+    vpx_highbd_10_variance32x32_sse2;
+const VarianceMxNFunc highbd_10_variance32x16_sse2 =
+    vpx_highbd_10_variance32x16_sse2;
+const VarianceMxNFunc highbd_10_variance16x32_sse2 =
+    vpx_highbd_10_variance16x32_sse2;
+const VarianceMxNFunc highbd_10_variance16x16_sse2 =
+    vpx_highbd_10_variance16x16_sse2;
+const VarianceMxNFunc highbd_10_variance16x8_sse2 =
+    vpx_highbd_10_variance16x8_sse2;
+const VarianceMxNFunc highbd_10_variance8x16_sse2 =
+    vpx_highbd_10_variance8x16_sse2;
+const VarianceMxNFunc highbd_10_variance8x8_sse2 =
+    vpx_highbd_10_variance8x8_sse2;
+const VarianceMxNFunc highbd_8_variance64x64_sse2 =
+    vpx_highbd_8_variance64x64_sse2;
+const VarianceMxNFunc highbd_8_variance64x32_sse2 =
+    vpx_highbd_8_variance64x32_sse2;
+const VarianceMxNFunc highbd_8_variance32x64_sse2 =
+    vpx_highbd_8_variance32x64_sse2;
+const VarianceMxNFunc highbd_8_variance32x32_sse2 =
+    vpx_highbd_8_variance32x32_sse2;
+const VarianceMxNFunc highbd_8_variance32x16_sse2 =
+    vpx_highbd_8_variance32x16_sse2;
+const VarianceMxNFunc highbd_8_variance16x32_sse2 =
+    vpx_highbd_8_variance16x32_sse2;
+const VarianceMxNFunc highbd_8_variance16x16_sse2 =
+    vpx_highbd_8_variance16x16_sse2;
+const VarianceMxNFunc highbd_8_variance16x8_sse2 =
+    vpx_highbd_8_variance16x8_sse2;
+const VarianceMxNFunc highbd_8_variance8x16_sse2 =
+    vpx_highbd_8_variance8x16_sse2;
+const VarianceMxNFunc highbd_8_variance8x8_sse2 =
+    vpx_highbd_8_variance8x8_sse2;
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxHBDVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
+                      make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
+                      make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
+                      make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
+                      make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
+                      make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
+                      make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
+                      make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
+                      make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
+                      make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
+                      make_tuple(6, 6, highbd_8_variance64x64_sse2, 8),
+                      make_tuple(6, 5, highbd_8_variance64x32_sse2, 8),
+                      make_tuple(5, 6, highbd_8_variance32x64_sse2, 8),
+                      make_tuple(5, 5, highbd_8_variance32x32_sse2, 8),
+                      make_tuple(5, 4, highbd_8_variance32x16_sse2, 8),
+                      make_tuple(4, 5, highbd_8_variance16x32_sse2, 8),
+                      make_tuple(4, 4, highbd_8_variance16x16_sse2, 8),
+                      make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
+                      make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
+                      make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSE2
+
+#if CONFIG_VP9_ENCODER
 typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceTest;
 typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
 
-TEST_P(VP9VarianceTest, Zero) { ZeroTest(); }
-TEST_P(VP9VarianceTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceTest, OneQuarter) { OneQuarterTest(); }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-typedef VarianceTest<vp9_variance_fn_t> VP9VarianceHighTest;
 typedef SubpelVarianceTest<vp9_subpixvariance_fn_t> VP9SubpelVarianceHighTest;
 typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
     VP9SubpelAvgVarianceHighTest;
 
-TEST_P(VP9VarianceHighTest, Zero) { ZeroTest(); }
-TEST_P(VP9VarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceHighTest, RefStride) { RefStrideTest(); }
 TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
 TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
 TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9VarianceHighTest, OneQuarter) { OneQuarterTest(); }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-const vp9_variance_fn_t variance4x4_c = vp9_variance4x4_c;
-const vp9_variance_fn_t variance4x8_c = vp9_variance4x8_c;
-const vp9_variance_fn_t variance8x4_c = vp9_variance8x4_c;
-const vp9_variance_fn_t variance8x8_c = vp9_variance8x8_c;
-const vp9_variance_fn_t variance8x16_c = vp9_variance8x16_c;
-const vp9_variance_fn_t variance16x8_c = vp9_variance16x8_c;
-const vp9_variance_fn_t variance16x16_c = vp9_variance16x16_c;
-const vp9_variance_fn_t variance16x32_c = vp9_variance16x32_c;
-const vp9_variance_fn_t variance32x16_c = vp9_variance32x16_c;
-const vp9_variance_fn_t variance32x32_c = vp9_variance32x32_c;
-const vp9_variance_fn_t variance32x64_c = vp9_variance32x64_c;
-const vp9_variance_fn_t variance64x32_c = vp9_variance64x32_c;
-const vp9_variance_fn_t variance64x64_c = vp9_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_c, 0),
-                      make_tuple(2, 3, variance4x8_c, 0),
-                      make_tuple(3, 2, variance8x4_c, 0),
-                      make_tuple(3, 3, variance8x8_c, 0),
-                      make_tuple(3, 4, variance8x16_c, 0),
-                      make_tuple(4, 3, variance16x8_c, 0),
-                      make_tuple(4, 4, variance16x16_c, 0),
-                      make_tuple(4, 5, variance16x32_c, 0),
-                      make_tuple(5, 4, variance32x16_c, 0),
-                      make_tuple(5, 5, variance32x32_c, 0),
-                      make_tuple(5, 6, variance32x64_c, 0),
-                      make_tuple(6, 5, variance64x32_c, 0),
-                      make_tuple(6, 6, variance64x64_c, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const vp9_variance_fn_t highbd_10_variance4x4_c = vp9_highbd_10_variance4x4_c;
-const vp9_variance_fn_t highbd_10_variance4x8_c = vp9_highbd_10_variance4x8_c;
-const vp9_variance_fn_t highbd_10_variance8x4_c = vp9_highbd_10_variance8x4_c;
-const vp9_variance_fn_t highbd_10_variance8x8_c = vp9_highbd_10_variance8x8_c;
-const vp9_variance_fn_t highbd_10_variance8x16_c = vp9_highbd_10_variance8x16_c;
-const vp9_variance_fn_t highbd_10_variance16x8_c = vp9_highbd_10_variance16x8_c;
-const vp9_variance_fn_t highbd_10_variance16x16_c =
-    vp9_highbd_10_variance16x16_c;
-const vp9_variance_fn_t highbd_10_variance16x32_c =
-    vp9_highbd_10_variance16x32_c;
-const vp9_variance_fn_t highbd_10_variance32x16_c =
-    vp9_highbd_10_variance32x16_c;
-const vp9_variance_fn_t highbd_10_variance32x32_c =
-    vp9_highbd_10_variance32x32_c;
-const vp9_variance_fn_t highbd_10_variance32x64_c =
-    vp9_highbd_10_variance32x64_c;
-const vp9_variance_fn_t highbd_10_variance64x32_c =
-    vp9_highbd_10_variance64x32_c;
-const vp9_variance_fn_t highbd_10_variance64x64_c =
-    vp9_highbd_10_variance64x64_c;
-const vp9_variance_fn_t highbd_12_variance4x4_c = vp9_highbd_12_variance4x4_c;
-const vp9_variance_fn_t highbd_12_variance4x8_c = vp9_highbd_12_variance4x8_c;
-const vp9_variance_fn_t highbd_12_variance8x4_c = vp9_highbd_12_variance8x4_c;
-const vp9_variance_fn_t highbd_12_variance8x8_c = vp9_highbd_12_variance8x8_c;
-const vp9_variance_fn_t highbd_12_variance8x16_c = vp9_highbd_12_variance8x16_c;
-const vp9_variance_fn_t highbd_12_variance16x8_c = vp9_highbd_12_variance16x8_c;
-const vp9_variance_fn_t highbd_12_variance16x16_c =
-    vp9_highbd_12_variance16x16_c;
-const vp9_variance_fn_t highbd_12_variance16x32_c =
-    vp9_highbd_12_variance16x32_c;
-const vp9_variance_fn_t highbd_12_variance32x16_c =
-    vp9_highbd_12_variance32x16_c;
-const vp9_variance_fn_t highbd_12_variance32x32_c =
-    vp9_highbd_12_variance32x32_c;
-const vp9_variance_fn_t highbd_12_variance32x64_c =
-    vp9_highbd_12_variance32x64_c;
-const vp9_variance_fn_t highbd_12_variance64x32_c =
-    vp9_highbd_12_variance64x32_c;
-const vp9_variance_fn_t highbd_12_variance64x64_c =
-    vp9_highbd_12_variance64x64_c;
-const vp9_variance_fn_t highbd_variance4x4_c = vp9_highbd_variance4x4_c;
-const vp9_variance_fn_t highbd_variance4x8_c = vp9_highbd_variance4x8_c;
-const vp9_variance_fn_t highbd_variance8x4_c = vp9_highbd_variance8x4_c;
-const vp9_variance_fn_t highbd_variance8x8_c = vp9_highbd_variance8x8_c;
-const vp9_variance_fn_t highbd_variance8x16_c = vp9_highbd_variance8x16_c;
-const vp9_variance_fn_t highbd_variance16x8_c = vp9_highbd_variance16x8_c;
-const vp9_variance_fn_t highbd_variance16x16_c = vp9_highbd_variance16x16_c;
-const vp9_variance_fn_t highbd_variance16x32_c = vp9_highbd_variance16x32_c;
-const vp9_variance_fn_t highbd_variance32x16_c = vp9_highbd_variance32x16_c;
-const vp9_variance_fn_t highbd_variance32x32_c = vp9_highbd_variance32x32_c;
-const vp9_variance_fn_t highbd_variance32x64_c = vp9_highbd_variance32x64_c;
-const vp9_variance_fn_t highbd_variance64x32_c = vp9_highbd_variance64x32_c;
-const vp9_variance_fn_t highbd_variance64x64_c = vp9_highbd_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9VarianceHighTest,
-    ::testing::Values(make_tuple(2, 2, highbd_10_variance4x4_c, 10),
-                      make_tuple(2, 3, highbd_10_variance4x8_c, 10),
-                      make_tuple(3, 2, highbd_10_variance8x4_c, 10),
-                      make_tuple(3, 3, highbd_10_variance8x8_c, 10),
-                      make_tuple(3, 4, highbd_10_variance8x16_c, 10),
-                      make_tuple(4, 3, highbd_10_variance16x8_c, 10),
-                      make_tuple(4, 4, highbd_10_variance16x16_c, 10),
-                      make_tuple(4, 5, highbd_10_variance16x32_c, 10),
-                      make_tuple(5, 4, highbd_10_variance32x16_c, 10),
-                      make_tuple(5, 5, highbd_10_variance32x32_c, 10),
-                      make_tuple(5, 6, highbd_10_variance32x64_c, 10),
-                      make_tuple(6, 5, highbd_10_variance64x32_c, 10),
-                      make_tuple(6, 6, highbd_10_variance64x64_c, 10),
-                      make_tuple(2, 2, highbd_12_variance4x4_c, 12),
-                      make_tuple(2, 3, highbd_12_variance4x8_c, 12),
-                      make_tuple(3, 2, highbd_12_variance8x4_c, 12),
-                      make_tuple(3, 3, highbd_12_variance8x8_c, 12),
-                      make_tuple(3, 4, highbd_12_variance8x16_c, 12),
-                      make_tuple(4, 3, highbd_12_variance16x8_c, 12),
-                      make_tuple(4, 4, highbd_12_variance16x16_c, 12),
-                      make_tuple(4, 5, highbd_12_variance16x32_c, 12),
-                      make_tuple(5, 4, highbd_12_variance32x16_c, 12),
-                      make_tuple(5, 5, highbd_12_variance32x32_c, 12),
-                      make_tuple(5, 6, highbd_12_variance32x64_c, 12),
-                      make_tuple(6, 5, highbd_12_variance64x32_c, 12),
-                      make_tuple(6, 6, highbd_12_variance64x64_c, 12),
-                      make_tuple(2, 2, highbd_variance4x4_c, 8),
-                      make_tuple(2, 3, highbd_variance4x8_c, 8),
-                      make_tuple(3, 2, highbd_variance8x4_c, 8),
-                      make_tuple(3, 3, highbd_variance8x8_c, 8),
-                      make_tuple(3, 4, highbd_variance8x16_c, 8),
-                      make_tuple(4, 3, highbd_variance16x8_c, 8),
-                      make_tuple(4, 4, highbd_variance16x16_c, 8),
-                      make_tuple(4, 5, highbd_variance16x32_c, 8),
-                      make_tuple(5, 4, highbd_variance32x16_c, 8),
-                      make_tuple(5, 5, highbd_variance32x32_c, 8),
-                      make_tuple(5, 6, highbd_variance32x64_c, 8),
-                      make_tuple(6, 5, highbd_variance64x32_c, 8),
-                      make_tuple(6, 6, highbd_variance64x64_c, 8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 const vp9_subpixvariance_fn_t subpel_variance4x4_c =
     vp9_sub_pixel_variance4x4_c;
 const vp9_subpixvariance_fn_t subpel_variance4x8_c =
@@ -1377,40 +1423,11 @@
         make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
         make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9_ENCODER
 
+#if CONFIG_VP9_ENCODER
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
-INSTANTIATE_TEST_CASE_P(SSE2, SumOfSquaresTest,
-                        ::testing::Values(vp9_get_mb_ss_sse2));
-
-const vp9_variance_fn_t variance4x4_sse2 = vp9_variance4x4_sse2;
-const vp9_variance_fn_t variance4x8_sse2 = vp9_variance4x8_sse2;
-const vp9_variance_fn_t variance8x4_sse2 = vp9_variance8x4_sse2;
-const vp9_variance_fn_t variance8x8_sse2 = vp9_variance8x8_sse2;
-const vp9_variance_fn_t variance8x16_sse2 = vp9_variance8x16_sse2;
-const vp9_variance_fn_t variance16x8_sse2 = vp9_variance16x8_sse2;
-const vp9_variance_fn_t variance16x16_sse2 = vp9_variance16x16_sse2;
-const vp9_variance_fn_t variance16x32_sse2 = vp9_variance16x32_sse2;
-const vp9_variance_fn_t variance32x16_sse2 = vp9_variance32x16_sse2;
-const vp9_variance_fn_t variance32x32_sse2 = vp9_variance32x32_sse2;
-const vp9_variance_fn_t variance32x64_sse2 = vp9_variance32x64_sse2;
-const vp9_variance_fn_t variance64x32_sse2 = vp9_variance64x32_sse2;
-const vp9_variance_fn_t variance64x64_sse2 = vp9_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9VarianceTest,
-    ::testing::Values(make_tuple(2, 2, variance4x4_sse2, 0),
-                      make_tuple(2, 3, variance4x8_sse2, 0),
-                      make_tuple(3, 2, variance8x4_sse2, 0),
-                      make_tuple(3, 3, variance8x8_sse2, 0),
-                      make_tuple(3, 4, variance8x16_sse2, 0),
-                      make_tuple(4, 3, variance16x8_sse2, 0),
-                      make_tuple(4, 4, variance16x16_sse2, 0),
-                      make_tuple(4, 5, variance16x32_sse2, 0),
-                      make_tuple(5, 4, variance32x16_sse2, 0),
-                      make_tuple(5, 5, variance32x32_sse2, 0),
-                      make_tuple(5, 6, variance32x64_sse2, 0),
-                      make_tuple(6, 5, variance64x32_sse2, 0),
-                      make_tuple(6, 6, variance64x64_sse2, 0)));
 const vp9_subpixvariance_fn_t subpel_variance4x4_sse =
     vp9_sub_pixel_variance4x4_sse;
 const vp9_subpixvariance_fn_t subpel_variance4x8_sse =
@@ -1494,96 +1511,6 @@
                       make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
                       make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
 #if CONFIG_VP9_HIGHBITDEPTH
-const vp9_variance_fn_t highbd_variance8x8_sse2 = vp9_highbd_variance8x8_sse2;
-const vp9_variance_fn_t highbd_10_variance8x8_sse2 =
-    vp9_highbd_10_variance8x8_sse2;
-const vp9_variance_fn_t highbd_12_variance8x8_sse2 =
-    vp9_highbd_12_variance8x8_sse2;
-const vp9_variance_fn_t highbd_variance8x16_sse2 = vp9_highbd_variance8x16_sse2;
-const vp9_variance_fn_t highbd_10_variance8x16_sse2 =
-    vp9_highbd_10_variance8x16_sse2;
-const vp9_variance_fn_t highbd_12_variance8x16_sse2 =
-    vp9_highbd_12_variance8x16_sse2;
-const vp9_variance_fn_t highbd_variance16x8_sse2 =
-    vp9_highbd_variance16x8_sse2;
-const vp9_variance_fn_t highbd_10_variance16x8_sse2 =
-    vp9_highbd_10_variance16x8_sse2;
-const vp9_variance_fn_t highbd_12_variance16x8_sse2 =
-    vp9_highbd_12_variance16x8_sse2;
-const vp9_variance_fn_t highbd_variance16x16_sse2 =
-    vp9_highbd_variance16x16_sse2;
-const vp9_variance_fn_t highbd_10_variance16x16_sse2 =
-    vp9_highbd_10_variance16x16_sse2;
-const vp9_variance_fn_t highbd_12_variance16x16_sse2 =
-    vp9_highbd_12_variance16x16_sse2;
-const vp9_variance_fn_t highbd_variance16x32_sse2 =
-    vp9_highbd_variance16x32_sse2;
-const vp9_variance_fn_t highbd_10_variance16x32_sse2 =
-    vp9_highbd_10_variance16x32_sse2;
-const vp9_variance_fn_t highbd_12_variance16x32_sse2 =
-    vp9_highbd_12_variance16x32_sse2;
-const vp9_variance_fn_t highbd_variance32x16_sse2 =
-    vp9_highbd_variance32x16_sse2;
-const vp9_variance_fn_t highbd_10_variance32x16_sse2 =
-    vp9_highbd_10_variance32x16_sse2;
-const vp9_variance_fn_t highbd_12_variance32x16_sse2 =
-    vp9_highbd_12_variance32x16_sse2;
-const vp9_variance_fn_t highbd_variance32x32_sse2 =
-    vp9_highbd_variance32x32_sse2;
-const vp9_variance_fn_t highbd_10_variance32x32_sse2 =
-    vp9_highbd_10_variance32x32_sse2;
-const vp9_variance_fn_t highbd_12_variance32x32_sse2 =
-    vp9_highbd_12_variance32x32_sse2;
-const vp9_variance_fn_t highbd_variance32x64_sse2 =
-    vp9_highbd_variance32x64_sse2;
-const vp9_variance_fn_t highbd_10_variance32x64_sse2 =
-    vp9_highbd_10_variance32x64_sse2;
-const vp9_variance_fn_t highbd_12_variance32x64_sse2 =
-    vp9_highbd_12_variance32x64_sse2;
-const vp9_variance_fn_t highbd_variance64x32_sse2 =
-    vp9_highbd_variance64x32_sse2;
-const vp9_variance_fn_t highbd_10_variance64x32_sse2 =
-    vp9_highbd_10_variance64x32_sse2;
-const vp9_variance_fn_t highbd_12_variance64x32_sse2 =
-    vp9_highbd_12_variance64x32_sse2;
-const vp9_variance_fn_t highbd_variance64x64_sse2 =
-    vp9_highbd_variance64x64_sse2;
-const vp9_variance_fn_t highbd_10_variance64x64_sse2 =
-    vp9_highbd_10_variance64x64_sse2;
-const vp9_variance_fn_t highbd_12_variance64x64_sse2 =
-    vp9_highbd_12_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9VarianceHighTest,
-    ::testing::Values(make_tuple(3, 3, highbd_10_variance8x8_sse2, 10),
-                      make_tuple(3, 4, highbd_10_variance8x16_sse2, 10),
-                      make_tuple(4, 3, highbd_10_variance16x8_sse2, 10),
-                      make_tuple(4, 4, highbd_10_variance16x16_sse2, 10),
-                      make_tuple(4, 5, highbd_10_variance16x32_sse2, 10),
-                      make_tuple(5, 4, highbd_10_variance32x16_sse2, 10),
-                      make_tuple(5, 5, highbd_10_variance32x32_sse2, 10),
-                      make_tuple(5, 6, highbd_10_variance32x64_sse2, 10),
-                      make_tuple(6, 5, highbd_10_variance64x32_sse2, 10),
-                      make_tuple(6, 6, highbd_10_variance64x64_sse2, 10),
-                      make_tuple(3, 3, highbd_12_variance8x8_sse2, 12),
-                      make_tuple(3, 4, highbd_12_variance8x16_sse2, 12),
-                      make_tuple(4, 3, highbd_12_variance16x8_sse2, 12),
-                      make_tuple(4, 4, highbd_12_variance16x16_sse2, 12),
-                      make_tuple(4, 5, highbd_12_variance16x32_sse2, 12),
-                      make_tuple(5, 4, highbd_12_variance32x16_sse2, 12),
-                      make_tuple(5, 5, highbd_12_variance32x32_sse2, 12),
-                      make_tuple(5, 6, highbd_12_variance32x64_sse2, 12),
-                      make_tuple(6, 5, highbd_12_variance64x32_sse2, 12),
-                      make_tuple(6, 6, highbd_12_variance64x64_sse2, 12),
-                      make_tuple(3, 3, highbd_variance8x8_sse2, 8),
-                      make_tuple(3, 4, highbd_variance8x16_sse2, 8),
-                      make_tuple(4, 3, highbd_variance16x8_sse2, 8),
-                      make_tuple(4, 4, highbd_variance16x16_sse2, 8),
-                      make_tuple(4, 5, highbd_variance16x32_sse2, 8),
-                      make_tuple(5, 4, highbd_variance32x16_sse2, 8),
-                      make_tuple(5, 5, highbd_variance32x32_sse2, 8),
-                      make_tuple(5, 6, highbd_variance32x64_sse2, 8),
-                      make_tuple(6, 5, highbd_variance64x32_sse2, 8),
-                      make_tuple(6, 6, highbd_variance64x64_sse2, 8)));
 const vp9_subpixvariance_fn_t highbd_subpel_variance8x4_sse2 =
     vp9_highbd_sub_pixel_variance8x4_sse2;
 const vp9_subpixvariance_fn_t highbd_subpel_variance8x8_sse2 =
@@ -1790,6 +1717,9 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_ENCODER
+
+#if CONFIG_VP9_ENCODER
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
 
@@ -1877,22 +1807,27 @@
                       make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
+#endif  // CONFIG_VP9_ENCODER
 
 #if HAVE_AVX2
+const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
+INSTANTIATE_TEST_CASE_P(AVX2, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_avx2)));
 
-const vp9_variance_fn_t variance16x16_avx2 = vp9_variance16x16_avx2;
-const vp9_variance_fn_t variance32x16_avx2 = vp9_variance32x16_avx2;
-const vp9_variance_fn_t variance32x32_avx2 = vp9_variance32x32_avx2;
-const vp9_variance_fn_t variance64x32_avx2 = vp9_variance64x32_avx2;
-const vp9_variance_fn_t variance64x64_avx2 = vp9_variance64x64_avx2;
+const VarianceMxNFunc variance64x64_avx2 = vpx_variance64x64_avx2;
+const VarianceMxNFunc variance64x32_avx2 = vpx_variance64x32_avx2;
+const VarianceMxNFunc variance32x32_avx2 = vpx_variance32x32_avx2;
+const VarianceMxNFunc variance32x16_avx2 = vpx_variance32x16_avx2;
+const VarianceMxNFunc variance16x16_avx2 = vpx_variance16x16_avx2;
 INSTANTIATE_TEST_CASE_P(
-    AVX2, VP9VarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_avx2, 0),
-                      make_tuple(5, 4, variance32x16_avx2, 0),
-                      make_tuple(5, 5, variance32x32_avx2, 0),
+    AVX2, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_avx2, 0),
                       make_tuple(6, 5, variance64x32_avx2, 0),
-                      make_tuple(6, 6, variance64x64_avx2, 0)));
+                      make_tuple(5, 5, variance32x32_avx2, 0),
+                      make_tuple(5, 4, variance32x16_avx2, 0),
+                      make_tuple(4, 4, variance16x16_avx2, 0)));
 
+#if CONFIG_VP9_ENCODER
 const vp9_subpixvariance_fn_t subpel_variance32x32_avx2 =
     vp9_sub_pixel_variance32x32_avx2;
 const vp9_subpixvariance_fn_t subpel_variance64x64_avx2 =
@@ -1910,23 +1845,38 @@
     AVX2, VP9SubpelAvgVarianceTest,
     ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
                       make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
+#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_AVX2
-#if HAVE_NEON
-const vp9_variance_fn_t variance8x8_neon = vp9_variance8x8_neon;
-const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
-const vp9_variance_fn_t variance32x32_neon = vp9_variance32x32_neon;
-const vp9_variance_fn_t variance32x64_neon = vp9_variance32x64_neon;
-const vp9_variance_fn_t variance64x32_neon = vp9_variance64x32_neon;
-const vp9_variance_fn_t variance64x64_neon = vp9_variance64x64_neon;
-INSTANTIATE_TEST_CASE_P(
-    NEON, VP9VarianceTest,
-    ::testing::Values(make_tuple(3, 3, variance8x8_neon, 0),
-                      make_tuple(4, 4, variance16x16_neon, 0),
-                      make_tuple(5, 5, variance32x32_neon, 0),
-                      make_tuple(5, 6, variance32x64_neon, 0),
-                      make_tuple(6, 5, variance64x32_neon, 0),
-                      make_tuple(6, 6, variance64x64_neon, 0)));
 
+#if HAVE_NEON
+const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
+INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest,
+                        ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
+
+const VarianceMxNFunc mse16x16_neon = vpx_mse16x16_neon;
+INSTANTIATE_TEST_CASE_P(NEON, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
+
+const VarianceMxNFunc variance64x64_neon = vpx_variance64x64_neon;
+const VarianceMxNFunc variance64x32_neon = vpx_variance64x32_neon;
+const VarianceMxNFunc variance32x64_neon = vpx_variance32x64_neon;
+const VarianceMxNFunc variance32x32_neon = vpx_variance32x32_neon;
+const VarianceMxNFunc variance16x16_neon = vpx_variance16x16_neon;
+const VarianceMxNFunc variance16x8_neon = vpx_variance16x8_neon;
+const VarianceMxNFunc variance8x16_neon = vpx_variance8x16_neon;
+const VarianceMxNFunc variance8x8_neon = vpx_variance8x8_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VpxVarianceTest,
+    ::testing::Values(make_tuple(6, 6, variance64x64_neon, 0),
+                      make_tuple(6, 5, variance64x32_neon, 0),
+                      make_tuple(5, 6, variance32x64_neon, 0),
+                      make_tuple(5, 5, variance32x32_neon, 0),
+                      make_tuple(4, 4, variance16x16_neon, 0),
+                      make_tuple(4, 3, variance16x8_neon, 0),
+                      make_tuple(3, 4, variance8x16_neon, 0),
+                      make_tuple(3, 3, variance8x8_neon, 0)));
+
+#if CONFIG_VP9_ENCODER
 const vp9_subpixvariance_fn_t subpel_variance8x8_neon =
     vp9_sub_pixel_variance8x8_neon;
 const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
@@ -1941,8 +1891,19 @@
                       make_tuple(4, 4, subpel_variance16x16_neon, 0),
                       make_tuple(5, 5, subpel_variance32x32_neon, 0),
                       make_tuple(6, 6, subpel_variance64x64_neon, 0)));
-#endif  // HAVE_NEON
 #endif  // CONFIG_VP9_ENCODER
+#endif  // HAVE_NEON
 
-}  // namespace vp9
+#if HAVE_MEDIA
+const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+
+const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
+const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+    MEDIA, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
+                      make_tuple(3, 3, variance8x8_media, 0)));
+#endif  // HAVE_MEDIA
 }  // namespace
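
The parameterized tests above encode each case as (log2 width, log2 height, function, bit depth): the first two fields give the block size as a power of two, and the last selects the pixel path (0 for the plain 8-bit functions; the high-bit-depth lists pass 8, 10, or 12). A minimal sketch of how one of the NEON tuples above decodes; the decoding code itself is illustrative, not part of the test harness:

    #include <cstdio>

    int main() {
      // make_tuple(6, 5, variance64x32_neon, 0) from the NEON list above:
      const int log2w = 6, log2h = 5, bit_depth = 0;
      const int width = 1 << log2w;   // 64
      const int height = 1 << log2h;  // 32
      std::printf("block %dx%d, %s path\n", width, height,
                  bit_depth ? "high-bit-depth" : "8-bit");
      return 0;
    }
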
diff --git a/test/vp9_error_block_test.cc b/test/vp9_error_block_test.cc
index d7ba1b0..ac19c2e 100644
--- a/test/vp9_error_block_test.cc
+++ b/test/vp9_error_block_test.cc
@@ -21,6 +21,7 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 2d91046..943c00b 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -21,6 +21,8 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_scan.h"
+#include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 
 using libvpx_test::ACMRandom;
diff --git a/tools_common.h b/tools_common.h
index a87e814..aa7f025 100644
--- a/tools_common.h
+++ b/tools_common.h
@@ -16,6 +16,7 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_image.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"
 
 #if CONFIG_ENCODERS
 #include "./y4minput.h"
@@ -34,7 +35,6 @@
 #if CONFIG_OS_SUPPORT
 #if defined(_MSC_VER)
 #include <io.h>  /* NOLINT */
-#define snprintf _snprintf
 #define isatty   _isatty
 #define fileno   _fileno
 #else
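
With the per-file snprintf define removed above, the MSVC shim is centralized in vpx_ports/msvc.h. A minimal sketch of what such a compatibility header plausibly contains; the guard name and version cutoff are assumptions for illustration, not the project's actual file:

    /* Hypothetical sketch of an msvc.h-style shim; details are assumed. */
    #ifndef VPX_PORTS_MSVC_H_
    #define VPX_PORTS_MSVC_H_
    #ifdef _MSC_VER
    #if _MSC_VER < 1900  /* VS2015 was the first MSVC with a conforming snprintf */
    #define snprintf _snprintf
    #endif
    #endif
    #endif  /* VPX_PORTS_MSVC_H_ */
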
diff --git a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm b/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
deleted file mode 100644
index 3991957..0000000
--- a/vp8/common/arm/armv6/vp8_variance16x16_armv6.asm
+++ /dev/null
@@ -1,154 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance16x16_armv6|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance16x16_armv6| PROC
-
-    stmfd   sp!, {r4-r12, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r8, #0              ; initialize sum = 0
-    mov     r11, #0             ; initialize sse = 0
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-
-loop
-    ; 1st 4 pixels
-    ldr     r4, [r0, #0]        ; load 4 src pixels
-    ldr     r5, [r2, #0]        ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r6, r4, r5          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-    ; calculate total sum
-    adds    r8, r8, r4          ; add positive differences to sum
-    subs    r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r4, [r0, #4]        ; load 4 src pixels
-    ldr     r5, [r2, #4]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r4, [r0, #8]        ; load 4 src pixels
-    ldr     r5, [r2, #8]        ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r4, [r0, #12]       ; load 4 src pixels
-    ldr     r5, [r2, #12]       ; load 4 ref pixels
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-    usub8   r6, r4, r5          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r6, lr          ; select bytes with positive difference
-    usub8   r9, r5, r4          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r6, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r4, r7, lr          ; calculate sum of positive differences
-    usad8   r5, r6, lr          ; calculate sum of negative differences
-    orr     r6, r6, r7          ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r8, r8, r4          ; add positive differences to sum
-    sub     r8, r8, r5          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r5, r6              ; byte (two pixels) to halfwords
-    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
-    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
-    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
-
-
-    subs    r12, r12, #1
-
-    bne     loop
-
-    ; return stuff
-    ldr     r6, [sp, #40]       ; get address of sse
-    mul     r0, r8, r8          ; sum * sum
-    str     r11, [r6]           ; store sse
-    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
-
-    ldmfd   sp!, {r4-r12, pc}
-
-    ENDP
-
-    END
-
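
The epilogue above is the standard variance identity, variance = SSE - sum^2/N, with N = 256 pixels for a 16x16 block, hence the `lsr #8`. A scalar C++ model of the same computation (the name and layout are illustrative):

    #include <cstdint>

    // Scalar model of the 16x16 epilogue above: sse - sum*sum/256.
    unsigned int variance16x16_model(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
      int sum = 0;          // running sum of differences (r8 above)
      unsigned int sq = 0;  // running sum of squares (r11 above)
      for (int i = 0; i < 16; ++i) {
        for (int j = 0; j < 16; ++j) {
          const int diff = src[j] - ref[j];
          sum += diff;
          sq += diff * diff;
        }
        src += src_stride;
        ref += ref_stride;
      }
      *sse = sq;
      return sq - (((unsigned int)sum * sum) >> 8);  // >> 8 == / 256 pixels
    }
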
diff --git a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm b/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
deleted file mode 100644
index 915ee49..0000000
--- a/vp8/common/arm/armv6/vp8_variance8x8_armv6.asm
+++ /dev/null
@@ -1,101 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_variance8x8_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-|vp8_variance8x8_armv6| PROC
-
-    push    {r4-r10, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #8             ; set loop counter to 8 (=block height)
-    mov     r4, #0              ; initialize sum = 0
-    mov     r5, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r6, [r0, #0x0]      ; load 4 src pixels
-    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r6, r7          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-    ; calculate total sum
-    add    r4, r4, r6           ; add positive differences to sum
-    sub    r4, r4, r7           ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r0, #0x4]      ; load 4 src pixels
-    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r6, r7          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r10, r8, lr         ; select bytes with positive difference
-    usub8   r9, r7, r6          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r6, r10, lr         ; calculate sum of positive differences
-    usad8   r7, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r10         ; differences of all 4 pixels
-
-    ; calculate total sum
-    add     r4, r4, r6          ; add positive differences to sum
-    sub     r4, r4, r7          ; subtract negative differences from sum
-
-    ; calculate sse
-    uxtb16  r7, r8              ; byte (two pixels) to halfwords
-    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
-    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
-    subs    r12, r12, #1        ; next row
-    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r8, [sp, #32]       ; get address of sse
-    mul     r1, r4, r4          ; sum * sum
-    str     r5, [r8]            ; store sse
-    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
-
-    pop     {r4-r10, pc}
-
-    ENDP
-
-    END
diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c
deleted file mode 100644
index 1b19790..0000000
--- a/vp8/common/arm/neon/variance_neon.c
+++ /dev/null
@@ -1,320 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-
-unsigned int vp8_variance16x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance16x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 4; i++) {  // variance16x8_neon_loop
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d2u8, d4u8, d6u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {  // variance8x16_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        __builtin_prefetch(src_ptr);
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        __builtin_prefetch(ref_ptr);
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d2u8, d6u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance8x8_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 2; i++) {  // variance8x8_neon_loop
-        d0u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d1u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d2u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-        d3u8 = vld1_u8(src_ptr);
-        src_ptr += source_stride;
-
-        d4u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d5u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d6u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        d7u8 = vld1_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q11u16 = vsubl_u8(d0u8, d4u8);
-        q12u16 = vsubl_u8(d1u8, d5u8);
-        q13u16 = vsubl_u8(d2u8, d6u8);
-        q14u16 = vsubl_u8(d3u8, d7u8);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
-    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 6);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
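
The deleted kernels above all follow one pattern: widening subtracts produce 16-bit differences, vpadalq_s16 folds them into a 4-lane running sum while vmlal_s16 accumulates the squares, and the epilogue reduces the lanes pairwise (vpaddlq_s32, then vadd_s64) before the usual sse - sum^2/N step. A tiny scalar model of that lane reduction, with made-up lane values:

    #include <cstdint>

    int main() {
      // Four 32-bit lanes of a running-sum accumulator, as in q8s32 above.
      const int32_t lanes[4] = {10, -3, 7, 2};
      // vpaddlq_s32: widening pairwise add into two 64-bit halves.
      const int64_t lo = (int64_t)lanes[0] + lanes[1];
      const int64_t hi = (int64_t)lanes[2] + lanes[3];
      // vadd_s64: combine the halves into the final scalar sum.
      const int64_t sum = lo + hi;
      return sum == 16 ? 0 : 1;
    }
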
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
index 467a509..0f293f0 100644
--- a/vp8/common/arm/variance_arm.c
+++ b/vp8/common/arm/variance_arm.c
@@ -9,10 +9,14 @@
  */
 
 #include "vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp8/common/variance.h"
 #include "vp8/common/filter.h"
 
+// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
+#if CONFIG_VP8_ENCODER
+
 #if HAVE_MEDIA
 #include "vp8/common/arm/bilinearfilter_arm.h"
 
@@ -40,8 +44,8 @@
     vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                              8, 8, 8, VFilter);
 
-    return vp8_variance8x8_armv6(second_pass, 8, dst_ptr,
-                                   dst_pixels_per_line, sse);
+    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+                                 dst_pixels_per_line, sse);
 }
 
 unsigned int vp8_sub_pixel_variance16x16_armv6
@@ -86,13 +90,13 @@
         vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
                                                  16, 16, 16, VFilter);
 
-        var = vp8_variance16x16_armv6(second_pass, 16, dst_ptr,
-                                       dst_pixels_per_line, sse);
+        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+                                      dst_pixels_per_line, sse);
     }
     return var;
 }
 
-#endif /* HAVE_MEDIA */
+#endif  // HAVE_MEDIA
 
 
 #if HAVE_NEON
@@ -129,4 +133,5 @@
     return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
 }
 
-#endif
+#endif  // HAVE_NEON
+#endif  // CONFIG_VP8_ENCODER
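
The wrappers above show the shape shared by every sub-pixel variance routine: a horizontal bilinear pass, a vertical bilinear pass, then whole-pixel variance on the filtered block, which is why only the final variance call needed to be repointed at the media/vpx_dsp versions. A self-contained model of the 8x8 case; the 2-tap weights and rounding are simplified assumptions:

    #include <cstdint>

    // Whole-pixel 8x8 variance (64 pixels => shift by 6); illustrative.
    static unsigned int variance8x8_model(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          unsigned int *sse) {
      int sum = 0;
      unsigned int sq = 0;
      for (int i = 0; i < 8; ++i) {
        for (int j = 0; j < 8; ++j) {
          const int d = a[j] - b[j];
          sum += d;
          sq += d * d;
        }
        a += a_stride;
        b += b_stride;
      }
      *sse = sq;
      return sq - (((unsigned int)sum * sum) >> 6);
    }

    // Two bilinear passes feeding the variance call, mirroring the armv6
    // wrapper above. f0 and f1 are 2-tap weights summing to 128.
    unsigned int subpel_variance8x8_model(const uint8_t *src, int src_stride,
                                          int f0, int f1, const uint8_t *dst,
                                          int dst_stride, unsigned int *sse) {
      uint8_t hpass[9 * 8];  // 9 rows so the vertical pass sees rows i, i+1
      for (int i = 0; i < 9; ++i)
        for (int j = 0; j < 8; ++j)
          hpass[i * 8 + j] = (src[i * src_stride + j] * f0 +
                              src[i * src_stride + j + 1] * f1 + 64) >> 7;
      uint8_t vpass[8 * 8];
      for (int i = 0; i < 8; ++i)
        for (int j = 0; j < 8; ++j)
          vpass[i * 8 + j] = (hpass[i * 8 + j] * f0 +
                              hpass[(i + 1) * 8 + j] * f1 + 64) >> 7;
      return variance8x8_model(vpass, 8, dst, dst_stride, sse);
    }
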
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index d12dea1..5c0680f 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -151,14 +151,14 @@
 
     if (blksize == 16)
     {
-        actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        actd = (vpx_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        act = (vpx_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
 #ifdef USE_SSD
-        vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance16x16(y, y_stride, yd, yd_stride, &sse);
         sad = (sse + 128)>>8;
-        vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
         usad = (sse + 32)>>6;
-        vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
         vsad = (sse + 32)>>6;
 #else
         sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
@@ -168,14 +168,14 @@
     }
     else /* if (blksize == 8) */
     {
-        actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        actd = (vpx_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        act = (vpx_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
 #ifdef USE_SSD
-        vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
+        vpx_variance8x8(y, y_stride, yd, yd_stride, &sse);
         sad = (sse + 32)>>6;
-        vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
+        vpx_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
         usad = (sse + 8)>>4;
-        vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
+        vpx_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
         vsad = (sse + 8)>>4;
 #else
         sad = (vpx_sad8x8(y, y_stride, yd, yd_stride) + 32) >> 6;
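
The arithmetic in the hunks above is round-to-nearest averaging by the block's pixel count: adding half the divisor before shifting, so (sse + 128) >> 8 averages over a 16x16 block (256 pixels), (sse + 32) >> 6 over an 8x8, and (sse + 8) >> 4 over a 4x4. A quick check of the rounding behaviour:

    #include <cassert>

    int main() {
      const unsigned int sse = 511;     // example per-block squared error
      assert(((sse + 128) >> 8) == 2);  // 511/256 = 1.996..., rounds up to 2
      assert((sse >> 8) == 1);          // a plain shift would truncate to 1
      return 0;
    }
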
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index c9f14d5..4b82033 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -237,31 +237,6 @@
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
 
 #
-# Whole-pixel Variance
-#
-add_proto qw/unsigned int vp8_variance4x4/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance4x4 mmx sse2/;
-$vp8_variance4x4_sse2=vp8_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_variance8x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x8 mmx sse2 media neon/;
-$vp8_variance8x8_sse2=vp8_variance8x8_wmt;
-$vp8_variance8x8_media=vp8_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_variance8x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance8x16 mmx sse2 neon/;
-$vp8_variance8x16_sse2=vp8_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_variance16x8/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x8 mmx sse2 neon/;
-$vp8_variance16x8_sse2=vp8_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_variance16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance16x16 mmx sse2 media neon/;
-$vp8_variance16x16_sse2=vp8_variance16x16_wmt;
-$vp8_variance16x16_media=vp8_variance16x16_armv6;
-
-#
 # Sub-pixel Variance
 #
 add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
@@ -309,26 +284,12 @@
 if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
 
 #
-# Sum of squares (vector)
-#
-add_proto qw/unsigned int vp8_get_mb_ss/, "const short *";
-specialize qw/vp8_get_mb_ss mmx sse2/;
-
-#
 # SSE (Sum Squared Error)
 #
 add_proto qw/unsigned int vp8_sub_pixel_mse16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
 specialize qw/vp8_sub_pixel_mse16x16 mmx sse2/;
 $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
 
-add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon/;
-$vp8_mse16x16_sse2=vp8_mse16x16_wmt;
-$vp8_mse16x16_media=vp8_mse16x16_armv6;
-
-add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon/;
-
 #
 # Block copy
 #
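
rtcd_defs.pl feeds libvpx's run-time CPU dispatch generator: add_proto declares a function's signature, specialize lists its optimized variants, and the build emits a header of per-function pointers bound to the best available variant at startup. Deleting the vp8 variance entries here removes them from that table so callers go through the shared vpx_dsp dispatch instead. A hand-written C++ model of the kind of dispatch the generated header provides; names and structure are illustrative, not the generated code:

    #include <cstdint>

    typedef unsigned int (*variance_fn)(const uint8_t *, int, const uint8_t *,
                                        int, unsigned int *);

    static unsigned int variance16x16_c(const uint8_t *, int, const uint8_t *,
                                        int, unsigned int *sse) {
      *sse = 0;  // stub standing in for the real C implementation
      return 0;
    }

    // One pointer per dispatched function, defaulting to the C version;
    // setup repoints it at an SSE2/NEON variant after probing the CPU.
    variance_fn vpx_variance16x16 = variance16x16_c;

    void rtcd_setup(bool have_simd, variance_fn simd_impl) {
      if (have_simd) vpx_variance16x16 = simd_impl;
    }
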
diff --git a/vp8/common/variance.h b/vp8/common/variance.h
index b62cc61..c6c9f41 100644
--- a/vp8/common/variance.h
+++ b/vp8/common/variance.h
@@ -39,6 +39,7 @@
     const unsigned char *ref_array,
     int  ref_stride,
     unsigned int *sad_array);
+
 typedef void (*vpx_sad_multi_d_fn_t)
     (
      const unsigned char *src_ptr,
@@ -48,7 +49,7 @@
      unsigned int *sad_array
     );
 
-typedef unsigned int (*vp8_variance_fn_t)
+typedef unsigned int (*vpx_variance_fn_t)
     (
      const unsigned char *src_ptr,
      int source_stride,
@@ -68,37 +69,14 @@
       unsigned int *sse
     );
 
-typedef void (*vp8_ssimpf_fn_t)
-      (
-        unsigned char *s,
-        int sp,
-        unsigned char *r,
-        int rp,
-        unsigned long *sum_s,
-        unsigned long *sum_r,
-        unsigned long *sum_sq_s,
-        unsigned long *sum_sq_r,
-        unsigned long *sum_sxr
-      );
-
-typedef unsigned int (*vp8_getmbss_fn_t)(const short *);
-
-typedef unsigned int (*vp8_get16x16prederror_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char *ref_ptr,
-     int  ref_stride
-    );
-
 typedef struct variance_vtable
 {
     vpx_sad_fn_t            sdf;
-    vp8_variance_fn_t       vf;
+    vpx_variance_fn_t       vf;
     vp8_subpixvariance_fn_t svf;
-    vp8_variance_fn_t       svf_halfpix_h;
-    vp8_variance_fn_t       svf_halfpix_v;
-    vp8_variance_fn_t       svf_halfpix_hv;
+    vpx_variance_fn_t       svf_halfpix_h;
+    vpx_variance_fn_t       svf_halfpix_v;
+    vpx_variance_fn_t       svf_halfpix_hv;
     vpx_sad_multi_fn_t      sdx3f;
     vpx_sad_multi_fn_t      sdx8f;
     vpx_sad_multi_d_fn_t    sdx4df;
diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c
index dc95bfe..79d1ca0 100644
--- a/vp8/common/variance_c.c
+++ b/vp8/common/variance_c.c
@@ -8,44 +8,34 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "./vp8_rtcd.h"
 #include "filter.h"
 #include "variance.h"
 
-
-unsigned int vp8_get_mb_ss_c
-(
-    const short *src_ptr
-)
-{
-    unsigned int i = 0, sum = 0;
-
-    do
-    {
-        sum += (src_ptr[i] * src_ptr[i]);
-        i++;
-    }
-    while (i < 256);
-
-    return sum;
+/* ctz = count trailing zeros; for the power-of-two block dimensions used
+ * here it is equivalent to log2(), but it is not valid for general input. */
+static int ctz(int a) {
+  int b = 0;
+  while (a != 1) {
+    a >>= 1;
+    b++;
+  }
+  return b;
 }
 
-
-static void variance(
+static unsigned int variance(
     const unsigned char *src_ptr,
     int  source_stride,
     const unsigned char *ref_ptr,
     int  recon_stride,
     int  w,
     int  h,
-    unsigned int *sse,
-    int *sum)
+    unsigned int *sse)
 {
     int i, j;
-    int diff;
+    int diff, sum;
 
-    *sum = 0;
+    sum = 0;
     *sse = 0;
 
     for (i = 0; i < h; i++)
@@ -53,114 +43,17 @@
         for (j = 0; j < w; j++)
         {
             diff = src_ptr[j] - ref_ptr[j];
-            *sum += diff;
+            sum += diff;
             *sse += diff * diff;
         }
 
         src_ptr += source_stride;
         ref_ptr += recon_stride;
     }
+
+    return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
 }
 
-
-unsigned int vp8_variance16x16_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance8x16_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 16, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-unsigned int vp8_variance16x8_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-}
-
-
-unsigned int vp8_variance8x8_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-}
-
-unsigned int vp8_variance4x4_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 4, 4, &var, &avg);
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-}
-
-
-unsigned int vp8_mse16x16_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, &var, &avg);
-    *sse = var;
-    return var;
-}
-
-
 /****************************************************************************
  *
  *  ROUTINE       : filter_block2d_bil_first_pass
@@ -304,7 +197,7 @@
     /* Now filter Vertically */
     var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
 
-    return vp8_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
 }
 
 
@@ -329,7 +222,7 @@
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
 
-    return vp8_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
 }
 
 unsigned int vp8_sub_pixel_variance16x16_c
@@ -353,7 +246,7 @@
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
 
-    return vp8_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
 }
 
 
@@ -429,7 +322,7 @@
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
 
-    return vp8_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
 }
 
 unsigned int vp8_sub_pixel_variance8x16_c
@@ -455,5 +348,5 @@
     var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
     var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
 
-    return vp8_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
+    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
 }
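All five sub-pixel variants in this file share the same two-pass shape: a horizontal bilinear pass into FData3, a vertical pass into temp2, then a plain variance against the reference. The vertical pass is not shown in this hunk; a sketch of what it does, assuming the parameter order of the calls above and the 7-bit filter shift defined in the MMX file below:

    /* Sketch of the second (vertical) bilinear pass assumed by the calls
     * above: pixel_step is the distance to the sample one row below. */
    static void var_filter_block2d_bil_second_pass(
        const unsigned short *src_ptr, unsigned char *output_ptr,
        unsigned int src_pixels_per_line, unsigned int pixel_step,
        unsigned int output_height, unsigned int output_width,
        const short *vp8_filter)
    {
        unsigned int i, j;

        for (i = 0; i < output_height; i++)
        {
            for (j = 0; j < output_width; j++)
            {
                /* weighted average of a sample and the one below it */
                const int temp = src_ptr[0] * vp8_filter[0] +
                                 src_ptr[pixel_step] * vp8_filter[1] +
                                 (1 << 6);    /* rounding, assuming 7-bit taps */
                output_ptr[j] = (unsigned char)(temp >> 7);
                src_ptr++;
            }

            src_ptr += src_pixels_per_line - output_width;
            output_ptr += output_width;
        }
    }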
diff --git a/vp8/common/x86/variance_impl_mmx.asm b/vp8/common/x86/variance_impl_mmx.asm
index 7d5e681..97f2527 100644
--- a/vp8/common/x86/variance_impl_mmx.asm
+++ b/vp8/common/x86/variance_impl_mmx.asm
@@ -11,504 +11,6 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
-global sym(vp8_get_mb_ss_mmx) PRIVATE
-sym(vp8_get_mb_ss_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 8
-    ; end prolog
-
-        mov         rax, arg(0) ;src_ptr
-        mov         rcx, 16
-        pxor        mm4, mm4
-
-.NEXTROW:
-        movq        mm0, [rax]
-        movq        mm1, [rax+8]
-        movq        mm2, [rax+16]
-        movq        mm3, [rax+24]
-        pmaddwd     mm0, mm0
-        pmaddwd     mm1, mm1
-        pmaddwd     mm2, mm2
-        pmaddwd     mm3, mm3
-
-        paddd       mm4, mm0
-        paddd       mm4, mm1
-        paddd       mm4, mm2
-        paddd       mm4, mm3
-
-        add         rax, 32
-        dec         rcx
-        ja          .NEXTROW
-        movq        QWORD PTR [rsp], mm4
-
-        ;return sum[0]+sum[1];
-        movsxd      rax, dword ptr [rsp]
-        movsxd      rcx, dword ptr [rsp+4]
-        add         rax, rcx
-
-
-    ; begin epilog
-    add rsp, 8
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_get8x8var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp8_get8x8var_mmx) PRIVATE
-sym(vp8_get8x8var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-
-        ; Row 2
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 3
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 4
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 5
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        ;              movq        mm4, [rbx + rdx]
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 6
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 7
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Row 8
-        movq        mm0, [rax]                  ; Copy eight bytes to mm0
-        movq        mm2, mm0                    ; Take copies
-        movq        mm3, mm1                    ; Take copies
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        punpckhbw   mm2, mm6                    ; unpack to higher precision
-        punpckhbw   mm3, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        psubsw      mm2, mm3                    ; A-B (high order) to MM2
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        paddw       mm5, mm2                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        pmaddwd     mm2, mm2                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        paddd       mm7, mm0                    ; accumulate in mm7
-        paddd       mm7, mm2                    ; accumulate in mm7
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp8_get4x4var_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride,
-;    unsigned int *SSE,
-;    int *Sum
-;)
-global sym(vp8_get4x4var_mmx) PRIVATE
-sym(vp8_get4x4var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    push rsi
-    push rdi
-    push rbx
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor        mm5, mm5                    ; Blank mm5
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        paddw       mm5, mm0                    ; accumulate differences in mm5
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-
-        ; Now accumulate the final results.
-        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
-        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
-        movsx       rdx, WORD PTR [rsp+8]
-        movsx       rcx, WORD PTR [rsp+10]
-        movsx       rbx, WORD PTR [rsp+12]
-        movsx       rax, WORD PTR [rsp+14]
-        add         rdx, rcx
-        add         rbx, rax
-        add         rdx, rbx    ;XSum
-        movsxd      rax, DWORD PTR [rsp]
-        movsxd      rcx, DWORD PTR [rsp+4]
-        add         rax, rcx    ;XXSum
-        mov         rsi, arg(4) ;SSE
-        mov         rdi, arg(5) ;Sum
-        mov         dword ptr [rsi], eax
-        mov         dword ptr [rdi], edx
-        xor         rax, rax    ; return 0
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-;unsigned int
-;vp8_get4x4sse_cs_mmx
-;(
-;    unsigned char *src_ptr,
-;    int  source_stride,
-;    unsigned char *ref_ptr,
-;    int  recon_stride
-;)
-global sym(vp8_get4x4sse_cs_mmx) PRIVATE
-sym(vp8_get4x4sse_cs_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-
-        pxor        mm6, mm6                    ; Blank mm6
-        pxor        mm7, mm7                    ; Blank mm7
-
-        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
-        mov         rbx, arg(2) ;[ref_ptr]
-        movsxd      rcx, dword ptr arg(1) ;[source_stride]
-        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
-        ; Row 1
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 2
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 3
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm1, mm6
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        add         rbx,rdx                     ; Inc pointer into ref data
-        add         rax,rcx                     ; Inc pointer into the new data
-        movd        mm1, [rbx]                  ; Copy four bytes to mm1
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        ; Row 4
-        movd        mm0, [rax]                  ; Copy four bytes to mm0
-        punpcklbw   mm0, mm6                    ; unpack to higher precision
-        punpcklbw   mm1, mm6
-        psubsw      mm0, mm1                    ; A-B (low order) to MM0
-        pmaddwd     mm0, mm0                    ; square and accumulate
-        paddd       mm7, mm0                    ; accumulate in mm7
-
-        movq        mm0,    mm7                 ;
-        psrlq       mm7,    32
-
-        paddd       mm0,    mm7
-        movq        rax,    mm0
-
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
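For reference, vp8_get_mb_ss_mmx above is just a sum of squares over the 256 short residuals of a 16x16 macroblock; vpx_dsp now supplies the replacement (vpx_get_mb_ss). A scalar sketch, under an illustrative name:

    /* Scalar equivalent of the deleted vp8_get_mb_ss_mmx (sketch only).
     * The inputs are pixel-difference residuals, so the running total
     * stays well within unsigned int range. */
    unsigned int get_mb_ss_ref(const short *src_ptr)
    {
        unsigned int i, sum = 0;

        for (i = 0; i < 256; i++)
        {
            sum += (unsigned int)(src_ptr[i] * src_ptr[i]);
        }

        return sum;
    }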
 %define mmx_filter_shift            7
 
 ;void vp8_filter_block2d_bil4x4_var_mmx
diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm
index 761433c..26de5e8 100644
--- a/vp8/common/x86/variance_impl_sse2.asm
+++ b/vp8/common/x86/variance_impl_sse2.asm
@@ -13,393 +13,6 @@
 
 %define xmm_filter_shift            7
 
-;unsigned int vp8_get_mb_ss_sse2
-;(
-;    short *src_ptr
-;)
-global sym(vp8_get_mb_ss_sse2) PRIVATE
-sym(vp8_get_mb_ss_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 1
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        mov         rax, arg(0) ;[src_ptr]
-        mov         rcx, 8
-        pxor        xmm4, xmm4
-
-.NEXTROW:
-        movdqa      xmm0, [rax]
-        movdqa      xmm1, [rax+16]
-        movdqa      xmm2, [rax+32]
-        movdqa      xmm3, [rax+48]
-        pmaddwd     xmm0, xmm0
-        pmaddwd     xmm1, xmm1
-        pmaddwd     xmm2, xmm2
-        pmaddwd     xmm3, xmm3
-
-        paddd       xmm0, xmm1
-        paddd       xmm2, xmm3
-        paddd       xmm4, xmm0
-        paddd       xmm4, xmm2
-
-        add         rax, 0x40
-        dec         rcx
-        ja          .NEXTROW
-
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,8
-        paddd       xmm4,xmm3
-        movdqa      xmm3,xmm4
-        psrldq      xmm4,4
-        paddd       xmm4,xmm3
-        movq        rax,xmm4
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;unsigned int vp8_get16x16var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp8_get16x16var_sse2) PRIVATE
-sym(vp8_get16x16var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    push rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        ; Prefetch data
-        lea             rcx,    [rax+rax*2]
-        prefetcht0      [rsi]
-        prefetcht0      [rsi+rax]
-        prefetcht0      [rsi+rax*2]
-        prefetcht0      [rsi+rcx]
-        lea             rbx,    [rsi+rax*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rax]
-        prefetcht0      [rbx+rax*2]
-        prefetcht0      [rbx+rcx]
-
-        lea             rcx,    [rdx+rdx*2]
-        prefetcht0      [rdi]
-        prefetcht0      [rdi+rdx]
-        prefetcht0      [rdi+rdx*2]
-        prefetcht0      [rdi+rcx]
-        lea             rbx,    [rdi+rdx*4]
-        prefetcht0      [rbx]
-        prefetcht0      [rbx+rdx]
-        prefetcht0      [rbx+rdx*2]
-        prefetcht0      [rbx+rcx]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        pxor        xmm6,           xmm6                        ; clear xmm6 for accumulating sse
-        mov         rcx,            16
-
-.var16loop:
-        movdqu      xmm1,           XMMWORD PTR [rsi]
-        movdqu      xmm2,           XMMWORD PTR [rdi]
-
-        prefetcht0      [rsi+rax*8]
-        prefetcht0      [rdi+rdx*8]
-
-        movdqa      xmm3,           xmm1
-        movdqa      xmm4,           xmm2
-
-
-        punpcklbw   xmm1,           xmm0
-        punpckhbw   xmm3,           xmm0
-
-        punpcklbw   xmm2,           xmm0
-        punpckhbw   xmm4,           xmm0
-
-
-        psubw       xmm1,           xmm2
-        psubw       xmm3,           xmm4
-
-        paddw       xmm7,           xmm1
-        pmaddwd     xmm1,           xmm1
-
-        paddw       xmm7,           xmm3
-        pmaddwd     xmm3,           xmm3
-
-        paddd       xmm6,           xmm1
-        paddd       xmm6,           xmm3
-
-        add         rsi,            rax
-        add         rdi,            rdx
-
-        sub         rcx,            1
-        jnz         .var16loop
-
-
-        movdqa      xmm1,           xmm6
-        pxor        xmm6,           xmm6
-
-        pxor        xmm5,           xmm5
-        punpcklwd   xmm6,           xmm7
-
-        punpckhwd   xmm5,           xmm7
-        psrad       xmm5,           16
-
-        psrad       xmm6,           16
-        paddd       xmm6,           xmm5
-
-        movdqa      xmm2,           xmm1
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddd       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddd       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movd DWORD PTR [rax],       xmm7
-        movd DWORD PTR [rdi],       xmm1
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    pop rbx
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;unsigned int vp8_get8x8var_sse2
-;(
-;    unsigned char   *  src_ptr,
-;    int             source_stride,
-;    unsigned char   *  ref_ptr,
-;    int             recon_stride,
-;    unsigned int    *  SSE,
-;    int             *  Sum
-;)
-global sym(vp8_get8x8var_sse2) PRIVATE
-sym(vp8_get8x8var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 6
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        mov         rsi,            arg(0) ;[src_ptr]
-        mov         rdi,            arg(2) ;[ref_ptr]
-
-        movsxd      rax,            DWORD PTR arg(1) ;[source_stride]
-        movsxd      rdx,            DWORD PTR arg(3) ;[recon_stride]
-
-        pxor        xmm0,           xmm0                        ; clear xmm0 for unpack
-        pxor        xmm7,           xmm7                        ; clear xmm7 for accumulating diffs
-
-        movq        xmm1,           QWORD PTR [rsi]
-        movq        xmm2,           QWORD PTR [rdi]
-
-        punpcklbw   xmm1,           xmm0
-        punpcklbw   xmm2,           xmm0
-
-        psubsw      xmm1,           xmm2
-        paddw       xmm7,           xmm1
-
-        pmaddwd     xmm1,           xmm1
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax * 2]
-        movq        xmm3,           QWORD PTR[rdi + rdx * 2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-        movq        xmm2,           QWORD PTR[rsi + rax *2]
-        movq        xmm3,           QWORD PTR[rdi + rdx *2]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        lea         rsi,            [rsi + rax * 2]
-        lea         rdi,            [rdi + rdx * 2]
-
-        movq        xmm2,           QWORD PTR[rsi + rax]
-        movq        xmm3,           QWORD PTR[rdi + rdx]
-
-        punpcklbw   xmm2,           xmm0
-        punpcklbw   xmm3,           xmm0
-
-        psubsw      xmm2,           xmm3
-        paddw       xmm7,           xmm2
-
-        pmaddwd     xmm2,           xmm2
-        paddd       xmm1,           xmm2
-
-
-        movdqa      xmm6,           xmm7
-        punpcklwd   xmm6,           xmm0
-
-        punpckhwd   xmm7,           xmm0
-        movdqa      xmm2,           xmm1
-
-        paddw       xmm6,           xmm7
-        punpckldq   xmm1,           xmm0
-
-        punpckhdq   xmm2,           xmm0
-        movdqa      xmm7,           xmm6
-
-        paddd       xmm1,           xmm2
-        punpckldq   xmm6,           xmm0
-
-        punpckhdq   xmm7,           xmm0
-        paddw       xmm6,           xmm7
-
-        movdqa      xmm2,           xmm1
-        movdqa      xmm7,           xmm6
-
-        psrldq      xmm1,           8
-        psrldq      xmm6,           8
-
-        paddw       xmm7,           xmm6
-        paddd       xmm1,           xmm2
-
-        mov         rax,            arg(5) ;[Sum]
-        mov         rdi,            arg(4) ;[SSE]
-
-        movq        rdx,            xmm7
-        movsx       rcx,            dx
-
-        mov  dword ptr [rax],       ecx
-        movd DWORD PTR [rdi],       xmm1
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
 ;void vp8_filter_block2d_bil_var_sse2
 ;(
 ;    unsigned char *ref_ptr,
diff --git a/vp8/common/x86/variance_mmx.c b/vp8/common/x86/variance_mmx.c
index 10a58b8..25ae576 100644
--- a/vp8/common/x86/variance_mmx.c
+++ b/vp8/common/x86/variance_mmx.c
@@ -35,25 +35,6 @@
     short *filter
 );
 
-extern unsigned int vp8_get_mb_ss_mmx(const short *src_ptr);
-extern unsigned int vp8_get8x8var_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-extern unsigned int vp8_get4x4var_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
 extern void vp8_filter_block2d_bil4x4_var_mmx
 (
     const unsigned char *ref_ptr,
@@ -78,127 +59,6 @@
     unsigned int *sumsquared
 );
 
-
-unsigned int vp8_variance4x4_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
-unsigned int vp8_variance8x8_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-unsigned int vp8_mse16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    *sse = var;
-    return var;
-}
-
-
-unsigned int vp8_variance16x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 8));
-}
-
-unsigned int vp8_variance16x8_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-
-unsigned int vp8_variance8x16_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-
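Every wrapper removed from this file follows one pattern: fetch (SSE, Sum) per 8x8 quadrant from a platform kernel, then combine. A sketch of the 16x16 case, with get8x8var() standing in for vp8_get8x8var_mmx:

    /* get8x8var() is a stand-in for a platform kernel such as
     * vp8_get8x8var_mmx; it is assumed to fill *SSE and *Sum for
     * one 8x8 block. Illustrative only. */
    extern void get8x8var(const unsigned char *src, int src_stride,
                          const unsigned char *ref, int ref_stride,
                          unsigned int *SSE, int *Sum);

    static unsigned int variance16x16_from_quadrants(
        const unsigned char *src, int src_stride,
        const unsigned char *ref, int ref_stride,
        unsigned int *sse)
    {
        unsigned int sse0, sse1, sse2, sse3;
        int sum0, sum1, sum2, sum3, sum;

        get8x8var(src, src_stride, ref, ref_stride, &sse0, &sum0);
        get8x8var(src + 8, src_stride, ref + 8, ref_stride, &sse1, &sum1);
        get8x8var(src + 8 * src_stride, src_stride,
                  ref + 8 * ref_stride, ref_stride, &sse2, &sum2);
        get8x8var(src + 8 * src_stride + 8, src_stride,
                  ref + 8 * ref_stride + 8, ref_stride, &sse3, &sum3);

        *sse = sse0 + sse1 + sse2 + sse3;
        sum = sum0 + sum1 + sum2 + sum3;

        /* >> 8 divides by the 256 pixels of a 16x16 block */
        return *sse - (((unsigned int)sum * sum) >> 8);
    }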
 unsigned int vp8_sub_pixel_variance4x4_mmx
 (
     const unsigned char  *src_ptr,
diff --git a/vp8/common/x86/variance_sse2.c b/vp8/common/x86/variance_sse2.c
index 6c6539d..f6dfb27 100644
--- a/vp8/common/x86/variance_sse2.c
+++ b/vp8/common/x86/variance_sse2.c
@@ -31,38 +31,6 @@
     unsigned int *sumsquared
 );
 
-extern unsigned int vp8_get4x4var_mmx
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-
-unsigned int vp8_get_mb_ss_sse2
-(
-    const short *src_ptr
-);
-unsigned int vp8_get16x16var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
-unsigned int vp8_get8x8var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
 void vp8_filter_block2d_bil_var_sse2
 (
     const unsigned char *ref_ptr,
@@ -136,115 +104,6 @@
     unsigned int *sumsquared
 );
 
-unsigned int vp8_variance4x4_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 4));
-
-}
-
-unsigned int vp8_variance8x8_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int var;
-    int avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 6));
-
-}
-
-
-unsigned int vp8_variance16x16_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0;
-    int sum0;
-
-
-    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    *sse = sse0;
-    return (sse0 - (((unsigned int)sum0 * sum0) >> 8));
-}
-unsigned int vp8_mse16x16_wmt(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-
-    unsigned int sse0;
-    int sum0;
-    vp8_get16x16var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    *sse = sse0;
-    return sse0;
-
-}
-
-
-unsigned int vp8_variance16x8_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
-unsigned int vp8_variance8x16_wmt
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    unsigned int sse0, sse1, var;
-    int sum0, sum1, avg;
-
-    vp8_get8x8var_sse2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_sse2(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
-
-    var = sse0 + sse1;
-    avg = sum0 + sum1;
-    *sse = var;
-    return (var - (((unsigned int)avg * avg) >> 7));
-
-}
-
 unsigned int vp8_sub_pixel_variance4x4_wmt
 (
     const unsigned char  *src_ptr,
diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
index d8c8da5..2a0df64 100644
--- a/vp8/common/x86/variance_ssse3.c
+++ b/vp8/common/x86/variance_ssse3.c
@@ -13,15 +13,6 @@
 #include "vp8/common/variance.h"
 #include "vpx_ports/mem.h"
 
-extern unsigned int vp8_get16x16var_sse2
-(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_ptr,
-    int recon_stride,
-    unsigned int *SSE,
-    int *Sum
-);
 extern void vp8_half_horiz_vert_variance16x_h_sse2
 (
     const unsigned char *ref_ptr,
diff --git a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm b/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
deleted file mode 100644
index 000805d..0000000
--- a/vp8/encoder/arm/armv6/vp8_mse16x16_armv6.asm
+++ /dev/null
@@ -1,138 +0,0 @@
-;
-;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_armv6|
-
-    ARM
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int  recon_stride
-; stack unsigned int *sse
-;
-;note: Based on vp8_variance16x16_armv6. In this function, sum is never used,
-;      so that part of the calculation can be removed.
-
-|vp8_mse16x16_armv6| PROC
-
-    push    {r4-r9, lr}
-
-    pld     [r0, r1, lsl #0]
-    pld     [r2, r3, lsl #0]
-
-    mov     r12, #16            ; set loop counter to 16 (=block height)
-    mov     r4, #0              ; initialize sse = 0
-
-loop
-    ; 1st 4 pixels
-    ldr     r5, [r0, #0x0]      ; load 4 src pixels
-    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
-
-    mov     lr, #0              ; constant zero
-
-    usub8   r8, r5, r6          ; calculate difference
-    pld     [r0, r1, lsl #1]
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    pld     [r2, r3, lsl #1]
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0x4]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 2nd 4 pixels
-    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-    ldr     r5, [r0, #0x8]      ; load 4 src pixels
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 3rd 4 pixels
-    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    ldr     r5, [r0, #0xc]      ; load 4 src pixels
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-
-    ; 4th 4 pixels
-    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    usub8   r8, r5, r6          ; calculate difference
-    add     r0, r0, r1          ; set src_ptr to next row
-    sel     r7, r8, lr          ; select bytes with positive difference
-    usub8   r9, r6, r5          ; calculate difference with reversed operands
-    add     r2, r2, r3          ; set ref_ptr to next row
-    sel     r8, r9, lr          ; select bytes with negative difference
-
-    ; calculate partial sums
-    usad8   r5, r7, lr          ; calculate sum of positive differences
-    usad8   r6, r8, lr          ; calculate sum of negative differences
-    orr     r8, r8, r7          ; differences of all 4 pixels
-
-    subs    r12, r12, #1        ; next row
-
-    ; calculate sse
-    uxtb16  r6, r8              ; byte (two pixels) to halfwords
-    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
-    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
-    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
-
-    bne     loop
-
-    ; return stuff
-    ldr     r1, [sp, #28]       ; get address of sse
-    mov     r0, r4              ; return sse
-    str     r4, [r1]            ; store sse
-
-    pop     {r4-r9, pc}
-
-    ENDP
-
-    END
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
deleted file mode 100644
index f806809..0000000
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-
-unsigned int vp8_mse16x16_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride,
-        unsigned int *sse) {
-    int i;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    int64x1_t d0s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    int32x4_t q7s32, q8s32, q9s32, q10s32;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int64x2_t q1s64;
-
-    q7s32 = vdupq_n_s32(0);
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
-        q0u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q1u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q2u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
-        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
-        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q7s32 = vaddq_s32(q7s32, q8s32);
-    q9s32 = vaddq_s32(q9s32, q10s32);
-    q10s32 = vaddq_s32(q7s32, q9s32);
-
-    q1s64 = vpaddlq_s32(q10s32);
-    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
-    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
-
-unsigned int vp8_get4x4sse_cs_neon(
-        const unsigned char *src_ptr,
-        int source_stride,
-        const unsigned char *ref_ptr,
-        int recon_stride) {
-    int16x4_t d22s16, d24s16, d26s16, d28s16;
-    int64x1_t d0s64;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int32x4_t q7s32, q8s32, q9s32, q10s32;
-    uint16x8_t q11u16, q12u16, q13u16, q14u16;
-    int64x2_t q1s64;
-
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d4u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d1u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d5u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d6u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-    d3u8 = vld1_u8(src_ptr);
-    src_ptr += source_stride;
-    d7u8 = vld1_u8(ref_ptr);
-    ref_ptr += recon_stride;
-
-    q11u16 = vsubl_u8(d0u8, d4u8);
-    q12u16 = vsubl_u8(d1u8, d5u8);
-    q13u16 = vsubl_u8(d2u8, d6u8);
-    q14u16 = vsubl_u8(d3u8, d7u8);
-
-    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
-    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
-    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
-    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
-
-    q7s32 = vmull_s16(d22s16, d22s16);
-    q8s32 = vmull_s16(d24s16, d24s16);
-    q9s32 = vmull_s16(d26s16, d26s16);
-    q10s32 = vmull_s16(d28s16, d28s16);
-
-    q7s32 = vaddq_s32(q7s32, q8s32);
-    q9s32 = vaddq_s32(q9s32, q10s32);
-    q9s32 = vaddq_s32(q7s32, q9s32);
-
-    q1s64 = vpaddlq_s32(q9s32);
-    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
-
-    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
-}
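Both deleted ARM files implement MSE, which is the block SSE with no mean correction (hence the ARMv6 note above that sum is never needed). A scalar reference for comparison, under an illustrative name; the in-tree replacement is vpx_mse16x16:

    unsigned int mse16x16_ref(const unsigned char *src_ptr, int source_stride,
                              const unsigned char *ref_ptr, int recon_stride,
                              unsigned int *sse)
    {
        unsigned int total = 0;
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
            {
                const int diff = src_ptr[c] - ref_ptr[c];
                total += (unsigned int)(diff * diff);
            }

            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        *sse = total;
        return total;   /* MSE returns the raw SSE; no sum^2 term */
    }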
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 378e902..d381d8d 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "encodemb.h"
 #include "encodemv.h"
 #include "vp8/common/common.h"
@@ -90,7 +91,7 @@
      *  lambda using a non-linear combination (e.g., the smallest, or second
      *  smallest, etc.).
      */
-    act =  vp8_variance16x16(x->src.y_buffer,
+    act =  vpx_variance16x16(x->src.y_buffer,
                     x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
     act = act<<4;
 
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index cfa4cb9..e2de5ee 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -11,6 +11,7 @@
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "quantize.h"
 #include "vp8/common/reconintra4x4.h"
 #include "encodemb.h"
@@ -44,7 +45,7 @@
         }
     }
 
-    intra_pred_var = vp8_get_mb_ss(x->src_diff);
+    intra_pred_var = vpx_get_mb_ss(x->src_diff);
 
     return intra_pred_var;
 }
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index a6ff0e7..3deb4ab 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -12,6 +12,7 @@
 #include <limits.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "block.h"
 #include "onyx_int.h"
@@ -422,14 +423,14 @@
     /* Set up pointers for this macro block raw buffer */
     raw_ptr = (unsigned char *)(raw_buffer->y_buffer + recon_yoffset
                                 + d->offset);
-    vp8_mse16x16 ( src_ptr, src_stride, raw_ptr, raw_stride,
-                   (unsigned int *)(raw_motion_err));
+    vpx_mse16x16(src_ptr, src_stride, raw_ptr, raw_stride,
+                 (unsigned int *)(raw_motion_err));
 
     /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
     ref_ptr = (unsigned char *)(xd->pre.y_buffer + d->offset );
-    vp8_mse16x16 ( src_ptr, src_stride, ref_ptr, ref_stride,
-                   (unsigned int *)(best_motion_err));
+    vpx_mse16x16(src_ptr, src_stride, ref_ptr, ref_stride,
+                 (unsigned int *)(best_motion_err));
 }
 
 static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x,
@@ -453,7 +454,7 @@
     int new_mv_mode_penalty = 256;
 
     /* override the default variance function to use MSE */
-    v_fn_ptr.vf    = vp8_mse16x16;
+    v_fn_ptr.vf    = vpx_mse16x16;
 
     /* Set up pointers for this macro block recon buffer */
     xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index c2bb232..40e29e1 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2131,7 +2131,7 @@
 #endif
 
     cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
-    cpi->fn_ptr[BLOCK_16X16].vf             = vp8_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
     cpi->fn_ptr[BLOCK_16X16].svf            = vp8_sub_pixel_variance16x16;
     cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp8_variance_halfpixvar16x16_h;
     cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp8_variance_halfpixvar16x16_v;
@@ -2141,7 +2141,7 @@
     cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;
 
     cpi->fn_ptr[BLOCK_16X8].sdf            = vpx_sad16x8;
-    cpi->fn_ptr[BLOCK_16X8].vf             = vp8_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].vf             = vpx_variance16x8;
     cpi->fn_ptr[BLOCK_16X8].svf            = vp8_sub_pixel_variance16x8;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v  = NULL;
@@ -2151,7 +2151,7 @@
     cpi->fn_ptr[BLOCK_16X8].sdx4df         = vpx_sad16x8x4d;
 
     cpi->fn_ptr[BLOCK_8X16].sdf            = vpx_sad8x16;
-    cpi->fn_ptr[BLOCK_8X16].vf             = vp8_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].vf             = vpx_variance8x16;
     cpi->fn_ptr[BLOCK_8X16].svf            = vp8_sub_pixel_variance8x16;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v  = NULL;
@@ -2161,7 +2161,7 @@
     cpi->fn_ptr[BLOCK_8X16].sdx4df         = vpx_sad8x16x4d;
 
     cpi->fn_ptr[BLOCK_8X8].sdf            = vpx_sad8x8;
-    cpi->fn_ptr[BLOCK_8X8].vf             = vp8_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].vf             = vpx_variance8x8;
     cpi->fn_ptr[BLOCK_8X8].svf            = vp8_sub_pixel_variance8x8;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v  = NULL;
@@ -2171,7 +2171,7 @@
     cpi->fn_ptr[BLOCK_8X8].sdx4df         = vpx_sad8x8x4d;
 
     cpi->fn_ptr[BLOCK_4X4].sdf            = vpx_sad4x4;
-    cpi->fn_ptr[BLOCK_4X4].vf             = vp8_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].vf             = vpx_variance4x4;
     cpi->fn_ptr[BLOCK_4X4].svf            = vp8_sub_pixel_variance4x4;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;
@@ -2558,7 +2558,7 @@
         {
             unsigned int sse;
 
-            vp8_mse16x16(orig + col, orig_stride,
+            vpx_mse16x16(orig + col, orig_stride,
                                             recon + col, recon_stride,
                                             &sse);
             total_sse += sse;
@@ -3384,7 +3384,7 @@
                 int index = block_index_row + (j >> 4);
                 if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
                   unsigned int sse;
-                  Total += vp8_mse16x16(src + j,
+                  Total += vpx_mse16x16(src + j,
                                         source->y_stride,
                                         dst + j, dest->y_stride,
                                         &sse);
@@ -3448,7 +3448,7 @@
       int index = block_index_row + (j >> 4);
       if (cpi->consec_zero_last[index] >= min_consec_zero_last) {
         unsigned int sse;
-        const unsigned int var = vp8_variance16x16(src + j,
+        const unsigned int var = vpx_variance16x16(src + j,
                                                    ystride,
                                                    dst + j,
                                                    ystride,
@@ -3458,7 +3458,7 @@
         // is small (to avoid effects from lighting change).
         if ((sse - var) < 128) {
           unsigned int sse2;
-          const unsigned int act = vp8_variance16x16(src + j,
+          const unsigned int act = vpx_variance16x16(src + j,
                                                      ystride,
                                                      const_source,
                                                      0,
@@ -5993,7 +5993,8 @@
         for (j = 0; j < source->y_width; j += 16)
         {
             unsigned int sse;
-            Total += vp8_mse16x16(src + j, source->y_stride, dst + j, dest->y_stride, &sse);
+            Total += vpx_mse16x16(src + j, source->y_stride,
+                                  dst + j, dest->y_stride, &sse);
         }
 
         src += 16 * source->y_stride;
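
All of the vf swaps above go through the encoder's per-block-size function table, which is why a rename is the entire change: the vpx_dsp kernels share the vp8 signatures, so only the registration site moves. A reduced sketch of the pattern (struct and field names illustrative, not libvpx's exact definitions):

#include <stdint.h>

typedef unsigned int (*variance_fn_t)(const uint8_t *src, int src_stride,
                                      const uint8_t *ref, int ref_stride,
                                      unsigned int *sse);

struct block_fns {
  variance_fn_t vf;  /* full-pixel variance for one block size */
};

extern unsigned int vpx_variance16x16(const uint8_t *, int, const uint8_t *,
                                      int, unsigned int *);

static void register_16x16(struct block_fns *t) {
  /* The swap is a one-line table edit; every caller of t->vf is untouched. */
  t->vf = vpx_variance16x16;
}
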
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 98ea5a0..053bf11 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -11,6 +11,7 @@
 
 #include <limits.h>
 #include "vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 #include "onyx_int.h"
 #include "modecosts.h"
 #include "encodeintra.h"
@@ -215,33 +216,6 @@
 
 }
 
-
-unsigned int vp8_get4x4sse_cs_c
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride
-)
-{
-    int distortion = 0;
-    int r, c;
-
-    for (r = 0; r < 4; r++)
-    {
-        for (c = 0; c < 4; c++)
-        {
-            int diff = src_ptr[c] - ref_ptr[c];
-            distortion += diff * diff;
-        }
-
-        src_ptr += source_stride;
-        ref_ptr += recon_stride;
-    }
-
-    return distortion;
-}
-
 static int get_prediction_error(BLOCK *be, BLOCKD *b)
 {
     unsigned char *sptr;
@@ -249,7 +223,7 @@
     sptr = (*(be->base_src) + be->src);
     dptr = b->predictor;
 
-    return vp8_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
+    return vpx_get4x4sse_cs(sptr, be->src_stride, dptr, 16);
 
 }
 
@@ -1037,7 +1011,7 @@
             else
             {
                 rate2 += rate;
-                distortion2 = vp8_variance16x16(
+                distortion2 = vpx_variance16x16(
                                     *(b->base_src), b->src_stride,
                                     x->e_mbd.predictor, 16, &sse);
                 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
@@ -1066,7 +1040,7 @@
                                              xd->dst.y_stride,
                                              xd->predictor,
                                              16);
-            distortion2 = vp8_variance16x16
+            distortion2 = vpx_variance16x16
                                           (*(b->base_src), b->src_stride,
                                           x->e_mbd.predictor, 16, &sse);
             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
@@ -1547,7 +1521,7 @@
                                          xd->dst.y_stride,
                                          xd->predictor,
                                          16);
-        distortion = vp8_variance16x16
+        distortion = vpx_variance16x16
             (*(b->base_src), b->src_stride, xd->predictor, 16, &sse);
         rate = x->mbmode_cost[xd->frame_type][mode];
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
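
The vp8_get4x4sse_cs_c body deleted above is not lost; the same routine now lives in vpx_dsp as vpx_get4x4sse_cs. Note that the _cs variant differs from the vpx_mse/vpx_variance kernels: it returns the raw sum of squared differences directly and takes no *sse out-parameter. For reference, the behavior (identical to the removed code):

static unsigned int get4x4sse_cs_ref(const unsigned char *src_ptr,
                                     int source_stride,
                                     const unsigned char *ref_ptr,
                                     int recon_stride) {
  unsigned int distortion = 0;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int diff = src_ptr[c] - ref_ptr[c];
      distortion += diff * diff;
    }
    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }
  return distortion;
}
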
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index 890053d..875b37f 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -9,6 +9,7 @@
  */
 
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vp8/common/onyxc_int.h"
 #include "onyx_int.h"
@@ -83,7 +84,7 @@
         for (j = 0; j < source->y_width; j += 16)
         {
             unsigned int sse;
-            Total += vp8_mse16x16(src + j, source->y_stride,
+            Total += vpx_mse16x16(src + j, source->y_stride,
                                                      dst + j, dest->y_stride,
                                                      &sse);
         }
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 10d3408..e8796a1 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1587,7 +1587,7 @@
     // Threshold for the average (over all macroblocks) of the pixel-sum
     // residual error over a 16x16 block. Should add QP dependence on threshold?
     int thresh_pred_err_mb = (256 << 4);
-    int pred_err_mb = cpi->mb.prediction_error / cpi->common.MBs;
+    int pred_err_mb = (int)(cpi->mb.prediction_error / cpi->common.MBs);
     if (Q < thresh_qp &&
         cpi->projected_frame_size > thresh_rate &&
         pred_err_mb > thresh_pred_err_mb) {
@@ -1601,7 +1601,9 @@
       cpi->force_maxqp = 0;
       return 0;
     }
+    cpi->force_maxqp = 0;
     return 0;
   }
+  cpi->force_maxqp = 0;
   return 0;
 }
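
Two distinct fixes land in this file: the explicit (int) cast makes the narrowing from the wider prediction_error accumulator explicit (the per-macroblock average fits comfortably in an int), and force_maxqp is now cleared on every exit path instead of just one, so the flag cannot persist into later frames once set. The control flow, reduced to a sketch (names mirror the diff, the surrounding logic is simplified):

/* A sticky per-frame flag must be reset on all paths that decide not to
 * set it, or a single trigger lingers across frames. */
static int check_frame_drop(int q_low_enough, int frame_too_big,
                            int pred_err_too_big, int *force_maxqp) {
  if (q_low_enough && frame_too_big && pred_err_too_big) {
    *force_maxqp = 1;  /* clamp this frame to the maximum QP */
    return 1;
  }
  *force_maxqp = 0;    /* previously missed on some of these paths */
  return 0;
}
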
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 9ccd85e..17194f0 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -15,6 +15,7 @@
 #include <assert.h>
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "tokenize.h"
 #include "treewriter.h"
 #include "onyx_int.h"
@@ -507,9 +508,9 @@
     }
     else
     {
-        vp8_variance8x8(uptr, pre_stride,
+        vpx_variance8x8(uptr, pre_stride,
             upred_ptr, uv_stride, &sse2);
-        vp8_variance8x8(vptr, pre_stride,
+        vpx_variance8x8(vptr, pre_stride,
             vpred_ptr, uv_stride, &sse1);
         sse2 += sse1;
     }
@@ -1783,7 +1784,7 @@
         if(threshold < x->encode_breakout)
             threshold = x->encode_breakout;
 
-        var = vp8_variance16x16
+        var = vpx_variance16x16
                 (*(b->base_src), b->src_stride,
                 x->e_mbd.predictor, 16, &sse);
 
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index b4c8140..c71d592 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -145,8 +145,6 @@
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequant_idct_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequantize_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance8x8_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance16x16_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
@@ -168,7 +166,6 @@
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/variance_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance_neon.c
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 0500301..0b0f6a7 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -18,7 +18,6 @@
 #File list for media
 # encoder
 VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM)
-VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/vp8_mse16x16_armv6$(ASM)
 VP8_CX_SRCS-$(HAVE_MEDIA)  += encoder/arm/armv6/walsh_v6$(ASM)
 
 #File list for neon
@@ -27,5 +26,4 @@
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_mse16x16_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index d0beaa7..66cf660 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -11,463 +11,415 @@
 #include <stddef.h>
 #include <arm_neon.h>
 
-void vp9_v_predictor_4x4_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int i;
-    uint32x2_t d0u32 = vdup_n_u32(0);
-    (void)left;
+void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint32x2_t d0u32 = vdup_n_u32(0);
+  (void)left;
 
-    d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
-    for (i = 0; i < 4; i++, dst += y_stride)
-        vst1_lane_u32((uint32_t *)dst, d0u32, 0);
-    return;
+  d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+  for (i = 0; i < 4; i++, dst += y_stride)
+    vst1_lane_u32((uint32_t *)dst, d0u32, 0);
 }
 
-void vp9_v_predictor_8x8_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int i;
-    uint8x8_t d0u8 = vdup_n_u8(0);
-    (void)left;
+void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  (void)left;
 
-    d0u8 = vld1_u8(above);
-    for (i = 0; i < 8; i++, dst += y_stride)
-        vst1_u8(dst, d0u8);
-    return;
+  d0u8 = vld1_u8(above);
+  for (i = 0; i < 8; i++, dst += y_stride)
+    vst1_u8(dst, d0u8);
 }
 
-void vp9_v_predictor_16x16_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int i;
-    uint8x16_t q0u8 = vdupq_n_u8(0);
-    (void)left;
+void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  (void)left;
 
-    q0u8 = vld1q_u8(above);
-    for (i = 0; i < 16; i++, dst += y_stride)
-        vst1q_u8(dst, q0u8);
-    return;
+  q0u8 = vld1q_u8(above);
+  for (i = 0; i < 16; i++, dst += y_stride)
+    vst1q_u8(dst, q0u8);
 }
 
-void vp9_v_predictor_32x32_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int i;
-    uint8x16_t q0u8 = vdupq_n_u8(0);
-    uint8x16_t q1u8 = vdupq_n_u8(0);
-    (void)left;
+void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)left;
 
-    q0u8 = vld1q_u8(above);
-    q1u8 = vld1q_u8(above + 16);
-    for (i = 0; i < 32; i++, dst += y_stride) {
-        vst1q_u8(dst, q0u8);
-        vst1q_u8(dst + 16, q1u8);
-    }
-    return;
+  q0u8 = vld1q_u8(above);
+  q1u8 = vld1q_u8(above + 16);
+  for (i = 0; i < 32; i++, dst += y_stride) {
+    vst1q_u8(dst, q0u8);
+    vst1q_u8(dst + 16, q1u8);
+  }
 }
 
-void vp9_h_predictor_4x4_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    uint8x8_t d0u8 = vdup_n_u8(0);
-    uint32x2_t d1u32 = vdup_n_u32(0);
-    (void)above;
+void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d1u32 = vdup_n_u32(0);
+  (void)above;
 
-    d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+  d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
 
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
-    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-    return;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+  vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
 }
 
-void vp9_h_predictor_8x8_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    uint8x8_t d0u8 = vdup_n_u8(0);
-    uint64x1_t d1u64 = vdup_n_u64(0);
-    (void)above;
+void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint64x1_t d1u64 = vdup_n_u64(0);
+  (void)above;
 
-    d1u64 = vld1_u64((const uint64_t *)left);
+  d1u64 = vld1_u64((const uint64_t *)left);
 
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
-    vst1_u8(dst, d0u8);
-    dst += y_stride;
-    d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
-    vst1_u8(dst, d0u8);
-    return;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+  vst1_u8(dst, d0u8);
+  dst += y_stride;
+  d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+  vst1_u8(dst, d0u8);
 }
 
-void vp9_h_predictor_16x16_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int j;
-    uint8x8_t d2u8 = vdup_n_u8(0);
-    uint8x16_t q0u8 = vdupq_n_u8(0);
-    uint8x16_t q1u8 = vdupq_n_u8(0);
-    (void)above;
+void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
 
+  q1u8 = vld1q_u8(left);
+  d2u8 = vget_low_u8(q1u8);
+  for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+    q0u8 = vdupq_lane_u8(d2u8, 0);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 1);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 2);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 3);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 4);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 5);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 6);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+    q0u8 = vdupq_lane_u8(d2u8, 7);
+    vst1q_u8(dst, q0u8);
+    dst += y_stride;
+  }
+}
+
+void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint8x8_t d2u8 = vdup_n_u8(0);
+  uint8x16_t q0u8 = vdupq_n_u8(0);
+  uint8x16_t q1u8 = vdupq_n_u8(0);
+  (void)above;
+
+  for (k = 0; k < 2; k++, left += 16) {
     q1u8 = vld1q_u8(left);
     d2u8 = vget_low_u8(q1u8);
     for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-        q0u8 = vdupq_lane_u8(d2u8, 0);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 1);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 2);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 3);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 4);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 5);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 6);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
-        q0u8 = vdupq_lane_u8(d2u8, 7);
-        vst1q_u8(dst, q0u8);
-        dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 0);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 1);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 2);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 3);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 4);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 5);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 6);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
+      q0u8 = vdupq_lane_u8(d2u8, 7);
+      vst1q_u8(dst, q0u8);
+      vst1q_u8(dst + 16, q0u8);
+      dst += y_stride;
     }
-    return;
+  }
 }
 
-void vp9_h_predictor_32x32_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int j, k;
-    uint8x8_t d2u8 = vdup_n_u8(0);
-    uint8x16_t q0u8 = vdupq_n_u8(0);
-    uint8x16_t q1u8 = vdupq_n_u8(0);
-    (void)above;
+void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int i;
+  uint16x8_t q1u16, q3u16;
+  int16x8_t q1s16;
+  uint8x8_t d0u8 = vdup_n_u8(0);
+  uint32x2_t d2u32 = vdup_n_u32(0);
 
-    for (k = 0; k < 2; k++, left += 16) {
-        q1u8 = vld1q_u8(left);
-        d2u8 = vget_low_u8(q1u8);
-        for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
-            q0u8 = vdupq_lane_u8(d2u8, 0);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 1);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 2);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 3);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 4);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 5);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 6);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-            q0u8 = vdupq_lane_u8(d2u8, 7);
-            vst1q_u8(dst, q0u8);
-            vst1q_u8(dst + 16, q0u8);
-            dst += y_stride;
-        }
-    }
-    return;
+  d0u8 = vdup_n_u8(above[-1]);
+  d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+  q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+  for (i = 0; i < 4; i++, dst += y_stride) {
+    q1u16 = vdupq_n_u16((uint16_t)left[i]);
+    q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+                      vreinterpretq_s16_u16(q3u16));
+    d0u8 = vqmovun_s16(q1s16);
+    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+  }
 }
 
-void vp9_tm_predictor_4x4_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int i;
-    uint16x8_t q1u16, q3u16;
-    int16x8_t q1s16;
-    uint8x8_t d0u8 = vdup_n_u8(0);
-    uint32x2_t d2u32 = vdup_n_u32(0);
+void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+                               const uint8_t *above, const uint8_t *left) {
+  int j;
+  uint16x8_t q0u16, q3u16, q10u16;
+  int16x8_t q0s16;
+  uint16x4_t d20u16;
+  uint8x8_t d0u8, d2u8, d30u8;
 
-    d0u8 = vdup_n_u8(above[-1]);
-    d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
-    q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
-    for (i = 0; i < 4; i++, dst += y_stride) {
-        q1u16 = vdupq_n_u16((uint16_t)left[i]);
-        q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
-                          vreinterpretq_s16_u16(q3u16));
-        d0u8 = vqmovun_s16(q1s16);
-        vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
-    }
-    return;
+  d0u8 = vdup_n_u8(above[-1]);
+  d30u8 = vld1_u8(left);
+  d2u8 = vld1_u8(above);
+  q10u16 = vmovl_u8(d30u8);
+  q3u16 = vsubl_u8(d2u8, d0u8);
+  d20u16 = vget_low_u16(q10u16);
+  for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+    q0u16 = vdupq_lane_u16(d20u16, 0);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += y_stride;
+    q0u16 = vdupq_lane_u16(d20u16, 1);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += y_stride;
+    q0u16 = vdupq_lane_u16(d20u16, 2);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += y_stride;
+    q0u16 = vdupq_lane_u16(d20u16, 3);
+    q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+                      vreinterpretq_s16_u16(q0u16));
+    d0u8 = vqmovun_s16(q0s16);
+    vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+    dst += y_stride;
+  }
 }
 
-void vp9_tm_predictor_8x8_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int j;
-    uint16x8_t q0u16, q3u16, q10u16;
-    int16x8_t q0s16;
-    uint16x4_t d20u16;
-    uint8x8_t d0u8, d2u8, d30u8;
+void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+  uint8x16_t q0u8, q1u8;
+  int16x8_t q0s16, q1s16, q8s16, q11s16;
+  uint16x4_t d20u16;
+  uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
 
-    d0u8 = vdup_n_u8(above[-1]);
-    d30u8 = vld1_u8(left);
-    d2u8 = vld1_u8(above);
-    q10u16 = vmovl_u8(d30u8);
-    q3u16 = vsubl_u8(d2u8, d0u8);
+  q0u8 = vdupq_n_u8(above[-1]);
+  q1u8 = vld1q_u8(above);
+  q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  for (k = 0; k < 2; k++, left += 8) {
+    d18u8 = vld1_u8(left);
+    q10u16 = vmovl_u8(d18u8);
     d20u16 = vget_low_u16(q10u16);
     for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
-        q0u16 = vdupq_lane_u16(d20u16, 0);
-        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
-                          vreinterpretq_s16_u16(q0u16));
-        d0u8 = vqmovun_s16(q0s16);
-        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-        dst += y_stride;
-        q0u16 = vdupq_lane_u16(d20u16, 1);
-        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
-                          vreinterpretq_s16_u16(q0u16));
-        d0u8 = vqmovun_s16(q0s16);
-        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-        dst += y_stride;
-        q0u16 = vdupq_lane_u16(d20u16, 2);
-        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
-                          vreinterpretq_s16_u16(q0u16));
-        d0u8 = vqmovun_s16(q0s16);
-        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-        dst += y_stride;
-        q0u16 = vdupq_lane_u16(d20u16, 3);
-        q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
-                          vreinterpretq_s16_u16(q0u16));
-        d0u8 = vqmovun_s16(q0s16);
-        vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
-        dst += y_stride;
+      q0u16 = vdupq_lane_u16(d20u16, 0);
+      q8u16 = vdupq_lane_u16(d20u16, 1);
+      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q2u16));
+      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q3u16));
+      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                         vreinterpretq_s16_u16(q2u16));
+      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                        vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += y_stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += y_stride;
+
+      q0u16 = vdupq_lane_u16(d20u16, 2);
+      q8u16 = vdupq_lane_u16(d20u16, 3);
+      q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q2u16));
+      q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                        vreinterpretq_s16_u16(q3u16));
+      q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                         vreinterpretq_s16_u16(q2u16));
+      q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+                        vreinterpretq_s16_u16(q3u16));
+      d2u8 = vqmovun_s16(q1s16);
+      d3u8 = vqmovun_s16(q0s16);
+      d22u8 = vqmovun_s16(q11s16);
+      d23u8 = vqmovun_s16(q8s16);
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+      dst += y_stride;
+      vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+      vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+      dst += y_stride;
     }
-    return;
+  }
 }
 
-void vp9_tm_predictor_16x16_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int j, k;
-    uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
-    uint8x16_t q0u8, q1u8;
-    int16x8_t q0s16, q1s16, q8s16, q11s16;
-    uint16x4_t d20u16;
-    uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
+void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  int j, k;
+  uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+  uint8x16_t q0u8, q1u8, q2u8;
+  int16x8_t q12s16, q13s16, q14s16, q15s16;
+  uint16x4_t d6u16;
+  uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
 
-    q0u8 = vdupq_n_u8(above[-1]);
-    q1u8 = vld1q_u8(above);
-    q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
-    q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
-    for (k = 0; k < 2; k++, left += 8) {
-        d18u8 = vld1_u8(left);
-        q10u16 = vmovl_u8(d18u8);
-        d20u16 = vget_low_u16(q10u16);
-        for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
-            q0u16 = vdupq_lane_u16(d20u16, 0);
-            q8u16 = vdupq_lane_u16(d20u16, 1);
-            q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                              vreinterpretq_s16_u16(q2u16));
-            q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                              vreinterpretq_s16_u16(q3u16));
-            q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
-                              vreinterpretq_s16_u16(q2u16));
-            q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
-                              vreinterpretq_s16_u16(q3u16));
-            d2u8 = vqmovun_s16(q1s16);
-            d3u8 = vqmovun_s16(q0s16);
-            d22u8 = vqmovun_s16(q11s16);
-            d23u8 = vqmovun_s16(q8s16);
-            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
-            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
-            dst += y_stride;
-            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
-            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
-            dst += y_stride;
+  q0u8 = vdupq_n_u8(above[-1]);
+  q1u8 = vld1q_u8(above);
+  q2u8 = vld1q_u8(above + 16);
+  q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+  q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+  q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+  q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+  for (k = 0; k < 4; k++, left += 8) {
+    d26u8 = vld1_u8(left);
+    q3u16 = vmovl_u8(d26u8);
+    d6u16 = vget_low_u16(q3u16);
+    for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+      q0u16 = vdupq_lane_u16(d6u16, 0);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += y_stride;
 
-            q0u16 = vdupq_lane_u16(d20u16, 2);
-            q8u16 = vdupq_lane_u16(d20u16, 3);
-            q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                              vreinterpretq_s16_u16(q2u16));
-            q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                              vreinterpretq_s16_u16(q3u16));
-            q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
-                              vreinterpretq_s16_u16(q2u16));
-            q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
-                              vreinterpretq_s16_u16(q3u16));
-            d2u8 = vqmovun_s16(q1s16);
-            d3u8 = vqmovun_s16(q0s16);
-            d22u8 = vqmovun_s16(q11s16);
-            d23u8 = vqmovun_s16(q8s16);
-            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
-            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
-            dst += y_stride;
-            vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
-            vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
-            dst += y_stride;
-        }
+      q0u16 = vdupq_lane_u16(d6u16, 1);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += y_stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 2);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += y_stride;
+
+      q0u16 = vdupq_lane_u16(d6u16, 3);
+      q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q8u16));
+      q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q9u16));
+      q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q10u16));
+      q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+                         vreinterpretq_s16_u16(q11u16));
+      d0u8 = vqmovun_s16(q12s16);
+      d1u8 = vqmovun_s16(q13s16);
+      d2u8 = vqmovun_s16(q14s16);
+      d3u8 = vqmovun_s16(q15s16);
+      q0u8 = vcombine_u8(d0u8, d1u8);
+      q1u8 = vcombine_u8(d2u8, d3u8);
+      vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+      vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+      dst += y_stride;
     }
-    return;
-}
-
-void vp9_tm_predictor_32x32_neon(
-        uint8_t *dst,
-        ptrdiff_t y_stride,
-        const uint8_t *above,
-        const uint8_t *left) {
-    int j, k;
-    uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
-    uint8x16_t q0u8, q1u8, q2u8;
-    int16x8_t q12s16, q13s16, q14s16, q15s16;
-    uint16x4_t d6u16;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
-
-    q0u8 = vdupq_n_u8(above[-1]);
-    q1u8 = vld1q_u8(above);
-    q2u8 = vld1q_u8(above + 16);
-    q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
-    q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
-    q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
-    q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
-    for (k = 0; k < 4; k++, left += 8) {
-        d26u8 = vld1_u8(left);
-        q3u16 = vmovl_u8(d26u8);
-        d6u16 = vget_low_u16(q3u16);
-        for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
-            q0u16 = vdupq_lane_u16(d6u16, 0);
-            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q8u16));
-            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q9u16));
-            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q10u16));
-            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q11u16));
-            d0u8 = vqmovun_s16(q12s16);
-            d1u8 = vqmovun_s16(q13s16);
-            d2u8 = vqmovun_s16(q14s16);
-            d3u8 = vqmovun_s16(q15s16);
-            q0u8 = vcombine_u8(d0u8, d1u8);
-            q1u8 = vcombine_u8(d2u8, d3u8);
-            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-            dst += y_stride;
-
-            q0u16 = vdupq_lane_u16(d6u16, 1);
-            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q8u16));
-            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q9u16));
-            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q10u16));
-            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q11u16));
-            d0u8 = vqmovun_s16(q12s16);
-            d1u8 = vqmovun_s16(q13s16);
-            d2u8 = vqmovun_s16(q14s16);
-            d3u8 = vqmovun_s16(q15s16);
-            q0u8 = vcombine_u8(d0u8, d1u8);
-            q1u8 = vcombine_u8(d2u8, d3u8);
-            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-            dst += y_stride;
-
-            q0u16 = vdupq_lane_u16(d6u16, 2);
-            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q8u16));
-            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q9u16));
-            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q10u16));
-            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q11u16));
-            d0u8 = vqmovun_s16(q12s16);
-            d1u8 = vqmovun_s16(q13s16);
-            d2u8 = vqmovun_s16(q14s16);
-            d3u8 = vqmovun_s16(q15s16);
-            q0u8 = vcombine_u8(d0u8, d1u8);
-            q1u8 = vcombine_u8(d2u8, d3u8);
-            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-            dst += y_stride;
-
-            q0u16 = vdupq_lane_u16(d6u16, 3);
-            q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q8u16));
-            q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q9u16));
-            q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q10u16));
-            q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
-                               vreinterpretq_s16_u16(q11u16));
-            d0u8 = vqmovun_s16(q12s16);
-            d1u8 = vqmovun_s16(q13s16);
-            d2u8 = vqmovun_s16(q14s16);
-            d3u8 = vqmovun_s16(q15s16);
-            q0u8 = vcombine_u8(d0u8, d1u8);
-            q1u8 = vcombine_u8(d2u8, d3u8);
-            vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
-            vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
-            dst += y_stride;
-        }
-    }
-    return;
+  }
 }
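
The whole file above is a mechanical restyle: 4-space indentation becomes the project's 2-space style, parameter lists are compacted, and the redundant trailing return; statements are dropped, with no change to the intrinsics. For orientation, a scalar reference (a sketch, not libvpx's code) of the TM prediction that the vsubl/vaddq/vqmovun sequences vectorize:

#include <stddef.h>
#include <stdint.h>

static uint8_t clip_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* Each output pixel is left[r] + above[c] - above[-1], saturated to
 * [0, 255]; bs is 4, 8, 16 or 32 for the kernels above. */
static void tm_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bs,
                             const uint8_t *above, const uint8_t *left) {
  const int top_left = above[-1];
  int r, c;
  for (r = 0; r < bs; ++r) {
    for (c = 0; c < bs; ++c)
      dst[c] = clip_u8(left[r] + above[c] - top_left);
    dst += stride;
  }
}
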
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 600cb13..8eda491 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -57,6 +57,7 @@
     if (cm->seg_map_array[i] == NULL)
       return 1;
   }
+  cm->seg_map_alloc_size = seg_map_size;
 
   // Init the index.
   cm->seg_map_idx = 0;
@@ -118,25 +119,36 @@
 }
 
 int vp9_alloc_context_buffers(VP9_COMMON *cm, int width, int height) {
-  vp9_free_context_buffers(cm);
+  int new_mi_size;
 
   vp9_set_mb_mi(cm, width, height);
-  if (cm->alloc_mi(cm, cm->mi_stride * calc_mi_size(cm->mi_rows)))
-    goto fail;
+  new_mi_size = cm->mi_stride * calc_mi_size(cm->mi_rows);
+  if (cm->mi_alloc_size < new_mi_size) {
+    cm->free_mi(cm);
+    if (cm->alloc_mi(cm, new_mi_size))
+      goto fail;
+  }
 
-  // Create the segmentation map structure and set to 0.
-  free_seg_map(cm);
-  if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
-    goto fail;
+  if (cm->seg_map_alloc_size < cm->mi_rows * cm->mi_cols) {
+    // Create the segmentation map structure and zero it.
+    free_seg_map(cm);
+    if (alloc_seg_map(cm, cm->mi_rows * cm->mi_cols))
+      goto fail;
+  }
 
-  cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
-      2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
-      sizeof(*cm->above_context));
-  if (!cm->above_context) goto fail;
+  if (cm->above_context_alloc_cols < cm->mi_cols) {
+    vpx_free(cm->above_context);
+    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
+        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
+        sizeof(*cm->above_context));
+    if (!cm->above_context) goto fail;
 
-  cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
-      mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
-  if (!cm->above_seg_context) goto fail;
+    vpx_free(cm->above_seg_context);
+    cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
+        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+    if (!cm->above_seg_context) goto fail;
+    cm->above_context_alloc_cols = cm->mi_cols;
+  }
 
   return 0;
 
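The rewrite above converts vp9_alloc_context_buffers() from unconditional free-then-allocate into a grow-only cache: each buffer keeps a high-water capacity (mi_alloc_size, seg_map_alloc_size, above_context_alloc_cols) and is reallocated only when a new frame size outgrows it, so resolution switches that shrink or repeat a size cause no allocator churn. The pattern, reduced to a sketch:

#include <stdlib.h>

typedef struct {
  unsigned char *buf;
  size_t cap;  /* high-water allocation size, like the *_alloc_size fields */
} grow_buf;

static int grow_buf_ensure(grow_buf *g, size_t need) {
  if (g->cap < need) {            /* only grow; never shrink */
    free(g->buf);                 /* old contents need not survive */
    g->buf = (unsigned char *)calloc(1, need);  /* mirrors vpx_calloc above */
    if (g->buf == NULL) {
      g->cap = 0;
      return 1;
    }
    g->cap = need;
  }
  return 0;
}
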
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 097053a..319d348 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -18,74 +18,28 @@
 #include "vpx_scale/yv12config.h"
 
 #include "vp9/common/vp9_common_data.h"
-#include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_mv.h"
 #include "vp9/common/vp9_scale.h"
+#include "vp9/common/vp9_seg_common.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-#define BLOCK_SIZE_GROUPS 4
-#define SKIP_CONTEXTS 3
-#define INTER_MODE_CONTEXTS 7
-
-/* Segment Feature Masks */
-#define MAX_MV_REF_CANDIDATES 2
-
-#define INTRA_INTER_CONTEXTS 4
-#define COMP_INTER_CONTEXTS 5
-#define REF_CONTEXTS 5
-
-typedef enum {
-  PLANE_TYPE_Y  = 0,
-  PLANE_TYPE_UV = 1,
-  PLANE_TYPES
-} PLANE_TYPE;
-
 #define MAX_MB_PLANE 3
 
-typedef char ENTROPY_CONTEXT;
-
-static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
-                                           ENTROPY_CONTEXT b) {
-  return (a != 0) + (b != 0);
-}
-
 typedef enum {
   KEY_FRAME = 0,
   INTER_FRAME = 1,
   FRAME_TYPES,
 } FRAME_TYPE;
 
-typedef enum {
-  DC_PRED,         // Average of above and left pixels
-  V_PRED,          // Vertical
-  H_PRED,          // Horizontal
-  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
-  D135_PRED,       // Directional 135 deg = 180 - 45
-  D117_PRED,       // Directional 117 deg = 180 - 63
-  D153_PRED,       // Directional 153 deg = 180 - 27
-  D207_PRED,       // Directional 207 deg = 180 + 27
-  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
-  TM_PRED,         // True-motion
-  NEARESTMV,
-  NEARMV,
-  ZEROMV,
-  NEWMV,
-  MB_MODE_COUNT
-} PREDICTION_MODE;
-
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
   return mode >= NEARESTMV && mode <= NEWMV;
 }
 
-#define INTRA_MODES (TM_PRED + 1)
-
-#define INTER_MODES (1 + NEWMV - NEARESTMV)
-
-#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
-
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -281,6 +235,27 @@
   return ss_size_lookup[bsize][pd->subsampling_x][pd->subsampling_y];
 }
 
+static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    struct macroblockd_plane *const pd = &xd->plane[i];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    memset(pd->above_context, 0,
+           sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
+    memset(pd->left_context, 0,
+           sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
+  }
+}
+
+static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
+                                               const MODE_INFO *above_mi,
+                                               const MODE_INFO *left_mi,
+                                               int block) {
+  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
+  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
+  return vp9_kf_y_mode_prob[above][left];
+}
+
 typedef void (*foreach_transformed_block_visitor)(int plane, int block,
                                                   BLOCK_SIZE plane_bsize,
                                                   TX_SIZE tx_size,
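
The header churn above serves one goal: vp9_blockd.h and the entropy headers previously depended on each other, so shared enums sink into the leaf header vp9_enums.h and each inline helper moves to the header that already owns its dependencies (reset_skip_context needs MACROBLOCKD, so it lands here). In miniature (file and type names hypothetical):

/* before: blockd.h and entropy.h include each other (a cycle)
 * after:  both include the leaf enums.h; the cycle is gone.   */

/* enums.h: plain enums only, no project includes */
#ifndef ENUMS_H_
#define ENUMS_H_
typedef enum { PLANE_Y, PLANE_UV } plane_t;
#endif

/* entropy.h: now needs only the enum */
#ifndef ENTROPY_H_
#define ENTROPY_H_
#include "enums.h"   /* no longer pulls in blockd.h */
#endif
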
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 5a9007b..4e02630 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -14,8 +14,8 @@
 #include "vpx/vpx_integer.h"
 
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_enums.h"
 #include "vp9/common/vp9_prob.h"
-#include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -137,18 +137,6 @@
 void vp9_default_coef_probs(struct VP9Common *cm);
 void vp9_adapt_coef_probs(struct VP9Common *cm);
 
-static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) {
-  int i;
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    struct macroblockd_plane *const pd = &xd->plane[i];
-    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-    memset(pd->above_context, 0,
-           sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide_lookup[plane_bsize]);
-    memset(pd->left_context, 0,
-           sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high_lookup[plane_bsize]);
-  }
-}
-
 // This is the index in the scan order beyond which all coefficients for
 // 8x8 transform and above are in the top band.
 // This macro is currently unused but may be used by certain implementations
@@ -185,6 +173,13 @@
 
 void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full);
 
+typedef char ENTROPY_CONTEXT;
+
+static INLINE int combine_entropy_contexts(ENTROPY_CONTEXT a,
+                                           ENTROPY_CONTEXT b) {
+  return (a != 0) + (b != 0);
+}
+
 static INLINE int get_entropy_context(TX_SIZE tx_size, const ENTROPY_CONTEXT *a,
                                       const ENTROPY_CONTEXT *l) {
   ENTROPY_CONTEXT above_ec = 0, left_ec = 0;
@@ -214,18 +209,6 @@
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
-static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
-                                         PLANE_TYPE type, int block_idx) {
-  const MODE_INFO *const mi = xd->mi[0];
-
-  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
-    return &vp9_default_scan_orders[tx_size];
-  } else {
-    const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
-    return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
-  }
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
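
The relocated combine_entropy_contexts() maps the two neighbouring contexts to a three-way value: 0 when neither neighbour carried coefficients, 1 when exactly one did, 2 when both did. A self-checking sketch of the arithmetic:

#include <assert.h>

static int combine(char a, char b) {  /* same body as the inline above */
  return (a != 0) + (b != 0);
}

int main(void) {
  assert(combine(0, 0) == 0);  /* no coded neighbour */
  assert(combine(7, 0) == 1);  /* exactly one coded neighbour */
  assert(combine(3, 9) == 2);  /* both neighbours coded */
  return 0;
}
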
diff --git a/vp9/common/vp9_entropymode.h b/vp9/common/vp9_entropymode.h
index f4e20e1..a0619ec 100644
--- a/vp9/common/vp9_entropymode.h
+++ b/vp9/common/vp9_entropymode.h
@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_VP9_ENTROPYMODE_H_
 #define VP9_COMMON_VP9_ENTROPYMODE_H_
 
-#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_filter.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymv.h"
 
@@ -19,8 +19,12 @@
 extern "C" {
 #endif
 
+#define BLOCK_SIZE_GROUPS 4
+
 #define TX_SIZE_CONTEXTS 2
 
+#define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+
 struct VP9Common;
 
 struct tx_probs {
@@ -97,15 +101,6 @@
 void tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
                                     unsigned int (*ct_8x8p)[2]);
 
-static INLINE const vp9_prob *get_y_mode_probs(const MODE_INFO *mi,
-                                               const MODE_INFO *above_mi,
-                                               const MODE_INFO *left_mi,
-                                               int block) {
-  const PREDICTION_MODE above = vp9_above_block_mode(mi, above_mi, block);
-  const PREDICTION_MODE left = vp9_left_block_mode(mi, left_mi, block);
-  return vp9_kf_y_mode_prob[above][left];
-}
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 7938fc1..0482025 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -104,6 +104,44 @@
   VP9_ALT_FLAG = 1 << 2,
 } VP9_REFFRAME;
 
+typedef enum {
+  PLANE_TYPE_Y  = 0,
+  PLANE_TYPE_UV = 1,
+  PLANE_TYPES
+} PLANE_TYPE;
+
+typedef enum {
+  DC_PRED,         // Average of above and left pixels
+  V_PRED,          // Vertical
+  H_PRED,          // Horizontal
+  D45_PRED,        // Directional 45  deg = round(arctan(1/1) * 180/pi)
+  D135_PRED,       // Directional 135 deg = 180 - 45
+  D117_PRED,       // Directional 117 deg = 180 - 63
+  D153_PRED,       // Directional 153 deg = 180 - 27
+  D207_PRED,       // Directional 207 deg = 180 + 27
+  D63_PRED,        // Directional 63  deg = round(arctan(2/1) * 180/pi)
+  TM_PRED,         // True-motion
+  NEARESTMV,
+  NEARMV,
+  ZEROMV,
+  NEWMV,
+  MB_MODE_COUNT
+} PREDICTION_MODE;
+
+#define INTRA_MODES (TM_PRED + 1)
+
+#define INTER_MODES (1 + NEWMV - NEARESTMV)
+
+#define SKIP_CONTEXTS 3
+#define INTER_MODE_CONTEXTS 7
+
+/* Segment Feature Masks */
+#define MAX_MV_REF_CANDIDATES 2
+
+#define INTRA_INTER_CONTEXTS 4
+#define COMP_INTER_CONTEXTS 5
+#define REF_CONTEXTS 5
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
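
The derived macros above lean entirely on the enum ordering: the intra modes occupy [DC_PRED .. TM_PRED], giving INTRA_MODES == 10, and the inter modes occupy [NEARESTMV .. NEWMV], giving INTER_MODES == 4. A compile-time guard (a sketch, not present in the source) that would catch an accidental reordering:

/* Negative array size on failure: the classic pre-C11 static assert. */
typedef char assert_intra_modes_is_10[(TM_PRED + 1 == 10) ? 1 : -1];
typedef char assert_inter_modes_is_4[(1 + NEWMV - NEARESTMV == 4) ? 1 : -1];
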
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 57189df..bebb37e 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -171,13 +171,13 @@
   get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
 
   if (bs == BLOCK_16X16) {
-    vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+    vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
     sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
   } else if (bs == BLOCK_32X32) {
-    vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+    vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
     sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
   } else /* if (bs == BLOCK_64X64) */ {
-    vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+    vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
     sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
   }
 
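The rounding constants above are round-to-nearest division by each block's pixel count: a 16x16 block has 256 pixels (>> 8), 32x32 has 1024 (>> 10), 64x64 has 4096 (>> 12), with half the divisor added before the shift. For example, a raw SAD of 300 on a 16x16 block yields (300 + 128) >> 8 = 1, i.e. an average of 300/256 = 1.17 rounded to 1 per pixel. As a generic helper (a sketch):

static unsigned int per_pixel_avg(unsigned int total, int log2_npixels) {
  /* add half the divisor, then shift: rounds to nearest */
  return (total + (1u << (log2_npixels - 1))) >> log2_npixels;
}
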
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 51e147e..ce69527 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -223,6 +223,6 @@
       break;
     }
     default:
-      assert("Invalid block index.");
+      assert(0 && "Invalid block index.");
   }
 }
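
The one-line change above fixes a silent no-op: a string literal decays to a non-null pointer, so assert("Invalid block index.") is always true and can never fire. ANDing it with 0 keeps the message in the failure output while making the condition genuinely false:

#include <assert.h>

void broken(void)  { assert("Invalid block index.");      /* never fires */ }
void working(void) { assert(0 && "Invalid block index."); /* always fires */ }
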
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 5179c69..188b03d 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "./vp9_rtcd.h"
+#include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
 #include "vp9/common/vp9_entropymv.h"
 #include "vp9/common/vp9_entropy.h"
@@ -220,6 +221,7 @@
   uint8_t *seg_map_array[NUM_PING_PONG_BUFFERS];
   uint8_t *last_frame_seg_map;
   uint8_t *current_frame_seg_map;
+  int seg_map_alloc_size;
 
   INTERP_FILTER interp_filter;
 
@@ -276,6 +278,7 @@
 
   PARTITION_CONTEXT *above_seg_context;
   ENTROPY_CONTEXT *above_context;
+  int above_context_alloc_cols;
 } VP9_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -305,8 +308,13 @@
     if (frame_bufs[i].ref_count == 0)
       break;
 
-  assert(i < FRAME_BUFFERS);
-  frame_bufs[i].ref_count = 1;
+  if (i != FRAME_BUFFERS) {
+    frame_bufs[i].ref_count = 1;
+  } else {
+    // Reset i to INVALID_IDX to indicate that no free buffer was found.
+    i = INVALID_IDX;
+  }
+
   unlock_buffer_pool(cm->buffer_pool);
   return i;
 }
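
get_free_fb() now reports frame-buffer exhaustion by returning INVALID_IDX instead of tripping an assert (which disappears under NDEBUG anyway), shifting the failure to callers. A sketch of the call-site pattern this implies (error handling simplified):

static int acquire_new_frame(VP9_COMMON *cm) {
  const int idx = get_free_fb(cm);
  if (idx == INVALID_IDX)
    return -1;           /* propagate the failure instead of crashing */
  cm->new_fb_idx = idx;  /* idx now references a buffer with ref_count 1 */
  return 0;
}
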
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2a9736b..30710ba 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -797,51 +797,6 @@
 
 
 # variance
-add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
 specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
 
@@ -922,21 +877,6 @@
 add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
 specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
 
-add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
-
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2 neon/;
 
@@ -1141,142 +1081,6 @@
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
-  # variance
-  add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance8x4/;
-
-  add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_variance4x4/;
-
-  add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
-
-  add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance8x4/;
-
-  add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_variance4x4/;
-
-  add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
-
-  add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance8x4/;
-
-  add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_variance4x4/;
-
-  add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
-
-  add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
-
   add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
 
@@ -1511,41 +1315,6 @@
   add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
   specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
 
-  add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse8x16/;
-
-  add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse16x8/;
-
-  add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse8x16/;
-
-  add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse16x8/;
-
-  add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse8x16/;
-
-  add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse16x8/;
-
-  add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
 
   # ENCODEMB INVOKE
 
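Each add_proto/specialize pair above feeds the RTCD generator, which emits a C prototype per ISA variant plus a function pointer that init code aims at the best variant for the running CPU; the removals here drop the vp9_-prefixed variance and MSE entries from that table. A rough, self-contained sketch of the dispatch pattern, with an illustrative reference kernel standing in for the generated _c function:

    #include <stdint.h>
    #include <stdio.h>

    typedef unsigned int (*variance_fn)(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse);

    /* Portable reference kernel, standing in for the _c variant. */
    static unsigned int variance16x16_c(const uint8_t *src, int src_stride,
                                        const uint8_t *ref, int ref_stride,
                                        unsigned int *sse) {
      int64_t sum = 0;
      uint64_t sse64 = 0;
      int r, c;
      for (r = 0; r < 16; ++r)
        for (c = 0; c < 16; ++c) {
          const int d = src[r * src_stride + c] - ref[r * ref_stride + c];
          sum += d;
          sse64 += (uint64_t)(d * d);
        }
      *sse = (unsigned int)sse64;
      /* variance * 256 = sse - sum^2 / 256 */
      return (unsigned int)(sse64 - (uint64_t)((sum * sum) >> 8));
    }

    /* The pointer the RTCD machinery would retarget at init time, e.g. to
     * an sse2 or avx2 variant when the CPU supports it. */
    static variance_fn variance16x16 = variance16x16_c;

    int main(void) {
      uint8_t a[16 * 16] = {0}, b[16 * 16] = {0};
      unsigned int sse;
      b[0] = 4;  /* one differing pixel */
      printf("var=%u sse=%u\n", variance16x16(a, 16, b, 16, &sse), sse);
      return 0;
    }
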
diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index 65e2aa6..1d86b5c 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h
@@ -38,6 +38,18 @@
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
+static INLINE const scan_order *get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
+                                         PLANE_TYPE type, int block_idx) {
+  const MODE_INFO *const mi = xd->mi[0];
+
+  if (is_inter_block(&mi->mbmi) || type != PLANE_TYPE_Y || xd->lossless) {
+    return &vp9_default_scan_orders[tx_size];
+  } else {
+    const PREDICTION_MODE mode = get_y_mode(mi, block_idx);
+    return &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
+  }
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
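
The branch inside get_scan() encodes one rule: inter blocks, chroma planes, and lossless coding always use the default scan for the transform size, while intra luma selects a scan keyed by the prediction mode's transform type. A toy restatement of that rule with stand-in names (not libvpx types):

    #include <stdio.h>

    typedef enum { SCAN_DEFAULT, SCAN_ROW, SCAN_COL } ScanKind;  /* stand-ins */

    /* Toy restatement of get_scan()'s branch: only intra, luma, non-lossless
     * blocks get a mode-dependent scan order. */
    static ScanKind pick_scan(int is_inter, int is_luma_plane, int lossless,
                              ScanKind mode_dependent_scan) {
      if (is_inter || !is_luma_plane || lossless) return SCAN_DEFAULT;
      return mode_dependent_scan;
    }

    int main(void) {
      printf("%d\n", pick_scan(0, 1, 0, SCAN_ROW));  /* intra luma -> 1 (ROW) */
      printf("%d\n", pick_scan(1, 1, 0, SCAN_ROW));  /* inter      -> 0 (DEFAULT) */
      printf("%d\n", pick_scan(0, 0, 0, SCAN_COL));  /* chroma     -> 0 (DEFAULT) */
      return 0;
    }
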
diff --git a/vp9/common/vp9_systemdependent.h b/vp9/common/vp9_systemdependent.h
index e971158..fc77762 100644
--- a/vp9/common/vp9_systemdependent.h
+++ b/vp9/common/vp9_systemdependent.h
@@ -11,13 +11,14 @@
 #ifndef VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 #define VP9_COMMON_VP9_SYSTEMDEPENDENT_H_
 
+#include "vpx_ports/msvc.h"
+
 #ifdef _MSC_VER
 # include <math.h>  // the ceil() definition must precede intrin.h
 # if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
 #  include <intrin.h>
-#  define USE_MSC_INTRIN
+#  define USE_MSC_INTRINSICS
 # endif
-# define snprintf _snprintf
 #endif
 
 #ifdef __cplusplus
@@ -48,7 +49,7 @@
 static INLINE int get_msb(unsigned int n) {
   return 31 ^ __builtin_clz(n);
 }
-#elif defined(USE_MSC_INTRIN)
+#elif defined(USE_MSC_INTRINSICS)
 #pragma intrinsic(_BitScanReverse)
 
 static INLINE int get_msb(unsigned int n) {
@@ -56,7 +57,7 @@
   _BitScanReverse(&first_set_bit, n);
   return first_set_bit;
 }
-#undef USE_MSC_INTRIN
+#undef USE_MSC_INTRINSICS
 #else
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
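
Both intrinsic paths return the index of the most significant set bit, i.e. (int)floor(log2(n)) for n > 0, and the portable branch that the hunk cuts off must agree with them. For comparison only, a minimal portable sketch with the same contract (not the file's actual fallback):

    #include <assert.h>
    #include <stdio.h>

    /* Returns (int)floor(log2(n)); n must be > 0, matching the contract
     * of the __builtin_clz and _BitScanReverse paths above. */
    static int get_msb_portable(unsigned int n) {
      int log = 0;
      assert(n > 0);
      while (n >>= 1) ++log;
      return log;
    }

    int main(void) {
      printf("%d %d %d\n", get_msb_portable(1),       /* 0  */
                           get_msb_portable(64),      /* 6  */
                           get_msb_portable(65535));  /* 15 */
      return 0;
    }
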
diff --git a/vp9/common/x86/convolve.h b/vp9/common/x86/convolve.h
new file mode 100644
index 0000000..de2df47
--- /dev/null
+++ b/vp9/common/x86/convolve.h
@@ -0,0 +1,296 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_X86_CONVOLVE_H_
+#define VP9_COMMON_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+typedef void filter8_1dfunction (
+  const uint8_t *src_ptr,
+  ptrdiff_t src_pitch,
+  uint8_t *output_ptr,
+  ptrdiff_t out_pitch,
+  uint32_t output_height,
+  const int16_t *filter
+);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                    uint8_t *dst, ptrdiff_t dst_stride, \
+                                    const int16_t *filter_x, int x_step_q4, \
+                                    const int16_t *filter_y, int y_step_q4, \
+                                    int w, int h) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    if (filter[0] || filter[1] || filter[2]) { \
+      while (w >= 16) { \
+        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                 src_stride, \
+                                                 dst, \
+                                                 dst_stride, \
+                                                 h, \
+                                                 filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                 src_stride, \
+                                                 dst, \
+                                                 dst_stride, \
+                                                 h, \
+                                                 filter); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                src_stride, \
+                                                dst, \
+                                                dst_stride, \
+                                                h, \
+                                                filter); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+  } \
+  if (w) { \
+    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+                             filter_x, x_step_q4, filter_y, y_step_q4, \
+                             w, h); \
+  } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                              uint8_t *dst, ptrdiff_t dst_stride, \
+                              const int16_t *filter_x, int x_step_q4, \
+                              const int16_t *filter_y, int y_step_q4, \
+                              int w, int h) { \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+                                filter_x, x_step_q4, filter_y, y_step_q4, \
+                                w, h + 7); \
+      vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+                                      filter_x, x_step_q4, filter_y, \
+                                      y_step_q4, w, h); \
+    } else { \
+      DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+      vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+                                filter_x, x_step_q4, filter_y, y_step_q4, \
+                                w, h + 1); \
+      vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+                                      filter_x, x_step_q4, filter_y, \
+                                      y_step_q4, w, h); \
+    } \
+  } else { \
+    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+  } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction (
+  const uint16_t *src_ptr,
+  const ptrdiff_t src_pitch,
+  uint16_t *output_ptr,
+  ptrdiff_t out_pitch,
+  unsigned int output_height,
+  const int16_t *filter,
+  int bd
+);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+  void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+                                           ptrdiff_t src_stride, \
+                                           uint8_t *dst8, \
+                                           ptrdiff_t dst_stride, \
+                                           const int16_t *filter_x, \
+                                           int x_step_q4, \
+                                           const int16_t *filter_y, \
+                                           int y_step_q4, \
+                                           int w, int h, int bd) { \
+  if (step_q4 == 16 && filter[3] != 128) { \
+    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+    if (filter[0] || filter[1] || filter[2]) { \
+      while (w >= 16) { \
+        vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } else { \
+      while (w >= 16) { \
+        vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+                                                        src_stride, \
+                                                        dst, \
+                                                        dst_stride, \
+                                                        h, \
+                                                        filter, \
+                                                        bd); \
+        src += 16; \
+        dst += 16; \
+        w -= 16; \
+      } \
+      while (w >= 8) { \
+        vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 8; \
+        dst += 8; \
+        w -= 8; \
+      } \
+      while (w >= 4) { \
+        vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+                                                       src_stride, \
+                                                       dst, \
+                                                       dst_stride, \
+                                                       h, \
+                                                       filter, \
+                                                       bd); \
+        src += 4; \
+        dst += 4; \
+        w -= 4; \
+      } \
+    } \
+  } \
+  if (w) { \
+    vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+                                    filter_x, x_step_q4, filter_y, y_step_q4, \
+                                    w, h, bd); \
+  } \
+}
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+                                     uint8_t *dst, ptrdiff_t dst_stride, \
+                                     const int16_t *filter_x, int x_step_q4, \
+                                     const int16_t *filter_y, int y_step_q4, \
+                                     int w, int h, int bd) { \
+  assert(w <= 64); \
+  assert(h <= 64); \
+  if (x_step_q4 == 16 && y_step_q4 == 16) { \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+      vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 7, bd); \
+      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+                                             64, dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } else { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+      vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                       filter_x, x_step_q4, \
+                                       filter_y, y_step_q4, \
+                                       w, h + 1, bd); \
+      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+                                             dst, dst_stride, \
+                                             filter_x, x_step_q4, \
+                                             filter_y, y_step_q4, \
+                                             w, h, bd); \
+    } \
+  } else { \
+    vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+                                  filter_x, x_step_q4, filter_y, y_step_q4, w, \
+                                  h, bd); \
+  } \
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VP9_COMMON_X86_CONVOLVE_H_
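
The header defines only machinery: each x86 translation unit declares its assembly kernels and then stamps out the public convolve entry points. A sketch of one such unit, mirroring the ssse3 declarations and instantiations removed from vp9_asm_stubs.c below; it compiles against the headers but will not link without the corresponding assembly objects:

    #include "./vp9_rtcd.h"
    #include "vp9/common/x86/convolve.h"

    /* 8-tap and bilinear (2-tap) kernels implemented in assembly. */
    filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
    filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
    filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
    filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
    filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
    filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
    filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
    filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
    filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
    filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
    filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
    filter8_1dfunction vp9_filter_block1d4_v2_ssse3;

    /* Emits vp9_convolve8_horiz_ssse3() / vp9_convolve8_vert_ssse3(): each
     * tiles the frame in 16/8/4-wide columns and hands any remaining width
     * to the C version. */
    FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
    FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);

    /* Emits vp9_convolve8_ssse3(): a horizontal pass into a 64x71 scratch
     * buffer (h + 7 rows cover the 8-tap filter's vertical footprint),
     * then a vertical pass into dst. */
    FUN_CONV_2D(, ssse3);
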
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 963023c..fd55fb8 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,421 +8,9 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include <assert.h>
-
-#include "./vpx_config.h"
 #include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-
-typedef void filter8_1dfunction (
-  const unsigned char *src_ptr,
-  const ptrdiff_t src_pitch,
-  unsigned char *output_ptr,
-  ptrdiff_t out_pitch,
-  unsigned int output_height,
-  const short *filter
-);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
-                                   uint8_t *dst, ptrdiff_t dst_stride, \
-                                   const int16_t *filter_x, int x_step_q4, \
-                                   const int16_t *filter_y, int y_step_q4, \
-                                   int w, int h) { \
-  if (step_q4 == 16 && filter[3] != 128) { \
-    if (filter[0] || filter[1] || filter[2]) { \
-      while (w >= 16) { \
-        vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
-                                                 src_stride, \
-                                                 dst, \
-                                                 dst_stride, \
-                                                 h, \
-                                                 filter); \
-        src += 16; \
-        dst += 16; \
-        w -= 16; \
-      } \
-      while (w >= 8) { \
-        vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 8; \
-        dst += 8; \
-        w -= 8; \
-      } \
-      while (w >= 4) { \
-        vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 4; \
-        dst += 4; \
-        w -= 4; \
-      } \
-    } else { \
-      while (w >= 16) { \
-        vp9_filter_block1d16_##dir##2_##avg##opt(src, \
-                                                 src_stride, \
-                                                 dst, \
-                                                 dst_stride, \
-                                                 h, \
-                                                 filter); \
-        src += 16; \
-        dst += 16; \
-        w -= 16; \
-      } \
-      while (w >= 8) { \
-        vp9_filter_block1d8_##dir##2_##avg##opt(src, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 8; \
-        dst += 8; \
-        w -= 8; \
-      } \
-      while (w >= 4) { \
-        vp9_filter_block1d4_##dir##2_##avg##opt(src, \
-                                                src_stride, \
-                                                dst, \
-                                                dst_stride, \
-                                                h, \
-                                                filter); \
-        src += 4; \
-        dst += 4; \
-        w -= 4; \
-      } \
-    } \
-  } \
-  if (w) { \
-    vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
-                             filter_x, x_step_q4, filter_y, y_step_q4, \
-                             w, h); \
-  } \
-}
-
-#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
-                              uint8_t *dst, ptrdiff_t dst_stride, \
-                              const int16_t *filter_x, int x_step_q4, \
-                              const int16_t *filter_y, int y_step_q4, \
-                              int w, int h) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
-  if (x_step_q4 == 16 && y_step_q4 == 16) { \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
-        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
-      DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \
-      vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, \
-                                w, h + 7); \
-      vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
-                                      filter_x, x_step_q4, filter_y, \
-                                      y_step_q4, w, h); \
-    } else { \
-      DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \
-      vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
-                                filter_x, x_step_q4, filter_y, y_step_q4, \
-                                w, h + 1); \
-      vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
-                                      filter_x, x_step_q4, filter_y, \
-                                      y_step_q4, w, h); \
-    } \
-  } else { \
-    vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
-                           filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
-  } \
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-
-typedef void highbd_filter8_1dfunction (
-  const uint16_t *src_ptr,
-  const ptrdiff_t src_pitch,
-  uint16_t *output_ptr,
-  ptrdiff_t out_pitch,
-  unsigned int output_height,
-  const int16_t *filter,
-  int bd
-);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
-  void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
-                                           ptrdiff_t src_stride, \
-                                           uint8_t *dst8, \
-                                           ptrdiff_t dst_stride, \
-                                           const int16_t *filter_x, \
-                                           int x_step_q4, \
-                                           const int16_t *filter_y, \
-                                           int y_step_q4, \
-                                           int w, int h, int bd) { \
-  if (step_q4 == 16 && filter[3] != 128) { \
-    uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-    uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-    if (filter[0] || filter[1] || filter[2]) { \
-      while (w >= 16) { \
-        vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
-                                                        src_stride, \
-                                                        dst, \
-                                                        dst_stride, \
-                                                        h, \
-                                                        filter, \
-                                                        bd); \
-        src += 16; \
-        dst += 16; \
-        w -= 16; \
-      } \
-      while (w >= 8) { \
-        vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
-                                                       src_stride, \
-                                                       dst, \
-                                                       dst_stride, \
-                                                       h, \
-                                                       filter, \
-                                                       bd); \
-        src += 8; \
-        dst += 8; \
-        w -= 8; \
-      } \
-      while (w >= 4) { \
-        vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
-                                                       src_stride, \
-                                                       dst, \
-                                                       dst_stride, \
-                                                       h, \
-                                                       filter, \
-                                                       bd); \
-        src += 4; \
-        dst += 4; \
-        w -= 4; \
-      } \
-    } else { \
-      while (w >= 16) { \
-        vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
-                                                        src_stride, \
-                                                        dst, \
-                                                        dst_stride, \
-                                                        h, \
-                                                        filter, \
-                                                        bd); \
-        src += 16; \
-        dst += 16; \
-        w -= 16; \
-      } \
-      while (w >= 8) { \
-        vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
-                                                       src_stride, \
-                                                       dst, \
-                                                       dst_stride, \
-                                                       h, \
-                                                       filter, \
-                                                       bd); \
-        src += 8; \
-        dst += 8; \
-        w -= 8; \
-      } \
-      while (w >= 4) { \
-        vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
-                                                       src_stride, \
-                                                       dst, \
-                                                       dst_stride, \
-                                                       h, \
-                                                       filter, \
-                                                       bd); \
-        src += 4; \
-        dst += 4; \
-        w -= 4; \
-      } \
-    } \
-  } \
-  if (w) { \
-    vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
-                                    filter_x, x_step_q4, filter_y, y_step_q4, \
-                                    w, h, bd); \
-  } \
-}
-
-#define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
-                                     uint8_t *dst, ptrdiff_t dst_stride, \
-                                     const int16_t *filter_x, int x_step_q4, \
-                                     const int16_t *filter_y, int y_step_q4, \
-                                     int w, int h, int bd) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
-  if (x_step_q4 == 16 && y_step_q4 == 16) { \
-    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
-        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
-      vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                       filter_x, x_step_q4, \
-                                       filter_y, y_step_q4, \
-                                       w, h + 7, bd); \
-      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
-                                             64, dst, dst_stride, \
-                                             filter_x, x_step_q4, \
-                                             filter_y, y_step_q4, \
-                                             w, h, bd); \
-    } else { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
-      vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                       filter_x, x_step_q4, \
-                                       filter_y, y_step_q4, \
-                                       w, h + 1, bd); \
-      vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                             dst, dst_stride, \
-                                             filter_x, x_step_q4, \
-                                             filter_y, y_step_q4, \
-                                             w, h, bd); \
-    } \
-  } else { \
-    vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
-                                  filter_x, x_step_q4, filter_y, y_step_q4, w, \
-                                  h, bd); \
-  } \
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d16_v8_avx2;
-filter8_1dfunction vp9_filter_block1d16_h8_avx2;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else  // ARCH_X86
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif  // ARCH_X86_64 / ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-FUN_CONV_2D(, avx2);
-#endif  // HAVE_AX2 && HAVE_SSSE3
-#if HAVE_SSSE3
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else  // ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif  // ARCH_X86_64 / ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                uint8_t *dst, ptrdiff_t dst_stride,
-//                                const int16_t *filter_x, int x_step_q4,
-//                                const int16_t *filter_y, int y_step_q4,
-//                                int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                               uint8_t *dst, ptrdiff_t dst_stride,
-//                               const int16_t *filter_x, int x_step_q4,
-//                               const int16_t *filter_y, int y_step_q4,
-//                               int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                    uint8_t *dst, ptrdiff_t dst_stride,
-//                                    const int16_t *filter_x, int x_step_q4,
-//                                    const int16_t *filter_y, int y_step_q4,
-//                                    int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                                   uint8_t *dst, ptrdiff_t dst_stride,
-//                                   const int16_t *filter_x, int x_step_q4,
-//                                   const int16_t *filter_y, int y_step_q4,
-//                                   int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
-            ssse3);
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                          uint8_t *dst, ptrdiff_t dst_stride,
-//                          const int16_t *filter_x, int x_step_q4,
-//                          const int16_t *filter_y, int y_step_q4,
-//                          int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-//                              uint8_t *dst, ptrdiff_t dst_stride,
-//                              const int16_t *filter_x, int x_step_q4,
-//                              const int16_t *filter_y, int y_step_q4,
-//                              int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
-#endif  // HAVE_SSSE3
+#include "./vpx_config.h"
+#include "vp9/common/x86/convolve.h"
 
 #if HAVE_SSE2
 filter8_1dfunction vp9_filter_block1d16_v8_sse2;
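
The leading comma in invocations such as FUN_CONV_2D(, ssse3) works because an empty macro argument simply vanishes from the pasted name, letting one macro produce both the plain and the avg_ variants. A tiny standalone demonstration of that pasting pattern, with toy names:

    #include <stdio.h>

    /* Toy: paste an optional "avg_" prefix and an ISA suffix into a
     * function name, exactly as the FUN_CONV_* macros do. */
    #define DEFINE_GREETER(avg, opt)            \
      static void greet_##avg##opt(void) {      \
        printf("greet_" #avg #opt "\n");        \
      }

    DEFINE_GREETER(, sse2)      /* defines greet_sse2 */
    DEFINE_GREETER(avg_, sse2)  /* defines greet_avg_sse2 */

    int main(void) {
      greet_sse2();
      greet_avg_sse2();
      return 0;
    }
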
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index 3bc7d39..cee8d1e 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -8,7 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+// Due to a header conflict between math.h and the intrinsics headers over
+// ceil(), in certain configurations under vs9 this include must precede
+// immintrin.h.
+#include "./vp9_rtcd.h"
+
 #include <immintrin.h>
+
+#include "vp9/common/x86/convolve.h"
 #include "vpx_ports/mem.h"
 
 // filters for 16_h8 and 16_v8
@@ -53,23 +60,23 @@
 # define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
 #endif  // __clang__
 
-void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
-                                  unsigned int src_pixels_per_line,
-                                  unsigned char *output_ptr,
-                                  unsigned int  output_pitch,
-                                  unsigned int  output_height,
-                                  int16_t *filter) {
+static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i filtersReg;
   __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
   __m256i srcReg32b1, srcReg32b2, filtersReg32;
   unsigned int i;
-  unsigned int src_stride, dst_stride;
+  ptrdiff_t src_stride, dst_stride;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to 8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -104,9 +111,9 @@
   for (i = output_height; i > 1; i-=2) {
     // load the 2 strides of source
     srcReg32b1 = _mm256_castsi128_si256(
-                 _mm_loadu_si128((__m128i *)(src_ptr-3)));
+                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
     srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
-                 _mm_loadu_si128((__m128i *)
+                 _mm_loadu_si128((const __m128i *)
                  (src_ptr+src_pixels_per_line-3)), 1);
 
     // filter the source buffer
@@ -135,9 +142,9 @@
     // reading 2 strides of the next 16 bytes
     // (part of it was being read by earlier read)
     srcReg32b2 = _mm256_castsi128_si256(
-                 _mm_loadu_si128((__m128i *)(src_ptr+5)));
+                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
     srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
-                 _mm_loadu_si128((__m128i *)
+                 _mm_loadu_si128((const __m128i *)
                  (src_ptr+src_pixels_per_line+5)), 1);
 
     // add and saturate the results together
@@ -202,7 +209,7 @@
     __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
     __m128i srcRegFilt2, srcRegFilt3;
 
-    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     // filter the source buffer
     srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
@@ -237,7 +244,7 @@
 
     // reading the next 16 bytes
     // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
 
     // add and saturate the results together
     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -297,12 +304,12 @@
   }
 }
 
-void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
-                                  unsigned int src_pitch,
-                                  unsigned char *output_ptr,
-                                  unsigned int out_pitch,
-                                  unsigned int output_height,
-                                  int16_t *filter) {
+static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i filtersReg;
   __m256i addFilterReg64;
   __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
@@ -310,11 +317,11 @@
   __m256i srcReg32b11, srcReg32b12, filtersReg32;
   __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
   unsigned int i;
-  unsigned int src_stride, dst_stride;
+  ptrdiff_t src_stride, dst_stride;
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the
   // same data in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -344,19 +351,19 @@
 
   // load 16 bytes 7 times in stride of src_pitch
   srcReg32b1 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr)));
+               _mm_loadu_si128((const __m128i *)(src_ptr)));
   srcReg32b2 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
   srcReg32b3 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
   srcReg32b4 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
   srcReg32b5 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
   srcReg32b6 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
   srcReg32b7 = _mm256_castsi128_si256(
-               _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
+               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
 
   // have each consecutive loads on the same 256 register
   srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
@@ -393,11 +400,11 @@
      // load the last 2 loads of 16 bytes and have every two
      // consecutive loads in the same 256 bit register
      srcReg32b8 = _mm256_castsi128_si256(
-     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
      srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
      _mm256_castsi256_si128(srcReg32b8), 1);
      srcReg32b9 = _mm256_castsi128_si256(
-     _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
+     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
      srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
      _mm256_castsi256_si128(srcReg32b9), 1);
 
@@ -476,7 +483,7 @@
     __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
     __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
     // load the last 16 bytes
-    srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
 
     // merge the last 2 results together
     srcRegFilt4 = _mm_unpacklo_epi8(
@@ -542,3 +549,54 @@
     _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
   }
 }
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif  // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2  vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2  vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2  vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2  vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+FUN_CONV_2D(, avx2);
+#endif  // HAVE_AVX2 && HAVE_SSSE3
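
The #define block above gives every vp9_filter_block1d*_avx2 name a
definition even where no AVX2 kernel exists, by aliasing those names to the
SSSE3 kernels; the FUN_CONV_* macros can then expand against a complete set.
A minimal, self-contained sketch of that pattern (the kernel_* and
MAKE_DISPATCHER names are illustrative, not libvpx API):

#include <stdio.h>

static void kernel_16_baseline(void) { puts("16: baseline"); }
static void kernel_8_baseline(void)  { puts("8: baseline"); }
static void kernel_16_fast(void)     { puts("16: fast"); }
/* No fast 8-wide kernel exists; alias the name to the baseline kernel, as
 * vp9_filter_block1d4_v8_avx2 is aliased to its _ssse3 counterpart above. */
#define kernel_8_fast kernel_8_baseline

/* Expands to a dispatcher that needs the full set of <opt> kernel names. */
#define MAKE_DISPATCHER(opt)                                 \
  static void dispatch_##opt(int w) {                        \
    if (w == 16) kernel_16_##opt(); else kernel_8_##opt();   \
  }
MAKE_DISPATCHER(baseline)
MAKE_DISPATCHER(fast)

int main(void) {
  dispatch_baseline(16);  /* 16: baseline */
  dispatch_fast(16);      /* 16: fast */
  dispatch_fast(8);       /* 8: baseline, through the #define alias */
  return 0;
}
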
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index 4ab49e7..5fd2857 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -8,9 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+// Due to a header conflict between math.h and the intrinsics headers over
+// ceil() in certain configurations under vs9, this include needs to precede
+// tmmintrin.h.
+#include "./vp9_rtcd.h"
+
 #include <tmmintrin.h>
 
-#include "./vp9_rtcd.h"
+#include "vp9/common/x86/convolve.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/emmintrin_compat.h"
 
@@ -40,12 +45,17 @@
   6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
 };
 
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+
+void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i firstFilters, secondFilters, shuffle1, shuffle2;
   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
   __m128i addFilterReg64, filtersReg, srcReg, minReg;
@@ -53,7 +63,7 @@
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -74,7 +84,7 @@
   shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
 
   for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     // filter the source buffer
     srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
@@ -111,12 +121,12 @@
   }
 }
 
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pixels_per_line,
-                                         unsigned char *output_ptr,
-                                         unsigned int output_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
+void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pixels_per_line,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t output_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
   __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
@@ -125,7 +135,7 @@
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -149,7 +159,7 @@
   filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
 
   for (i = 0; i < output_height; i++) {
-    srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     // filter the source buffer
     srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
@@ -191,12 +201,12 @@
   }
 }
 
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pixels_per_line,
-                                          unsigned char *output_ptr,
-                                          unsigned int output_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
+static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+                                                 ptrdiff_t src_pixels_per_line,
+                                                 uint8_t *output_ptr,
+                                                 ptrdiff_t output_pitch,
+                                                 uint32_t output_height,
+                                                 const int16_t *filter) {
   __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
   __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
@@ -205,7 +215,7 @@
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -229,7 +239,7 @@
   filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
 
   for (i = 0; i < output_height; i++) {
-    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
 
     // filter the source buffer
     srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
@@ -256,7 +266,7 @@
 
     // reading the next 16 bytes.
     // (part of it was being read by earlier read)
-    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
 
     // add and saturate the results together
     srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -308,12 +318,12 @@
   }
 }
 
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
-                                         unsigned int src_pitch,
-                                         unsigned char *output_ptr,
-                                         unsigned int out_pitch,
-                                         unsigned int output_height,
-                                         int16_t *filter) {
+void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                         ptrdiff_t src_pitch,
+                                         uint8_t *output_ptr,
+                                         ptrdiff_t out_pitch,
+                                         uint32_t output_height,
+                                         const int16_t *filter) {
   __m128i addFilterReg64, filtersReg, minReg;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
   __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
@@ -323,7 +333,7 @@
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -338,17 +348,17 @@
   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 
   // load the first 7 rows of 8 bytes
-  srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
-  srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
-  srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
-  srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
-  srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
-  srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
-  srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
 
   for (i = 0; i < output_height; i++) {
     // load the last 8 bytes
-    srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
+    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
 
     // merge the result together
     srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
@@ -396,12 +406,12 @@
   }
 }
 
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
-                                          unsigned int src_pitch,
-                                          unsigned char *output_ptr,
-                                          unsigned int out_pitch,
-                                          unsigned int output_height,
-                                          int16_t *filter) {
+static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+                                                 ptrdiff_t src_pitch,
+                                                 uint8_t *output_ptr,
+                                                 ptrdiff_t out_pitch,
+                                                 uint32_t output_height,
+                                                 const int16_t *filter) {
   __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
   __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
   __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
@@ -411,7 +421,7 @@
 
   // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
   addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
-  filtersReg = _mm_loadu_si128((__m128i *)filter);
+  filtersReg = _mm_loadu_si128((const __m128i *)filter);
   // converting the 16 bit (short) to  8 bit (byte) and have the same data
   // in both lanes of 128 bit register.
   filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -426,17 +436,17 @@
   forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
 
   // load the first 7 rows of 16 bytes
-  srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
-  srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
-  srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
-  srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
-  srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
-  srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
-  srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+  srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+  srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+  srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
 
   for (i = 0; i < output_height; i++) {
     // load the last 16 bytes
-    srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
+    srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
 
     // merge the result together
     srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
@@ -510,3 +520,82 @@
     output_ptr+=out_pitch;
   }
 }
+
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else  // ARCH_X86
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif  // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
+
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                uint8_t *dst, ptrdiff_t dst_stride,
+//                                const int16_t *filter_x, int x_step_q4,
+//                                const int16_t *filter_y, int y_step_q4,
+//                                int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                               uint8_t *dst, ptrdiff_t dst_stride,
+//                               const int16_t *filter_x, int x_step_q4,
+//                               const int16_t *filter_y, int y_step_q4,
+//                               int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                    uint8_t *dst, ptrdiff_t dst_stride,
+//                                    const int16_t *filter_x, int x_step_q4,
+//                                    const int16_t *filter_y, int y_step_q4,
+//                                    int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                                   uint8_t *dst, ptrdiff_t dst_stride,
+//                                   const int16_t *filter_x, int x_step_q4,
+//                                   const int16_t *filter_y, int y_step_q4,
+//                                   int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+            ssse3);
+
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                          uint8_t *dst, ptrdiff_t dst_stride,
+//                          const int16_t *filter_x, int x_step_q4,
+//                          const int16_t *filter_y, int y_step_q4,
+//                          int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+//                              uint8_t *dst, ptrdiff_t dst_stride,
+//                              const int16_t *filter_x, int x_step_q4,
+//                              const int16_t *filter_y, int y_step_q4,
+//                              int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_, ssse3);
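
FUN_CONV_2D presumably composes the two 1-D passes declared above; convolve.h
itself is not part of this diff, so the sketch below only illustrates the
assumed structure: the horizontal pass filters h + 7 rows (the 8-tap support),
starting 3 rows above the block, into an intermediate buffer, and the vertical
pass is handed row 3 of that buffer because, like FUN_CONV_1D(vert, ...), it
reads from 3 rows before its src pointer. All names here are illustrative.

#include <stddef.h>
#include <stdint.h>

typedef void (*conv_1d_fn)(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter, int w, int h);

static void convolve8_2d_sketch(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x,
                                const int16_t *filter_y, int w, int h,
                                conv_1d_fn horiz, conv_1d_fn vert) {
  uint8_t temp[64 * 71];  /* max 64x64 block plus 7 rows of filter support */
  horiz(src - 3 * src_stride, src_stride, temp, 64, filter_x, w, h + 7);
  vert(temp + 3 * 64, 64, dst, dst_stride, filter_y, w, h);
}
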
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index fcf480b..0e9b1c5 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -699,7 +699,8 @@
 #if CONFIG_SIZE_LIMIT
   if (width > DECODE_WIDTH_LIMIT || height > DECODE_HEIGHT_LIMIT)
     vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                       "Width and height beyond allowed size.");
+                       "Dimensions of %dx%d beyond allowed size of %dx%d.",
+                       width, height, DECODE_WIDTH_LIMIT, DECODE_HEIGHT_LIMIT);
 #endif
   if (cm->width != width || cm->height != height) {
     const int new_mi_rows =
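
The rewritten message above is printf-style. A small standalone check of the
formatting (plain snprintf stands in for vpx_internal_error(), which
presumably routes its varargs through the same vsnprintf-style machinery):

#include <stdio.h>

int main(void) {
  char msg[128];
  snprintf(msg, sizeof(msg),
           "Dimensions of %dx%d beyond allowed size of %dx%d.",
           8192, 4608, 4096, 2304);  /* example values, not real limits */
  puts(msg);  /* Dimensions of 8192x4608 beyond allowed size of 4096x2304. */
  return 0;
}
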
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 288d869..7991a39 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -50,7 +50,6 @@
 
 static void vp9_dec_setup_mi(VP9_COMMON *cm) {
   cm->mi = cm->mip + cm->mi_stride + 1;
-  memset(cm->mip, 0, cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mip));
   cm->mi_grid_visible = cm->mi_grid_base + cm->mi_stride + 1;
   memset(cm->mi_grid_base, 0,
          cm->mi_stride * (cm->mi_rows + 1) * sizeof(*cm->mi_grid_base));
@@ -212,6 +211,9 @@
 
     // Find an empty frame buffer.
     const int free_fb = get_free_fb(cm);
+    if (free_fb == INVALID_IDX)
+      return VPX_CODEC_MEM_ERROR;
+
     // Decrease ref_count since it will be increased again in
     // ref_cnt_fb() below.
     --frame_bufs[free_fb].ref_count;
@@ -299,7 +301,10 @@
       && frame_bufs[cm->new_fb_idx].ref_count == 0)
     pool->release_fb_cb(pool->cb_priv,
                         &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+  // Find a free frame buffer. Return an error if none can be found.
   cm->new_fb_idx = get_free_fb(cm);
+  if (cm->new_fb_idx == INVALID_IDX)
+    return VPX_CODEC_MEM_ERROR;
 
   // Assign a MV array to the frame buffer.
   cm->cur_frame = &pool->frame_bufs[cm->new_fb_idx];
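
Both checks added above guard the same failure mode: get_free_fb() returns
INVALID_IDX when every slot in the frame-buffer pool is still referenced. A
minimal sketch of that sentinel pattern (the pool below is hypothetical, and
INVALID_IDX is assumed to be -1 as elsewhere in vp9):

#include <stdio.h>

#define POOL_SIZE 4
#define INVALID_IDX (-1)

static int ref_count[POOL_SIZE];

static int get_free_fb_sketch(void) {
  int i;
  for (i = 0; i < POOL_SIZE; ++i)
    if (ref_count[i] == 0)
      return i;                     /* first unreferenced buffer */
  return INVALID_IDX;               /* pool exhausted: caller must bail out */
}

int main(void) {
  int i;
  for (i = 0; i < POOL_SIZE; ++i)
    ref_count[i] = 1;               /* exhaust the pool */
  if (get_free_fb_sketch() == INVALID_IDX)
    puts("VPX_CODEC_MEM_ERROR");    /* mirrors the checks added above */
  return 0;
}
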
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index bb8c66f..6326984 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -17,6 +17,7 @@
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #include "vp9/common/vp9_idct.h"
 #endif
+#include "vp9/common/vp9_scan.h"
 
 #include "vp9/decoder/vp9_detokenize.h"
 
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
index cf82dd7..166156a 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -10,6 +10,7 @@
 
 #include <arm_neon.h>
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx_ports/mem.h"
@@ -20,82 +21,6 @@
 
 #include "vp9/encoder/vp9_variance.h"
 
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-// w * h must be less than 2048 or local variable v_sum may overflow.
-static void variance_neon_w8(const uint8_t *a, int a_stride,
-                             const uint8_t *b, int b_stride,
-                             int w, int h, uint32_t *sse, int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo = vmlal_s16(v_sse_lo,
-                           vget_low_s16(sv_diff),
-                           vget_low_s16(sv_diff));
-      v_sse_hi = vmlal_s16(v_sse_hi,
-                           vget_high_s16(sv_diff),
-                           vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-void vp9_get8x8var_neon(const uint8_t *src_ptr, int source_stride,
-                        const uint8_t *ref_ptr, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 8,
-                   8, sse, sum);
-}
-
-unsigned int vp9_variance8x8_neon(const uint8_t *a, int a_stride,
-                                  const uint8_t *b, int b_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 6);  //  >> 6 = / 8 * 8
-}
-
-void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 16,
-                   16, sse, sum);
-}
-
-unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 8);  //  >> 8 = / 16 * 16
-}
-
 static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
                                       uint8_t *output_ptr,
                                       unsigned int src_pixels_per_line,
@@ -162,7 +87,7 @@
                             BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
                             8, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
+  return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
 }
 
 unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
@@ -180,77 +105,7 @@
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16,
                              16, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
-}
-
-void vp9_get32x32var_neon(const uint8_t *src_ptr, int source_stride,
-                          const uint8_t *ref_ptr, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, 32,
-                   32, sse, sum);
-}
-
-unsigned int vp9_variance32x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
-  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / 32 * 32
-}
-
-unsigned int vp9_variance32x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
-  variance_neon_w8(a + (32 * a_stride), a_stride,
-                   b + (32 * b_stride), b_stride, 32, 32,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x32_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / 32 * 64
-}
-
-unsigned int vp9_variance64x64_neon(const uint8_t *a, int a_stride,
-                                    const uint8_t *b, int b_stride,
-                                    unsigned int *sse) {
-  int sum1, sum2;
-  uint32_t sse1, sse2;
-
-  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
-  variance_neon_w8(a + (16 * a_stride), a_stride,
-                   b + (16 * b_stride), b_stride, 64, 16,
-                   &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
-                   b + (16 * 2 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  sse1 += sse2;
-  sum1 += sum2;
-
-  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
-                   b + (16 * 3 * b_stride), b_stride,
-                   64, 16, &sse2, &sum2);
-  *sse = sse1 + sse2;
-  sum1 += sum2;
-  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / 64 * 64
+  return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
 }
 
 unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
@@ -268,7 +123,7 @@
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32,
                              32, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
+  return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
 }
 
 unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
@@ -286,5 +141,5 @@
                              BILINEAR_FILTERS_2TAP(xoffset));
   var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64,
                              64, BILINEAR_FILTERS_2TAP(yoffset));
-  return vp9_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
+  return vpx_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
 }
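
The deleted helpers and their vpx_dsp replacements compute variance from the
running sums as variance = sse - sum^2 / (w * h); since w * h is a power of
two, the division is the shift noted in the deleted comments (>> 6 for 8x8,
>> 8 for 16x16, >> 12 for 64x64). A minimal check of the identity:

#include <stdint.h>
#include <stdio.h>

static uint32_t variance_from_sums(uint32_t sse, int sum, int shift) {
  return sse - (uint32_t)(((int64_t)sum * sum) >> shift);
}

int main(void) {
  /* An 8x8 block where every pixel diff is 2: sum = 128, sse = 256, so the
   * variance is 256 - 128*128/64 = 0, as expected for a constant offset. */
  printf("%u\n", variance_from_sums(256, 128, 6));
  return 0;
}
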
diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c
index 55c9649..9e5d9ee 100644
--- a/vp9/encoder/vp9_aq_variance.c
+++ b/vp9/encoder/vp9_aq_variance.c
@@ -98,9 +98,9 @@
     int avg;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      highbd_variance(x->plane[0].src.buf, x->plane[0].src.stride,
-                      CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
-                      &sse, &avg);
+      highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
+                        CONVERT_TO_BYTEPTR(vp9_highbd_64_zeros), 0, bw, bh,
+                        &sse, &avg);
       sse >>= 2 * (xd->bd - 8);
       avg >>= (xd->bd - 8);
     } else {
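
The two shifts above normalize high-bit-depth statistics back to 8-bit units:
a pixel difference at bit depth bd is 2^(bd-8) times its 8-bit equivalent, so
avg (a sum of differences) scales by 2^(bd-8) while sse (a sum of squared
differences) scales by 2^(2*(bd-8)). For bd = 10, for example, avg is divided
by 4 and sse by 16.
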
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3d310f9..a8adca9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -13,6 +13,7 @@
 #include <stdio.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx_ports/mem.h"
@@ -463,46 +464,55 @@
   return 0;
 }
 
-void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
+// Set the variance split thresholds for the following block sizes:
+// 0 - threshold_64x64, 1 - threshold_32x32, 2 - threshold_16x16,
+// 3 - vbp_threshold_8x8. vbp_threshold_8x8 (to split to a 4x4 partition) is
+// currently only used on key frames.
+static void set_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
+  const int threshold_multiplier = is_key_frame ? 20 : 1;
+  const int64_t threshold_base = (int64_t)(threshold_multiplier *
+      cpi->y_dequant[q][1]);
+  if (is_key_frame) {
+    thresholds[0] = threshold_base;
+    thresholds[1] = threshold_base >> 2;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base << 2;
+  } else {
+    thresholds[1] = threshold_base;
+    if (cm->width <= 352 && cm->height <= 288) {
+      thresholds[0] = threshold_base >> 2;
+      thresholds[2] = threshold_base << 3;
+    } else {
+      thresholds[0] = threshold_base;
+      thresholds[1] = (5 * threshold_base) >> 2;
+      if (cm->width >= 1920 && cm->height >= 1080)
+        thresholds[1] = (7 * threshold_base) >> 2;
+      thresholds[2] = threshold_base << cpi->oxcf.speed;
+    }
+  }
+}
+
+void vp9_set_variance_partition_thresholds(VP9_COMP *cpi, int q) {
+  VP9_COMMON *const cm = &cpi->common;
   SPEED_FEATURES *const sf = &cpi->sf;
+  const int is_key_frame = (cm->frame_type == KEY_FRAME);
   if (sf->partition_search_type != VAR_BASED_PARTITION &&
       sf->partition_search_type != REFERENCE_PARTITION) {
     return;
   } else {
-    VP9_COMMON *const cm = &cpi->common;
-    const int is_key_frame = (cm->frame_type == KEY_FRAME);
-    const int threshold_multiplier = is_key_frame ? 20 : 1;
-    const int64_t threshold_base = (int64_t)(threshold_multiplier *
-        cpi->y_dequant[q][1]);
-
-    // TODO(marpan): Allow 4x4 partitions for inter-frames.
-    // use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
-    // If 4x4 partition is not used, then 8x8 partition will be selected
-    // if variance of 16x16 block is very high, so use larger threshold
-    // for 16x16 (threshold_bsize_min) in that case.
-
-    // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
-    // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+    set_vbp_thresholds(cpi, cpi->vbp_thresholds, q);
+    // The thresholds below are not changed locally.
     if (is_key_frame) {
-      cpi->vbp_thresholds[0] = threshold_base;
-      cpi->vbp_thresholds[1] = threshold_base >> 2;
-      cpi->vbp_thresholds[2] = threshold_base >> 2;
-      cpi->vbp_thresholds[3] = threshold_base << 2;
       cpi->vbp_threshold_sad = 0;
       cpi->vbp_bsize_min = BLOCK_8X8;
     } else {
-      cpi->vbp_thresholds[1] = threshold_base;
-      if (cm->width <= 352 && cm->height <= 288) {
-        cpi->vbp_thresholds[0] = threshold_base >> 2;
-        cpi->vbp_thresholds[2] = threshold_base << 3;
+      if (cm->width <= 352 && cm->height <= 288)
         cpi->vbp_threshold_sad = 100;
-      } else {
-        cpi->vbp_thresholds[0] = threshold_base;
-        cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2;
-        cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed;
+      else
         cpi->vbp_threshold_sad = (cpi->y_dequant[q][1] << 1) > 1000 ?
             (cpi->y_dequant[q][1] << 1) : 1000;
-      }
       cpi->vbp_bsize_min = BLOCK_16X16;
     }
     cpi->vbp_threshold_minmax = 15 + (q >> 3);
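
A worked example of the key-frame branch of set_vbp_thresholds() above, using
an assumed dequant value of 100 rather than one read from a real q index:

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const int64_t dequant = 100;        /* assumed cpi->y_dequant[q][1] */
  const int64_t base = 20 * dequant;  /* key-frame threshold_multiplier */
  int64_t t[4];
  t[0] = base;                        /* 64x64 threshold: 2000 */
  t[1] = base >> 2;                   /* 32x32 threshold: 500 */
  t[2] = base >> 2;                   /* 16x16 threshold: 500 */
  t[3] = base << 2;                   /* 8x8 -> 4x4 (key frames only): 8000 */
  printf("%lld %lld %lld %lld\n", (long long)t[0], (long long)t[1],
         (long long)t[2], (long long)t[3]);
  return 0;
}
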
@@ -551,23 +561,6 @@
   return (minmax_max - minmax_min);
 }
 
-static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
-  VP9_COMMON *const cm = &cpi->common;
-  const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]);
-
-  // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
-  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
-  thresholds[1] = threshold_base;
-  if (cm->width <= 352 && cm->height <= 288) {
-    thresholds[0] = threshold_base >> 2;
-    thresholds[2] = threshold_base << 3;
-  } else {
-    thresholds[0] = threshold_base;
-    thresholds[1] = (5 * threshold_base) >> 2;
-    thresholds[2] = threshold_base << cpi->oxcf.speed;
-  }
-}
-
 static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
                                  int dp, int x8_idx, int y8_idx, v8x8 *vst,
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -680,7 +673,7 @@
 
     if (cyclic_refresh_segment_id_boosted(segment_id)) {
       int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
-      modify_vbp_thresholds(cpi, thresholds, q);
+      set_vbp_thresholds(cpi, thresholds, q);
     }
   }
 
@@ -724,7 +717,7 @@
     mbmi->mv[0].as_int = 0;
     mbmi->interp_filter = BILINEAR;
 
-    y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+    y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
     if (y_sad_g < y_sad) {
       vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -3672,15 +3665,15 @@
       if (cm->use_highbitdepth) {
         switch (cm->bit_depth) {
           case VPX_BITS_8:
-            vp9_highbd_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_8_get16x16var(src, src_stride, last_src, last_stride,
                                    &var16->sse, &var16->sum);
             break;
           case VPX_BITS_10:
-            vp9_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_10_get16x16var(src, src_stride, last_src, last_stride,
                                     &var16->sse, &var16->sum);
             break;
           case VPX_BITS_12:
-            vp9_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
+            vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
                                       &var16->sse, &var16->sum);
             break;
           default:
@@ -3689,11 +3682,11 @@
             return -1;
         }
       } else {
-        vp9_get16x16var(src, src_stride, last_src, last_stride,
+        vpx_get16x16var(src, src_stride, last_src, last_stride,
                         &var16->sse, &var16->sum);
       }
 #else
-      vp9_get16x16var(src, src_stride, last_src, last_stride,
+      vpx_get16x16var(src, src_stride, last_src, last_stride,
                       &var16->sse, &var16->sum);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       var16->var = var16->sse -
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 1acde02..6aaa564 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -40,7 +40,7 @@
 void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td,
                      int tile_row, int tile_col);
 
-void vp9_set_vbp_thresholds(struct VP9_COMP *cpi, int q);
+void vp9_set_variance_partition_thresholds(struct VP9_COMP *cpi, int q);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index b115e0e..2829365 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -18,6 +18,7 @@
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_systemdependent.h"
 
 #include "vp9/encoder/vp9_encodemb.h"
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index df70949..2fdf408 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -1000,7 +1000,7 @@
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits8,
                    vpx_highbd_sad32x16_avg_bits8,
-                   vp9_highbd_variance32x16,
+                   vpx_highbd_8_variance32x16,
                    vp9_highbd_sub_pixel_variance32x16,
                    vp9_highbd_sub_pixel_avg_variance32x16,
                    NULL,
@@ -1010,7 +1010,7 @@
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits8,
                    vpx_highbd_sad16x32_avg_bits8,
-                   vp9_highbd_variance16x32,
+                   vpx_highbd_8_variance16x32,
                    vp9_highbd_sub_pixel_variance16x32,
                    vp9_highbd_sub_pixel_avg_variance16x32,
                    NULL,
@@ -1020,7 +1020,7 @@
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits8,
                    vpx_highbd_sad64x32_avg_bits8,
-                   vp9_highbd_variance64x32,
+                   vpx_highbd_8_variance64x32,
                    vp9_highbd_sub_pixel_variance64x32,
                    vp9_highbd_sub_pixel_avg_variance64x32,
                    NULL,
@@ -1030,7 +1030,7 @@
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits8,
                    vpx_highbd_sad32x64_avg_bits8,
-                   vp9_highbd_variance32x64,
+                   vpx_highbd_8_variance32x64,
                    vp9_highbd_sub_pixel_variance32x64,
                    vp9_highbd_sub_pixel_avg_variance32x64,
                    NULL,
@@ -1040,7 +1040,7 @@
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits8,
                    vpx_highbd_sad32x32_avg_bits8,
-                   vp9_highbd_variance32x32,
+                   vpx_highbd_8_variance32x32,
                    vp9_highbd_sub_pixel_variance32x32,
                    vp9_highbd_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits8,
@@ -1050,7 +1050,7 @@
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits8,
                    vpx_highbd_sad64x64_avg_bits8,
-                   vp9_highbd_variance64x64,
+                   vpx_highbd_8_variance64x64,
                    vp9_highbd_sub_pixel_variance64x64,
                    vp9_highbd_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits8,
@@ -1060,7 +1060,7 @@
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits8,
                    vpx_highbd_sad16x16_avg_bits8,
-                   vp9_highbd_variance16x16,
+                   vpx_highbd_8_variance16x16,
                    vp9_highbd_sub_pixel_variance16x16,
                    vp9_highbd_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits8,
@@ -1070,7 +1070,7 @@
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits8,
                    vpx_highbd_sad16x8_avg_bits8,
-                   vp9_highbd_variance16x8,
+                   vpx_highbd_8_variance16x8,
                    vp9_highbd_sub_pixel_variance16x8,
                    vp9_highbd_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits8,
@@ -1080,7 +1080,7 @@
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits8,
                    vpx_highbd_sad8x16_avg_bits8,
-                   vp9_highbd_variance8x16,
+                   vpx_highbd_8_variance8x16,
                    vp9_highbd_sub_pixel_variance8x16,
                    vp9_highbd_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits8,
@@ -1090,7 +1090,7 @@
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits8,
                    vpx_highbd_sad8x8_avg_bits8,
-                   vp9_highbd_variance8x8,
+                   vpx_highbd_8_variance8x8,
                    vp9_highbd_sub_pixel_variance8x8,
                    vp9_highbd_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits8,
@@ -1100,7 +1100,7 @@
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits8,
                    vpx_highbd_sad8x4_avg_bits8,
-                   vp9_highbd_variance8x4,
+                   vpx_highbd_8_variance8x4,
                    vp9_highbd_sub_pixel_variance8x4,
                    vp9_highbd_sub_pixel_avg_variance8x4,
                    NULL,
@@ -1110,7 +1110,7 @@
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits8,
                    vpx_highbd_sad4x8_avg_bits8,
-                   vp9_highbd_variance4x8,
+                   vpx_highbd_8_variance4x8,
                    vp9_highbd_sub_pixel_variance4x8,
                    vp9_highbd_sub_pixel_avg_variance4x8,
                    NULL,
@@ -1120,7 +1120,7 @@
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits8,
                    vpx_highbd_sad4x4_avg_bits8,
-                   vp9_highbd_variance4x4,
+                   vpx_highbd_8_variance4x4,
                    vp9_highbd_sub_pixel_variance4x4,
                    vp9_highbd_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits8,
@@ -1132,7 +1132,7 @@
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits10,
                    vpx_highbd_sad32x16_avg_bits10,
-                   vp9_highbd_10_variance32x16,
+                   vpx_highbd_10_variance32x16,
                    vp9_highbd_10_sub_pixel_variance32x16,
                    vp9_highbd_10_sub_pixel_avg_variance32x16,
                    NULL,
@@ -1142,7 +1142,7 @@
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits10,
                    vpx_highbd_sad16x32_avg_bits10,
-                   vp9_highbd_10_variance16x32,
+                   vpx_highbd_10_variance16x32,
                    vp9_highbd_10_sub_pixel_variance16x32,
                    vp9_highbd_10_sub_pixel_avg_variance16x32,
                    NULL,
@@ -1152,7 +1152,7 @@
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits10,
                    vpx_highbd_sad64x32_avg_bits10,
-                   vp9_highbd_10_variance64x32,
+                   vpx_highbd_10_variance64x32,
                    vp9_highbd_10_sub_pixel_variance64x32,
                    vp9_highbd_10_sub_pixel_avg_variance64x32,
                    NULL,
@@ -1162,7 +1162,7 @@
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits10,
                    vpx_highbd_sad32x64_avg_bits10,
-                   vp9_highbd_10_variance32x64,
+                   vpx_highbd_10_variance32x64,
                    vp9_highbd_10_sub_pixel_variance32x64,
                    vp9_highbd_10_sub_pixel_avg_variance32x64,
                    NULL,
@@ -1172,7 +1172,7 @@
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits10,
                    vpx_highbd_sad32x32_avg_bits10,
-                   vp9_highbd_10_variance32x32,
+                   vpx_highbd_10_variance32x32,
                    vp9_highbd_10_sub_pixel_variance32x32,
                    vp9_highbd_10_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits10,
@@ -1182,7 +1182,7 @@
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits10,
                    vpx_highbd_sad64x64_avg_bits10,
-                   vp9_highbd_10_variance64x64,
+                   vpx_highbd_10_variance64x64,
                    vp9_highbd_10_sub_pixel_variance64x64,
                    vp9_highbd_10_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits10,
@@ -1192,7 +1192,7 @@
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits10,
                    vpx_highbd_sad16x16_avg_bits10,
-                   vp9_highbd_10_variance16x16,
+                   vpx_highbd_10_variance16x16,
                    vp9_highbd_10_sub_pixel_variance16x16,
                    vp9_highbd_10_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits10,
@@ -1202,7 +1202,7 @@
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits10,
                    vpx_highbd_sad16x8_avg_bits10,
-                   vp9_highbd_10_variance16x8,
+                   vpx_highbd_10_variance16x8,
                    vp9_highbd_10_sub_pixel_variance16x8,
                    vp9_highbd_10_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits10,
@@ -1212,7 +1212,7 @@
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits10,
                    vpx_highbd_sad8x16_avg_bits10,
-                   vp9_highbd_10_variance8x16,
+                   vpx_highbd_10_variance8x16,
                    vp9_highbd_10_sub_pixel_variance8x16,
                    vp9_highbd_10_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits10,
@@ -1222,7 +1222,7 @@
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits10,
                    vpx_highbd_sad8x8_avg_bits10,
-                   vp9_highbd_10_variance8x8,
+                   vpx_highbd_10_variance8x8,
                    vp9_highbd_10_sub_pixel_variance8x8,
                    vp9_highbd_10_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits10,
@@ -1232,7 +1232,7 @@
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits10,
                    vpx_highbd_sad8x4_avg_bits10,
-                   vp9_highbd_10_variance8x4,
+                   vpx_highbd_10_variance8x4,
                    vp9_highbd_10_sub_pixel_variance8x4,
                    vp9_highbd_10_sub_pixel_avg_variance8x4,
                    NULL,
@@ -1242,7 +1242,7 @@
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits10,
                    vpx_highbd_sad4x8_avg_bits10,
-                   vp9_highbd_10_variance4x8,
+                   vpx_highbd_10_variance4x8,
                    vp9_highbd_10_sub_pixel_variance4x8,
                    vp9_highbd_10_sub_pixel_avg_variance4x8,
                    NULL,
@@ -1252,7 +1252,7 @@
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits10,
                    vpx_highbd_sad4x4_avg_bits10,
-                   vp9_highbd_10_variance4x4,
+                   vpx_highbd_10_variance4x4,
                    vp9_highbd_10_sub_pixel_variance4x4,
                    vp9_highbd_10_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits10,
@@ -1264,7 +1264,7 @@
         HIGHBD_BFP(BLOCK_32X16,
                    vpx_highbd_sad32x16_bits12,
                    vpx_highbd_sad32x16_avg_bits12,
-                   vp9_highbd_12_variance32x16,
+                   vpx_highbd_12_variance32x16,
                    vp9_highbd_12_sub_pixel_variance32x16,
                    vp9_highbd_12_sub_pixel_avg_variance32x16,
                    NULL,
@@ -1274,7 +1274,7 @@
         HIGHBD_BFP(BLOCK_16X32,
                    vpx_highbd_sad16x32_bits12,
                    vpx_highbd_sad16x32_avg_bits12,
-                   vp9_highbd_12_variance16x32,
+                   vpx_highbd_12_variance16x32,
                    vp9_highbd_12_sub_pixel_variance16x32,
                    vp9_highbd_12_sub_pixel_avg_variance16x32,
                    NULL,
@@ -1284,7 +1284,7 @@
         HIGHBD_BFP(BLOCK_64X32,
                    vpx_highbd_sad64x32_bits12,
                    vpx_highbd_sad64x32_avg_bits12,
-                   vp9_highbd_12_variance64x32,
+                   vpx_highbd_12_variance64x32,
                    vp9_highbd_12_sub_pixel_variance64x32,
                    vp9_highbd_12_sub_pixel_avg_variance64x32,
                    NULL,
@@ -1294,7 +1294,7 @@
         HIGHBD_BFP(BLOCK_32X64,
                    vpx_highbd_sad32x64_bits12,
                    vpx_highbd_sad32x64_avg_bits12,
-                   vp9_highbd_12_variance32x64,
+                   vpx_highbd_12_variance32x64,
                    vp9_highbd_12_sub_pixel_variance32x64,
                    vp9_highbd_12_sub_pixel_avg_variance32x64,
                    NULL,
@@ -1304,7 +1304,7 @@
         HIGHBD_BFP(BLOCK_32X32,
                    vpx_highbd_sad32x32_bits12,
                    vpx_highbd_sad32x32_avg_bits12,
-                   vp9_highbd_12_variance32x32,
+                   vpx_highbd_12_variance32x32,
                    vp9_highbd_12_sub_pixel_variance32x32,
                    vp9_highbd_12_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits12,
@@ -1314,7 +1314,7 @@
         HIGHBD_BFP(BLOCK_64X64,
                    vpx_highbd_sad64x64_bits12,
                    vpx_highbd_sad64x64_avg_bits12,
-                   vp9_highbd_12_variance64x64,
+                   vpx_highbd_12_variance64x64,
                    vp9_highbd_12_sub_pixel_variance64x64,
                    vp9_highbd_12_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits12,
@@ -1324,7 +1324,7 @@
         HIGHBD_BFP(BLOCK_16X16,
                    vpx_highbd_sad16x16_bits12,
                    vpx_highbd_sad16x16_avg_bits12,
-                   vp9_highbd_12_variance16x16,
+                   vpx_highbd_12_variance16x16,
                    vp9_highbd_12_sub_pixel_variance16x16,
                    vp9_highbd_12_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits12,
@@ -1334,7 +1334,7 @@
         HIGHBD_BFP(BLOCK_16X8,
                    vpx_highbd_sad16x8_bits12,
                    vpx_highbd_sad16x8_avg_bits12,
-                   vp9_highbd_12_variance16x8,
+                   vpx_highbd_12_variance16x8,
                    vp9_highbd_12_sub_pixel_variance16x8,
                    vp9_highbd_12_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits12,
@@ -1344,7 +1344,7 @@
         HIGHBD_BFP(BLOCK_8X16,
                    vpx_highbd_sad8x16_bits12,
                    vpx_highbd_sad8x16_avg_bits12,
-                   vp9_highbd_12_variance8x16,
+                   vpx_highbd_12_variance8x16,
                    vp9_highbd_12_sub_pixel_variance8x16,
                    vp9_highbd_12_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits12,
@@ -1354,7 +1354,7 @@
         HIGHBD_BFP(BLOCK_8X8,
                    vpx_highbd_sad8x8_bits12,
                    vpx_highbd_sad8x8_avg_bits12,
-                   vp9_highbd_12_variance8x8,
+                   vpx_highbd_12_variance8x8,
                    vp9_highbd_12_sub_pixel_variance8x8,
                    vp9_highbd_12_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits12,
@@ -1364,7 +1364,7 @@
         HIGHBD_BFP(BLOCK_8X4,
                    vpx_highbd_sad8x4_bits12,
                    vpx_highbd_sad8x4_avg_bits12,
-                   vp9_highbd_12_variance8x4,
+                   vpx_highbd_12_variance8x4,
                    vp9_highbd_12_sub_pixel_variance8x4,
                    vp9_highbd_12_sub_pixel_avg_variance8x4,
                    NULL,
@@ -1374,7 +1374,7 @@
         HIGHBD_BFP(BLOCK_4X8,
                    vpx_highbd_sad4x8_bits12,
                    vpx_highbd_sad4x8_avg_bits12,
-                   vp9_highbd_12_variance4x8,
+                   vpx_highbd_12_variance4x8,
                    vp9_highbd_12_sub_pixel_variance4x8,
                    vp9_highbd_12_sub_pixel_avg_variance4x8,
                    NULL,
@@ -1384,7 +1384,7 @@
         HIGHBD_BFP(BLOCK_4X4,
                    vpx_highbd_sad4x4_bits12,
                    vpx_highbd_sad4x4_avg_bits12,
-                   vp9_highbd_12_variance4x4,
+                   vpx_highbd_12_variance4x4,
                    vp9_highbd_12_sub_pixel_variance4x4,
                    vp9_highbd_12_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits12,
@@ -1807,61 +1807,61 @@
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
-      vp9_variance32x16, vp9_sub_pixel_variance32x16,
+      vpx_variance32x16, vp9_sub_pixel_variance32x16,
       vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
 
   BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
-      vp9_variance16x32, vp9_sub_pixel_variance16x32,
+      vpx_variance16x32, vp9_sub_pixel_variance16x32,
       vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
 
   BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
-      vp9_variance64x32, vp9_sub_pixel_variance64x32,
+      vpx_variance64x32, vp9_sub_pixel_variance64x32,
       vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
 
   BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
-      vp9_variance32x64, vp9_sub_pixel_variance32x64,
+      vpx_variance32x64, vp9_sub_pixel_variance32x64,
       vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
 
   BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
-      vp9_variance32x32, vp9_sub_pixel_variance32x32,
+      vpx_variance32x32, vp9_sub_pixel_variance32x32,
       vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
       vpx_sad32x32x4d)
 
   BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
-      vp9_variance64x64, vp9_sub_pixel_variance64x64,
+      vpx_variance64x64, vp9_sub_pixel_variance64x64,
       vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
       vpx_sad64x64x4d)
 
   BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
-      vp9_variance16x16, vp9_sub_pixel_variance16x16,
+      vpx_variance16x16, vp9_sub_pixel_variance16x16,
       vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
       vpx_sad16x16x4d)
 
   BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
-      vp9_variance16x8, vp9_sub_pixel_variance16x8,
+      vpx_variance16x8, vp9_sub_pixel_variance16x8,
       vp9_sub_pixel_avg_variance16x8,
       vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
 
   BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
-      vp9_variance8x16, vp9_sub_pixel_variance8x16,
+      vpx_variance8x16, vp9_sub_pixel_variance8x16,
       vp9_sub_pixel_avg_variance8x16,
       vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
 
   BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
-      vp9_variance8x8, vp9_sub_pixel_variance8x8,
+      vpx_variance8x8, vp9_sub_pixel_variance8x8,
       vp9_sub_pixel_avg_variance8x8,
       vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
 
   BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
-      vp9_variance8x4, vp9_sub_pixel_variance8x4,
+      vpx_variance8x4, vp9_sub_pixel_variance8x4,
       vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
 
   BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
-      vp9_variance4x8, vp9_sub_pixel_variance4x8,
+      vpx_variance4x8, vp9_sub_pixel_variance4x8,
       vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
 
   BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
-      vp9_variance4x4, vp9_sub_pixel_variance4x4,
+      vpx_variance4x4, vp9_sub_pixel_variance4x4,
       vp9_sub_pixel_avg_variance4x4,
       vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
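
Both BFP tables in this file — this 8-bit one and the 12-bit HIGHBD_BFP table above — keep their shape and only repoint the full-pel variance slot from a vp9_* kernel to its vpx_dsp counterpart. A minimal, self-contained sketch of the pattern; every name below (variance_fn_t, fn_ptr, BLK_*) is illustrative rather than libvpx API:

```c
#include <stdint.h>
#include <stdio.h>

typedef unsigned int (*variance_fn_t)(const uint8_t *a, int a_stride,
                                      const uint8_t *b, int b_stride,
                                      int w, int h, unsigned int *sse);

/* Scalar variance: accumulate sum and sse, then apply var = sse - sum^2/N. */
static unsigned int variance_c(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               int w, int h, unsigned int *sse) {
  int i, j, sum = 0;
  *sse = 0;
  for (i = 0; i < h; ++i, a += a_stride, b += b_stride)
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sum += d;
      *sse += (unsigned int)(d * d);
    }
  return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}

enum { BLK_8X8, BLK_16X16, BLK_COUNT };  /* illustrative block-size ids */
static variance_fn_t fn_ptr[BLK_COUNT];

int main(void) {
  uint8_t a[16 * 16] = { 0 }, b[16 * 16] = { 0 };
  unsigned int sse, var;
  b[0] = 4;                        /* one differing sample */
  fn_ptr[BLK_8X8] = variance_c;    /* BFP-style slot fill */
  fn_ptr[BLK_16X16] = variance_c;  /* retargeting a kernel = repointing a slot */
  var = fn_ptr[BLK_16X16](a, 16, b, 16, 16, 16, &sse);
  printf("var=%u sse=%u\n", var, sse);
  return 0;
}
```

Because callers only ever reach the kernels through the table, each slot change is local, which is what makes this mechanical vp9_-to-vpx_ migration safe.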
 
@@ -2081,7 +2081,7 @@
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vp9_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
       total_sse += sse;
 
       pa += 16;
@@ -2126,21 +2126,21 @@
   unsigned int sse = 0;
   int sum = 0;
   if (dw > 0) {
-    highbd_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-                    dw, height, &sse, &sum);
+    highbd_8_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                      dw, height, &sse, &sum);
     total_sse += sse;
   }
   if (dh > 0) {
-    highbd_variance(&a[(height - dh) * a_stride], a_stride,
-                    &b[(height - dh) * b_stride], b_stride,
-                    width - dw, dh, &sse, &sum);
+    highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+                      &b[(height - dh) * b_stride], b_stride,
+                      width - dw, dh, &sse, &sum);
     total_sse += sse;
   }
   for (y = 0; y < height / 16; ++y) {
     const uint8_t *pa = a;
     const uint8_t *pb = b;
     for (x = 0; x < width / 16; ++x) {
-      vp9_highbd_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
       total_sse += sse;
       pa += 16;
       pb += 16;
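
For context, the routine patched here totals whole-frame SSE by tiling 16x16 MSE calls and sweeping the right/bottom remainders (when width or height is not a multiple of 16) with a general width-by-height pass — now the vpx_dsp highbd_8_ variants. A hedged scalar sketch of that decomposition, independent of libvpx symbols:

```c
#include <stdint.h>

/* Plain sum of squared differences over an arbitrary w-by-h region. */
static uint64_t sse_wxh(const uint8_t *a, int a_stride,
                        const uint8_t *b, int b_stride, int w, int h) {
  uint64_t sse = 0;
  int i, j;
  for (i = 0; i < h; ++i, a += a_stride, b += b_stride)
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j];
      sse += (uint64_t)(d * d);
    }
  return sse;
}

static uint64_t frame_sse(const uint8_t *a, int a_stride,
                          const uint8_t *b, int b_stride,
                          int width, int height) {
  const int dw = width % 16, dh = height % 16;
  uint64_t total = 0;
  int x, y;
  if (dw > 0)  /* right border strip, full height */
    total += sse_wxh(&a[width - dw], a_stride, &b[width - dw], b_stride,
                     dw, height);
  if (dh > 0)  /* bottom strip, minus the corner the right strip covered */
    total += sse_wxh(&a[(height - dh) * a_stride], a_stride,
                     &b[(height - dh) * b_stride], b_stride, width - dw, dh);
  for (y = 0; y < height / 16; ++y)  /* interior 16x16 tiles */
    for (x = 0; x < width / 16; ++x)
      total += sse_wxh(a + (y * 16) * a_stride + x * 16, a_stride,
                       b + (y * 16) * b_stride + x * 16, b_stride, 16, 16);
  return total;
}
```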
@@ -2716,7 +2716,10 @@
 #if CONFIG_VP9_HIGHBITDEPTH
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         const int new_fb = get_free_fb(cm);
-        RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+        RefCntBuffer *new_fb_ptr = NULL;
+        if (cm->new_fb_idx == INVALID_IDX)
+          return;
+        new_fb_ptr = &pool->frame_bufs[new_fb];
         cm->cur_frame = &pool->frame_bufs[new_fb];
         vp9_realloc_frame_buffer(&pool->frame_bufs[new_fb].buf,
                                  cm->width, cm->height,
@@ -2728,7 +2731,10 @@
 #else
       if (ref->y_crop_width != cm->width || ref->y_crop_height != cm->height) {
         const int new_fb = get_free_fb(cm);
-        RefCntBuffer *const new_fb_ptr = &pool->frame_bufs[new_fb];
+        RefCntBuffer *new_fb_ptr = NULL;
+        if (cm->new_fb_idx == INVALID_IDX)
+          return;
+        new_fb_ptr = &pool->frame_bufs[new_fb];
         vp9_realloc_frame_buffer(&new_fb_ptr->buf,
                                  cm->width, cm->height,
                                  cm->subsampling_x, cm->subsampling_y,
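
Both branches of the #if gain the same shape of guard: refuse to bind a pointer into the frame-buffer pool when no valid index is available. A tiny sketch of the general check-before-use shape (types illustrative; the patch itself keys the check off cm->new_fb_idx):

```c
#include <stddef.h>

#define INVALID_IDX (-1)  /* sentinel for "no buffer assigned" */

typedef struct { int ref_count; } RefCntBuffer;

/* Return a pool slot only when the index is valid; callers must treat
 * NULL as an allocation failure instead of dereferencing blindly. */
static RefCntBuffer *checked_slot(RefCntBuffer *pool, int idx) {
  if (idx == INVALID_IDX)
    return NULL;
  return &pool[idx];
}
```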
@@ -2797,7 +2803,7 @@
   recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
-    fprintf(f, "%10u %dx%d %10d %10d %10d %10d"
+    fprintf(f, "%10u %dx%d %d %d %10d %10d %10d %10d"
         "%10"PRId64" %10"PRId64" %10"PRId64" %10"PRId64" %10d "
         "%7.2lf %7.2lf %7.2lf %7.2lf %7.2lf"
         "%6d %6d %5d %5d %5d "
@@ -2805,6 +2811,8 @@
         "%10lf %8u %10"PRId64" %10d %10d\n",
         cpi->common.current_video_frame,
         cm->width, cm->height,
+        cpi->rc.source_alt_ref_pending,
+        cpi->rc.source_alt_ref_active,
         cpi->rc.this_frame_target,
         cpi->rc.projected_frame_size,
         cpi->rc.projected_frame_size / cpi->common.MBs,
@@ -3031,7 +3039,7 @@
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
   vp9_set_quantizer(cm, q);
-  vp9_set_vbp_thresholds(cpi, q);
+  vp9_set_variance_partition_thresholds(cpi, q);
 
   setup_frame(cpi);
 
@@ -3476,34 +3484,41 @@
     }
   }
   if (is_two_pass_svc(cpi) && cm->error_resilient_mode == 0) {
-    // Use the last frame context for the empty frame.
+    // Use context 0 for an intra-only empty frame, but the last frame
+    // context for other empty frames.
+    if (cpi->svc.encode_empty_frame_state == ENCODING) {
+      if (cpi->svc.encode_intra_empty_frame != 0)
+        cm->frame_context_idx = 0;
+      else
+        cm->frame_context_idx = FRAME_CONTEXTS - 1;
+    } else {
     cm->frame_context_idx =
-        (cpi->svc.encode_empty_frame_state == ENCODING) ? FRAME_CONTEXTS - 1 :
         cpi->svc.spatial_layer_id * cpi->svc.number_temporal_layers +
         cpi->svc.temporal_layer_id;
+    }
+
+    cm->frame_parallel_decoding_mode = oxcf->frame_parallel_decoding_mode;
 
     // The probs will be updated based on the frame type of its previous
     // frame if frame_parallel_decoding_mode is 0. The type may vary for
     // the frame after a key frame in base layer since we may drop enhancement
     // layers. So set frame_parallel_decoding_mode to 1 in this case.
-    if (cpi->svc.number_temporal_layers == 1) {
-      if (cpi->svc.spatial_layer_id == 0 &&
-          cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
-        cm->frame_parallel_decoding_mode = 1;
-      else
-        cm->frame_parallel_decoding_mode = 0;
-    } else if (cpi->svc.spatial_layer_id == 0) {
-      // Find the 2nd frame in temporal base layer and 1st frame in temporal
-      // enhancement layers from the key frame.
-      int i;
-      for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
-        if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+    if (cm->frame_parallel_decoding_mode == 0) {
+      if (cpi->svc.number_temporal_layers == 1) {
+        if (cpi->svc.spatial_layer_id == 0 &&
+            cpi->svc.layer_context[0].last_frame_type == KEY_FRAME)
           cm->frame_parallel_decoding_mode = 1;
-          break;
+      } else if (cpi->svc.spatial_layer_id == 0) {
+        // Find the 2nd frame in temporal base layer and 1st frame in temporal
+        // enhancement layers from the key frame.
+        int i;
+        for (i = 0; i < cpi->svc.number_temporal_layers; ++i) {
+          if (cpi->svc.layer_context[0].frames_from_key_frame == 1 << i) {
+            cm->frame_parallel_decoding_mode = 1;
+            break;
+          }
         }
       }
-      if (i == cpi->svc.number_temporal_layers)
-        cm->frame_parallel_decoding_mode = 0;
     }
   }
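
Condensed to a pure function, the context selection above maps (empty-frame state, intra flag, layer ids) to a context slot. A hedged restatement; the constant value here is illustrative:

```c
#define FRAME_CONTEXTS 4  /* stands in for the codec's context count */

static int choose_frame_context(int encoding_empty_frame,
                                int intra_empty_frame,
                                int spatial_id, int temporal_id,
                                int num_temporal_layers) {
  if (encoding_empty_frame)
    /* Intra-only empty frames start from context 0; other empty frames
     * keep using the last context slot. */
    return intra_empty_frame ? 0 : FRAME_CONTEXTS - 1;
  /* Regular SVC frames: one context per (spatial, temporal) layer pair. */
  return spatial_id * num_temporal_layers + temporal_id;
}
```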
 
@@ -3968,6 +3983,7 @@
       }
 
       cm->show_frame = 0;
+      cm->intra_only = 0;
       cpi->refresh_alt_ref_frame = 1;
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
@@ -4310,8 +4326,10 @@
 #endif
 
   if (is_two_pass_svc(cpi)) {
-    if (cpi->svc.encode_empty_frame_state == ENCODING)
+    if (cpi->svc.encode_empty_frame_state == ENCODING) {
       cpi->svc.encode_empty_frame_state = ENCODED;
+      cpi->svc.encode_intra_empty_frame = 0;
+    }
 
     if (cm->show_frame) {
       ++cpi->svc.spatial_layer_to_encode;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 88b1030..942eac9 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <stdio.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
@@ -267,13 +268,13 @@
 static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_8X8:
-      return vp9_mse8x8;
+      return vpx_mse8x8;
     case BLOCK_16X8:
-      return vp9_mse16x8;
+      return vpx_mse16x8;
     case BLOCK_8X16:
-      return vp9_mse8x16;
+      return vpx_mse8x16;
     default:
-      return vp9_mse16x16;
+      return vpx_mse16x16;
   }
 }
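
A hedged usage sketch of this dispatch: resolve the kernel once per block size, then call it through the pointer inside the per-block loop. Like the vpx_mse* family, the kernel both stores the SSE through its out-parameter and returns it; the types below are illustrative:

```c
#include <stdint.h>

typedef unsigned int (*mse_fn_t)(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 unsigned int *sse);

/* Caller-side pattern: pick the kernel once, then reuse the pointer. */
static unsigned int block_error(mse_fn_t fn,
                                const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  return fn(src, src_stride, ref, ref_stride, &sse);
}
```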
 
@@ -293,37 +294,37 @@
     default:
       switch (bsize) {
         case BLOCK_8X8:
-          return vp9_highbd_mse8x8;
+          return vpx_highbd_8_mse8x8;
         case BLOCK_16X8:
-          return vp9_highbd_mse16x8;
+          return vpx_highbd_8_mse16x8;
         case BLOCK_8X16:
-          return vp9_highbd_mse8x16;
+          return vpx_highbd_8_mse8x16;
         default:
-          return vp9_highbd_mse16x16;
+          return vpx_highbd_8_mse16x16;
       }
       break;
     case 10:
       switch (bsize) {
         case BLOCK_8X8:
-          return vp9_highbd_10_mse8x8;
+          return vpx_highbd_10_mse8x8;
         case BLOCK_16X8:
-          return vp9_highbd_10_mse16x8;
+          return vpx_highbd_10_mse16x8;
         case BLOCK_8X16:
-          return vp9_highbd_10_mse8x16;
+          return vpx_highbd_10_mse8x16;
         default:
-          return vp9_highbd_10_mse16x16;
+          return vpx_highbd_10_mse16x16;
       }
       break;
     case 12:
       switch (bsize) {
         case BLOCK_8X8:
-          return vp9_highbd_12_mse8x8;
+          return vpx_highbd_12_mse8x8;
         case BLOCK_16X8:
-          return vp9_highbd_12_mse16x8;
+          return vpx_highbd_12_mse16x8;
         case BLOCK_8X16:
-          return vp9_highbd_12_mse8x16;
+          return vpx_highbd_12_mse8x16;
         default:
-          return vp9_highbd_12_mse16x16;
+          return vpx_highbd_12_mse16x16;
       }
       break;
   }
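
The 10- and 12-bit kernels differ from the 8-bit ones only in how they normalize the accumulators back to 8-bit units: the sum scales linearly with the extra bits (>> 2, >> 4) and the squared term quadratically (>> 4, >> 8). A hedged sketch of that scaling, mirroring the rounded shifts the vpx_dsp versions apply (and assuming two's-complement arithmetic shift for negative sums, as the original macro usage does):

```c
#include <stdint.h>

#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

static void normalize_highbd(int bit_depth, uint64_t sse_long,
                             int64_t sum_long, unsigned int *sse, int *sum) {
  switch (bit_depth) {
    case 10:  /* 2 extra sample bits: sum >> 2, sse >> 4 */
      *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
      *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
      break;
    case 12:  /* 4 extra sample bits: sum >> 4, sse >> 8 */
      *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
      *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
      break;
    default:  /* 8-bit: accumulators are already in 8-bit units */
      *sum = (int)sum_long;
      *sse = (unsigned int)sse_long;
      break;
  }
}
```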
@@ -634,7 +635,7 @@
       xd->mi[0]->mbmi.tx_size = use_dc_pred ?
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
       vp9_encode_intra_block_plane(x, bsize, 0);
-      this_error = vp9_get_mb_ss(x->plane[0].src_diff);
+      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (cm->use_highbitdepth) {
         switch (cm->bit_depth) {
@@ -1696,7 +1697,7 @@
   mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
 
   // Allocate bits to the other frames in the group.
-  for (i = 0; i < rc->baseline_gf_interval - 1; ++i) {
+  for (i = 0; i < rc->baseline_gf_interval - rc->source_alt_ref_pending; ++i) {
     int arf_idx = 0;
     if (EOF == input_stats(twopass, &frame_stats))
       break;
@@ -1934,8 +1935,26 @@
   // Was the group length constrained by the requirement for a new KF?
   rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
 
+  // Should we use the alternate reference frame?
+  if (allow_alt_ref &&
+      (i < cpi->oxcf.lag_in_frames) &&
+      (i >= rc->min_gf_interval)) {
+    // Calculate the boost for alt ref.
+    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
+                                   &b_boost);
+    rc->source_alt_ref_pending = 1;
+
+    // Test to see if multi arf is appropriate.
+    cpi->multi_arf_enabled =
+      (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
+      (zero_motion_accumulator < 0.995)) ? 1 : 0;
+  } else {
+    rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
+    rc->source_alt_ref_pending = 0;
+  }
+
   // Set the interval until the next gf.
-  if (is_key_frame || rc->source_alt_ref_active)
+  if (is_key_frame || rc->source_alt_ref_pending)
     rc->baseline_gf_interval = i - 1;
   else
     rc->baseline_gf_interval = i;
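
The block was moved, not just rewritten: rc->source_alt_ref_pending is now decided before the group length, because both baseline_gf_interval and the bit-allocation loop bound (baseline_gf_interval - source_alt_ref_pending, per the earlier hunk) depend on it. A hedged distillation of the ordering:

```c
/* Decide on the alt-ref first: the ARF itself occupies one slot, so the
 * displayed group shrinks by one frame when it is used. */
static void set_gf_group_length(int i, int allow_alt_ref, int lag_in_frames,
                                int min_gf_interval, int is_key_frame,
                                int *source_alt_ref_pending,
                                int *baseline_gf_interval) {
  *source_alt_ref_pending =
      allow_alt_ref && i < lag_in_frames && i >= min_gf_interval;
  *baseline_gf_interval =
      (is_key_frame || *source_alt_ref_pending) ? i - 1 : i;
}
```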
@@ -1960,24 +1979,6 @@
 
   rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
-  // Should we use the alternate reference frame.
-  if (allow_alt_ref &&
-      (i < cpi->oxcf.lag_in_frames) &&
-      (i >= rc->min_gf_interval)) {
-    // Calculate the boost for alt ref.
-    rc->gfu_boost = calc_arf_boost(cpi, 0, (i - 1), (i - 1), &f_boost,
-                                   &b_boost);
-    rc->source_alt_ref_pending = 1;
-
-    // Test to see if multi arf is appropriate.
-    cpi->multi_arf_enabled =
-      (cpi->multi_arf_allowed && (rc->baseline_gf_interval >= 6) &&
-      (zero_motion_accumulator < 0.995)) ? 1 : 0;
-  } else {
-    rc->gfu_boost = MAX((int)boost_score, MIN_ARF_GF_BOOST);
-    rc->source_alt_ref_pending = 0;
-  }
-
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
@@ -2581,9 +2582,8 @@
         cpi->ref_frame_flags &=
             (~VP9_LAST_FLAG & ~VP9_GOLD_FLAG & ~VP9_ALT_FLAG);
         lc->frames_from_key_frame = 0;
-        // Reset the empty frame resolution since we have a key frame.
-        cpi->svc.empty_frame_width = cm->width;
-        cpi->svc.empty_frame_height = cm->height;
+        // Encode an intra-only empty frame since we have a key frame.
+        cpi->svc.encode_intra_empty_frame = 1;
       }
     } else {
       cm->frame_type = INTER_FRAME;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 8bdd428..15f9582 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -13,11 +13,13 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_reconinter.h"
 
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -303,13 +305,13 @@
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
-      vp9_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
+      vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
                                y_stride);
       besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
                         sse1);
     } else {
       DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
-      vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+      vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
   } else {
@@ -321,7 +323,7 @@
   (void) xd;
   if (second_pred != NULL) {
     DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
-    vp9_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
+    vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
     besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
     besterr = vfp->vf(y + offset, y_stride, src, src_stride, sse1);
@@ -1789,8 +1791,11 @@
 };
 
 unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
-                                           BLOCK_SIZE bsize) {
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col) {
   MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
   DECLARE_ALIGNED(16, int16_t, hbuf[128]);
   DECLARE_ALIGNED(16, int16_t, vbuf[128]);
   DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
@@ -1807,12 +1812,34 @@
   unsigned int best_sad, tmp_sad, this_sad[4];
   MV this_mv;
   const int norm_factor = 3 + (bw >> 5);
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]);
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[0];
+    vp9_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  tmp_mv->row = 0;
-  tmp_mv->col = 0;
-  return cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
-                                xd->plane[0].pre[0].buf, ref_stride);
+  {
+    unsigned int this_sad;
+    tmp_mv->row = 0;
+    tmp_mv->col = 0;
+    this_sad = cpi->fn_ptr[bsize].sdf(x->plane[0].src.buf, src_stride,
+                                      xd->plane[0].pre[0].buf, ref_stride);
+
+    if (scaled_ref_frame) {
+      int i;
+      for (i = 0; i < MAX_MB_PLANE; i++)
+        xd->plane[i].pre[0] = backup_yv12[i];
+    }
+    return this_sad;
+  }
 #endif
 
   // Set up prediction 1-D reference set
@@ -1890,6 +1917,12 @@
   tmp_mv->row *= 8;
   tmp_mv->col *= 8;
 
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[0] = backup_yv12[i];
+  }
+
   return best_sad;
 }
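
The new mi_row/mi_col parameters exist so the function can install pre-scaled reference planes before searching; the surrounding pattern is save, swap, search, restore on every exit path (including the early high-bit-depth return above). A hedged sketch with illustrative types:

```c
#include <stdint.h>

#define MAX_PLANES 3  /* stands in for MAX_MB_PLANE */

struct buf2d { uint8_t *buf; int stride; };

static unsigned int search_with_scaled_ref(
    struct buf2d pre[MAX_PLANES], const struct buf2d scaled[MAX_PLANES],
    unsigned int (*search)(void)) {
  struct buf2d backup[MAX_PLANES];
  unsigned int result;
  int i;
  for (i = 0; i < MAX_PLANES; ++i) {  /* save, then swap in scaled planes */
    backup[i] = pre[i];
    pre[i] = scaled[i];
  }
  result = search();                  /* existing search code runs unmodified */
  for (i = 0; i < MAX_PLANES; ++i)    /* restore on every exit path */
    pre[i] = backup[i];
  return result;
}
```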
 
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index dd8a460..99c1afa 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -83,7 +83,8 @@
 // Perform integral projection based motion estimation.
 unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
                                            MACROBLOCK *x,
-                                           BLOCK_SIZE bsize);
+                                           BLOCK_SIZE bsize,
+                                           int mi_row, int mi_col);
 
 typedef int (integer_mv_pattern_search_fn) (
     const MACROBLOCK *x,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 1e91715..b0e255d 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -14,6 +14,7 @@
 #include <stdio.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -24,6 +25,7 @@
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
 
 #include "vp9/encoder/vp9_cost.h"
 #include "vp9/encoder/vp9_encoder.h"
@@ -215,7 +217,7 @@
 
   for (i = 0; i < h; i += block_size) {
     for (j = 0; j < w; j += block_size) {
-      vp9_get8x8var(src + src_stride * i + j, src_stride,
+      vpx_get8x8var(src + src_stride * i + j, src_stride,
                     ref + ref_stride * i + j, ref_stride,
                     &sse8x8[k], &sum8x8[k]);
       *sse += sse8x8[k];
@@ -1248,7 +1250,7 @@
         if (bsize < BLOCK_16X16)
           continue;
 
-        tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize);
+        tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
 
         if (tmp_sad > x->pred_mv_sad[LAST_FRAME])
           continue;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index fabe362..7211e99 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1207,11 +1207,9 @@
     // this frame refreshes means next frames don't unless specified by user
     rc->frames_since_golden = 0;
 
-    if (cpi->oxcf.pass == 2) {
-      if (!rc->source_alt_ref_pending &&
-          cpi->twopass.gf_group.rf_level[0] == GF_ARF_STD)
-      rc->source_alt_ref_active = 0;
-    } else if (!rc->source_alt_ref_pending) {
+    // If we are not using an alt ref in the upcoming group, clear the
+    // arf active flag.
+    if (!rc->source_alt_ref_pending) {
       rc->source_alt_ref_active = 0;
     }
 
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6eb8f6c..9fa258c 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -25,6 +25,7 @@
 #include "vp9/common/vp9_quant_common.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_systemdependent.h"
 
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index b3491a2..2d2e95d 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -15,6 +15,8 @@
 #include "vp9/encoder/vp9_extend.h"
 
 #define SMALL_FRAME_FB_IDX 7
+#define SMALL_FRAME_WIDTH  16
+#define SMALL_FRAME_HEIGHT 16
 
 void vp9_init_layer_context(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
@@ -33,7 +35,7 @@
 
     if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
       if (vp9_realloc_frame_buffer(&cpi->svc.empty_frame.img,
-                                   cpi->common.width, cpi->common.height,
+                                   SMALL_FRAME_WIDTH, SMALL_FRAME_HEIGHT,
                                    cpi->common.subsampling_x,
                                    cpi->common.subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -48,8 +50,6 @@
 
       memset(cpi->svc.empty_frame.img.buffer_alloc, 0x80,
              cpi->svc.empty_frame.img.buffer_alloc_sz);
-      cpi->svc.empty_frame_width = cpi->common.width;
-      cpi->svc.empty_frame_height = cpi->common.height;
     }
   }
 
@@ -362,20 +362,11 @@
         cpi->lst_fb_idx =
             cpi->gld_fb_idx = cpi->alt_fb_idx = SMALL_FRAME_FB_IDX;
 
-        // Gradually make the empty frame smaller to save bits. Make it half of
-        // its previous size because of the scaling factor restriction.
-        cpi->svc.empty_frame_width >>= 1;
-        cpi->svc.empty_frame_width = (cpi->svc.empty_frame_width + 1) & ~1;
-        if (cpi->svc.empty_frame_width < 16)
-          cpi->svc.empty_frame_width = 16;
+        if (cpi->svc.encode_intra_empty_frame != 0)
+          cpi->common.intra_only = 1;
 
-        cpi->svc.empty_frame_height >>= 1;
-        cpi->svc.empty_frame_height = (cpi->svc.empty_frame_height + 1) & ~1;
-        if (cpi->svc.empty_frame_height < 16)
-          cpi->svc.empty_frame_height = 16;
-
-        width = cpi->svc.empty_frame_width;
-        height = cpi->svc.empty_frame_height;
+        width = SMALL_FRAME_WIDTH;
+        height = SMALL_FRAME_HEIGHT;
       }
     }
   }
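
Fixing the empty frame at 16x16 replaces the old per-frame shrink, which halved each dimension, rounded up to even, and clamped at 16 — so 16x16 is exactly where that sequence bottomed out. The retired arithmetic, restated:

```c
/* Old shrink step: halve, round up to an even dimension (scaling-factor
 * restriction), clamp at 16; the fixed 16x16 is this sequence's floor. */
static int next_empty_frame_dim_old(int dim) {
  dim >>= 1;
  dim = (dim + 1) & ~1;
  return dim < 16 ? 16 : dim;
}
```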
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index e9645ce..5063d52 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -57,8 +57,7 @@
     NEED_TO_ENCODE
  } encode_empty_frame_state;
   struct lookahead_entry empty_frame;
-  int empty_frame_width;
-  int empty_frame_height;
+  int encode_intra_empty_frame;
 
  // Store scaled source frames to be used by the temporal filter to
  // generate an alt ref frame.
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 862be4d..3592031 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -17,6 +17,7 @@
 
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_pred_common.h"
+#include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/encoder/vp9_cost.h"
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
index f38f96d..1f6b083 100644
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -9,6 +9,7 @@
  */
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
@@ -18,26 +19,6 @@
 
 #include "vp9/encoder/vp9_variance.h"
 
-void variance(const uint8_t *a, int  a_stride,
-              const uint8_t *b, int  b_stride,
-              int  w, int  h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
 // Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
 // or vertical direction to produce the filtered output block. Used to implement
 // first-pass of 2-D separable filter.
@@ -100,25 +81,6 @@
   }
 }
 
-unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
-  unsigned int i, sum = 0;
-
-  for (i = 0; i < 256; ++i) {
-    sum += src_ptr[i] * src_ptr[i];
-  }
-
-  return sum;
-}
-
-#define VAR(W, H) \
-unsigned int vp9_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                       const uint8_t *b, int b_stride, \
-                                       unsigned int *sse) { \
-  int sum; \
-  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
 #define SUBPIX_VAR(W, H) \
 unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
   const uint8_t *src, int  src_stride, \
@@ -133,7 +95,7 @@
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                      BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  return vp9_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
+  return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
 }
 
 #define SUBPIX_AVG_VAR(W, H) \
@@ -152,178 +114,51 @@
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                      BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  vp9_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
+  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
 \
-  return vp9_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
+  return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
 }
 
-void vp9_get16x16var_c(const uint8_t *src_ptr, int source_stride,
-                       const uint8_t *ref_ptr, int ref_stride,
-                       unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
-}
-
-void vp9_get8x8var_c(const uint8_t *src_ptr, int source_stride,
-                     const uint8_t *ref_ptr, int ref_stride,
-                     unsigned int *sse, int *sum) {
-  variance(src_ptr, source_stride, ref_ptr, ref_stride, 8, 8, sse, sum);
-}
-
-unsigned int vp9_mse16x16_c(const uint8_t *src, int src_stride,
-                            const uint8_t *ref, int ref_stride,
-                            unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 16, 16, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse16x8_c(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 16, 8, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse8x16_c(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 8, 16, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_mse8x8_c(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride,
-                          unsigned int *sse) {
-  int sum;
-  variance(src, src_stride, ref, ref_stride, 8, 8, sse, &sum);
-  return *sse;
-}
-
-VAR(4, 4)
 SUBPIX_VAR(4, 4)
 SUBPIX_AVG_VAR(4, 4)
 
-VAR(4, 8)
 SUBPIX_VAR(4, 8)
 SUBPIX_AVG_VAR(4, 8)
 
-VAR(8, 4)
 SUBPIX_VAR(8, 4)
 SUBPIX_AVG_VAR(8, 4)
 
-VAR(8, 8)
 SUBPIX_VAR(8, 8)
 SUBPIX_AVG_VAR(8, 8)
 
-VAR(8, 16)
 SUBPIX_VAR(8, 16)
 SUBPIX_AVG_VAR(8, 16)
 
-VAR(16, 8)
 SUBPIX_VAR(16, 8)
 SUBPIX_AVG_VAR(16, 8)
 
-VAR(16, 16)
 SUBPIX_VAR(16, 16)
 SUBPIX_AVG_VAR(16, 16)
 
-VAR(16, 32)
 SUBPIX_VAR(16, 32)
 SUBPIX_AVG_VAR(16, 32)
 
-VAR(32, 16)
 SUBPIX_VAR(32, 16)
 SUBPIX_AVG_VAR(32, 16)
 
-VAR(32, 32)
 SUBPIX_VAR(32, 32)
 SUBPIX_AVG_VAR(32, 32)
 
-VAR(32, 64)
 SUBPIX_VAR(32, 64)
 SUBPIX_AVG_VAR(32, 64)
 
-VAR(64, 32)
 SUBPIX_VAR(64, 32)
 SUBPIX_AVG_VAR(64, 32)
 
-VAR(64, 64)
 SUBPIX_VAR(64, 64)
 SUBPIX_AVG_VAR(64, 64)
 
-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                       int height, const uint8_t *ref, int ref_stride) {
-  int i, j;
-
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
-
 #if CONFIG_VP9_HIGHBITDEPTH
-void highbd_variance64(const uint8_t *a8, int  a_stride,
-                       const uint8_t *b8, int  b_stride,
-                       int w, int h, uint64_t *sse,
-                       uint64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-void highbd_variance(const uint8_t *a8, int  a_stride,
-                     const uint8_t *b8, int  b_stride,
-                     int w, int h, unsigned int *sse,
-                     int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-
-void highbd_10_variance(const uint8_t *a8, int  a_stride,
-                        const uint8_t *b8, int  b_stride,
-                        int w, int h, unsigned int *sse,
-                        int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-void highbd_12_variance(const uint8_t *a8, int  a_stride,
-                        const uint8_t *b8, int  b_stride,
-                        int w, int h, unsigned int *sse,
-                        int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
 static void highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
@@ -374,35 +209,6 @@
   }
 }
 
-#define HIGHBD_VAR(W, H) \
-unsigned int vp9_highbd_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                              const uint8_t *b, int b_stride, \
-                                              unsigned int *sse) { \
-  int sum; \
-  highbd_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
-                                                 int a_stride, \
-                                                 const uint8_t *b, \
-                                                 int b_stride, \
-                                                 unsigned int *sse) { \
-  int sum; \
-  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-} \
-\
-unsigned int vp9_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
-                                                 int a_stride, \
-                                                 const uint8_t *b, \
-                                                 int b_stride, \
-                                                 unsigned int *sse) { \
-  int sum; \
-  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
-  return *sse - (((int64_t)sum * sum) / (W * H)); \
-}
-
 #define HIGHBD_SUBPIX_VAR(W, H) \
 unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
   const uint8_t *src, int  src_stride, \
@@ -417,7 +223,7 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
                                           dst_stride, sse); \
 } \
 \
@@ -434,7 +240,7 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
 } \
 \
@@ -451,7 +257,7 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
 }
 
@@ -471,10 +277,10 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
 \
-  return vp9_highbd_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
                                           dst_stride, sse); \
 } \
 \
@@ -493,10 +299,10 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
 \
-  return vp9_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
 } \
 \
@@ -515,137 +321,49 @@
   highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                             BILINEAR_FILTERS_2TAP(yoffset)); \
 \
-  vp9_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
                            CONVERT_TO_BYTEPTR(temp2), W); \
 \
-  return vp9_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
 }
 
-#define HIGHBD_GET_VAR(S) \
-void vp9_highbd_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    unsigned int *sse, int *sum) { \
-  highbd_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse, int *sum) { \
-  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-} \
-\
-void vp9_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                       const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse, int *sum) { \
-  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
-}
-
-#define HIGHBD_MSE(W, H) \
-unsigned int vp9_highbd_mse##W##x##H##_c(const uint8_t *src, \
-                                         int src_stride, \
-                                         const uint8_t *ref, \
-                                         int ref_stride, \
-                                         unsigned int *sse) { \
-  int sum; \
-  highbd_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-} \
-\
-unsigned int vp9_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
-                                            int src_stride, \
-                                            const uint8_t *ref, \
-                                            int ref_stride, \
-                                            unsigned int *sse) { \
-  int sum; \
-  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-} \
-\
-unsigned int vp9_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
-                                            int src_stride, \
-                                            const uint8_t *ref, \
-                                            int ref_stride, \
-                                            unsigned int *sse) { \
-  int sum; \
-  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
-  return *sse; \
-}
-
-HIGHBD_GET_VAR(8)
-HIGHBD_GET_VAR(16)
-
-HIGHBD_MSE(16, 16)
-HIGHBD_MSE(16, 8)
-HIGHBD_MSE(8, 16)
-HIGHBD_MSE(8, 8)
-
-HIGHBD_VAR(4, 4)
 HIGHBD_SUBPIX_VAR(4, 4)
 HIGHBD_SUBPIX_AVG_VAR(4, 4)
 
-HIGHBD_VAR(4, 8)
 HIGHBD_SUBPIX_VAR(4, 8)
 HIGHBD_SUBPIX_AVG_VAR(4, 8)
 
-HIGHBD_VAR(8, 4)
 HIGHBD_SUBPIX_VAR(8, 4)
 HIGHBD_SUBPIX_AVG_VAR(8, 4)
 
-HIGHBD_VAR(8, 8)
 HIGHBD_SUBPIX_VAR(8, 8)
 HIGHBD_SUBPIX_AVG_VAR(8, 8)
 
-HIGHBD_VAR(8, 16)
 HIGHBD_SUBPIX_VAR(8, 16)
 HIGHBD_SUBPIX_AVG_VAR(8, 16)
 
-HIGHBD_VAR(16, 8)
 HIGHBD_SUBPIX_VAR(16, 8)
 HIGHBD_SUBPIX_AVG_VAR(16, 8)
 
-HIGHBD_VAR(16, 16)
 HIGHBD_SUBPIX_VAR(16, 16)
 HIGHBD_SUBPIX_AVG_VAR(16, 16)
 
-HIGHBD_VAR(16, 32)
 HIGHBD_SUBPIX_VAR(16, 32)
 HIGHBD_SUBPIX_AVG_VAR(16, 32)
 
-HIGHBD_VAR(32, 16)
 HIGHBD_SUBPIX_VAR(32, 16)
 HIGHBD_SUBPIX_AVG_VAR(32, 16)
 
-HIGHBD_VAR(32, 32)
 HIGHBD_SUBPIX_VAR(32, 32)
 HIGHBD_SUBPIX_AVG_VAR(32, 32)
 
-HIGHBD_VAR(32, 64)
 HIGHBD_SUBPIX_VAR(32, 64)
 HIGHBD_SUBPIX_AVG_VAR(32, 64)
 
-HIGHBD_VAR(64, 32)
 HIGHBD_SUBPIX_VAR(64, 32)
 HIGHBD_SUBPIX_AVG_VAR(64, 32)
 
-HIGHBD_VAR(64, 64)
 HIGHBD_SUBPIX_VAR(64, 64)
 HIGHBD_SUBPIX_AVG_VAR(64, 64)
-
-void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                              int width, int height, const uint8_t *ref8,
-                              int ref_stride) {
-  int i, j;
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
-      const int tmp = pred[j] + ref[j];
-      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
-    }
-    comp_pred += width;
-    pred += width;
-    ref += ref_stride;
-  }
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 53148f2..8fc47a8 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,31 +12,64 @@
 #define VP9_ENCODER_VP9_VARIANCE_H_
 
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void variance(const uint8_t *a, int a_stride,
-              const uint8_t *b, int b_stride,
-              int  w, int  h,
-              unsigned int *sse, int *sum);
+// TODO(johannkoenig): All functions which depend on
+// [highbd_][8|10|12_]variance should be refactored or moved to vpx_dsp.
+static void variance(const uint8_t *a, int a_stride,
+                     const uint8_t *b, int b_stride,
+                     int  w, int  h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void highbd_variance(const uint8_t *a8, int a_stride,
-                     const uint8_t *b8, int b_stride,
-                     int w, int h,
-                     unsigned int *sse, int *sum);
+static void highbd_variance64(const uint8_t *a8, int  a_stride,
+                              const uint8_t *b8, int  b_stride,
+                              int w, int h, uint64_t *sse, uint64_t *sum) {
+  int i, j;
 
-void highbd_10_variance(const uint8_t *a8, int a_stride,
-                        const uint8_t *b8, int b_stride,
-                        int w, int h,
-                        unsigned int *sse, int *sum);
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
 
-void highbd_12_variance(const uint8_t *a8, int a_stride,
-                        const uint8_t *b8, int b_stride,
-                        int w, int h,
-                        unsigned int *sse, int *sum);
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+static void highbd_8_variance(const uint8_t *a8, int a_stride,
+                              const uint8_t *b8, int b_stride,
+                              int w, int h, unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
 #endif
 
 typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
@@ -95,15 +128,6 @@
   vp9_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
 
-void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
-                       int height, const uint8_t *ref, int ref_stride);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred,
-                              int width, int height,
-                              const uint8_t *ref, int ref_stride);
-#endif
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
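
With the free functions gone, the header now carries static copies of the scalar helpers (the TODO flags them for a later move to vpx_dsp), so each translation unit that includes it gets its own private instance. All of them feed the same identity; a hedged restatement:

```c
#include <stdint.h>

/* variance = E[d^2] - E[d]^2, computed from the accumulated sse and sum
 * as sse - sum*sum/(w*h); the cross term is widened to 64 bits before
 * the multiply to avoid overflow on large blocks. */
static unsigned int variance_from_sums(unsigned int sse, int sum,
                                       int w, int h) {
  return sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}
```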
diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
index 4bc3e7e..29b7b27 100644
--- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
@@ -13,237 +13,6 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
-                                        const uint16_t *ref, int ref_stride,
-                                        uint32_t *sse, int *sum);
-
-uint32_t vp9_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride,
-                                    uint32_t *sse, int *sum);
-
-uint32_t vp9_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
-                                      const uint16_t *ref, int ref_stride,
-                                      uint32_t *sse, int *sum);
-
-static void highbd_variance_sse2(const uint16_t *src, int src_stride,
-                                 const uint16_t *ref, int ref_stride,
-                                 int w, int h, uint32_t *sse, int *sum,
-                                 high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride,
-             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride,
-                                    int w, int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride,
-             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
-  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
-}
-
-static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
-                                    const uint16_t *ref, int ref_stride,
-                                    int w, int h, uint32_t *sse, int *sum,
-                                    high_variance_fn_t var_fn, int block_size) {
-  int i, j;
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride,
-             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
-      sse_long += sse0;
-      sum_long += sum0;
-    }
-  }
-  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
-  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
-}
-
-
-#define HIGH_GET_VAR(S) \
-void vp9_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
-                                       const uint8_t *ref8, int ref_stride, \
-                                       uint32_t *sse, int *sum) { \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
-  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
-                                     sse, sum); \
-} \
-\
-void vp9_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
-                                          const uint8_t *ref8, int ref_stride, \
-                                          uint32_t *sse, int *sum) { \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
-  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
-                                     sse, sum); \
-  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
-  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
-} \
-\
-void vp9_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
-                                          const uint8_t *ref8, int ref_stride, \
-                                          uint32_t *sse, int *sum) { \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
-  vp9_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
-                                     sse, sum); \
-  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
-  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
-}
-
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
-
-#undef HIGH_GET_VAR
-
-#define VAR_FN(w, h, block_size, shift) \
-uint32_t vp9_highbd_variance##w##x##h##_sse2( \
-    const uint8_t *src8, int src_stride, \
-    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
-  int sum; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
-  highbd_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
-                       vp9_highbd_calc##block_size##x##block_size##var_sse2, \
-                       block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
-} \
-\
-uint32_t vp9_highbd_10_variance##w##x##h##_sse2( \
-    const uint8_t *src8, int src_stride, \
-    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
-  int sum; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
-  highbd_10_variance_sse2( \
-      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
-      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
-} \
-\
-uint32_t vp9_highbd_12_variance##w##x##h##_sse2( \
-    const uint8_t *src8, int src_stride, \
-    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
-  int sum; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
-  highbd_12_variance_sse2( \
-      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
-      vp9_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
-  return *sse - (((int64_t)sum * sum) >> shift); \
-}
-
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
-
-#undef VAR_FN
-
-unsigned int vp9_highbd_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                      const uint8_t *ref8, int ref_stride,
-                                      unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
-                       sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-unsigned int vp9_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                         const uint8_t *ref8, int ref_stride,
-                                         unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
-                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-unsigned int vp9_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
-                                         const uint8_t *ref8, int ref_stride,
-                                         unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
-                          sse, &sum, vp9_highbd_calc16x16var_sse2, 16);
-  return *sse;
-}
-
-unsigned int vp9_highbd_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                    const uint8_t *ref8, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
-                       sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-unsigned int vp9_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                       const uint8_t *ref8, int ref_stride,
-                                       unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
-                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
-unsigned int vp9_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
-                                       const uint8_t *ref8, int ref_stride,
-                                       unsigned int *sse) {
-  int sum;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
-                          sse, &sum, vp9_highbd_calc8x8var_sse2, 8);
-  return *sse;
-}
-
 #define DECL(w, opt) \
 int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
                                                ptrdiff_t src_stride, \
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index ff9f7cc..8cd071d 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -13,18 +13,6 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse, int *sum);
-
-void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride,
-                          unsigned int *sse, int *sum);
-
-void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride,
-                          unsigned int *sse, int *sum);
-
 unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
@@ -42,81 +30,6 @@
                                                  int height,
                                                  unsigned int *sseptr);
 
-static void variance_avx2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int  ref_stride,
-                          int w, int h, unsigned int *sse, int *sum,
-                          get_var_avx2 var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += 16) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(&src[src_stride * i + j], src_stride,
-             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-
-unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
-                sse, &sum, vp9_get16x16var_avx2, 16);
-  return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  int sum;
-  vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse;
-}
-
-unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
-                sse, &sum, vp9_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
-                sse, &sum, vp9_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 10);
-}
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
-                sse, &sum, vp9_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 12);
-}
-
-unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
-                sse, &sum, vp9_get32x32var_avx2, 32);
-  return *sse - (((int64_t)sum * sum) >> 11);
-}
-
 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                               int src_stride,
                                               int x_offset,
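Editorial note (not part of the patch): the AVX2 wrappers removed above all
follow a single tiling pattern, which is what the consolidated vpx_dsp
implementation keeps. A minimal C sketch of that pattern, with block_var_fn
standing in for a kernel such as vp9_get32x32var_avx2:

#include <stdint.h>

typedef void (*block_var_fn)(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride,
                             unsigned int *sse, int *sum);

/* Accumulate per-block sse/sum over a w x h area, then apply
 * var = sse - sum^2 / (w * h). Illustrative only. */
static unsigned int tiled_variance(const uint8_t *src, int src_stride,
                                   const uint8_t *ref, int ref_stride,
                                   int w, int h, int block_size,
                                   block_var_fn block_fn) {
  unsigned int sse = 0;
  int sum = 0, i, j;
  for (i = 0; i < h; i += block_size) {
    for (j = 0; j < w; j += block_size) {
      unsigned int sse0;
      int sum0;
      block_fn(src + i * src_stride + j, src_stride,
               ref + i * ref_stride + j, ref_stride, &sse0, &sum0);
      sse += sse0;
      sum += sum0;
    }
  }
  return sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
}

The removed wrappers implement the division as a right shift because every
supported w * h here is a power of two (e.g. >> 10 for 32x32).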
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index cacee74..961efe3 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -16,299 +16,6 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-typedef void (*variance_fn_t)(const unsigned char *src, int src_stride,
-                              const unsigned char *ref, int ref_stride,
-                              unsigned int *sse, int *sum);
-
-unsigned int vp9_get_mb_ss_sse2(const int16_t *src) {
-  __m128i vsum = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 32; ++i) {
-    const __m128i v = _mm_loadu_si128((const __m128i *)src);
-    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
-    src += 8;
-  }
-
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  return  _mm_cvtsi128_si32(vsum);
-}
-
-#define READ64(p, stride, i) \
-  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
-      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
-
-static void get4x4var_sse2(const uint8_t *src, int src_stride,
-                           const uint8_t *ref, int ref_stride,
-                           unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
-  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
-  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
-  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
-  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-  // sum
-  __m128i vsum = _mm_add_epi16(diff0, diff1);
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
-                       _mm_madd_epi16(diff1, diff1));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
-  *sse = _mm_cvtsi128_si32(vsum);
-}
-
-void vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
-                        const uint8_t *ref, int ref_stride,
-                        unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 8; i += 2) {
-    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
-        (const __m128i *)(src + i * src_stride)), zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
-        (const __m128i *)(ref + i * ref_stride)), zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
-    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
-        (const __m128i *)(src + (i + 1) * src_stride)), zero);
-    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
-        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-  }
-
-  // sum
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
-}
-
-void vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
-                          const uint8_t *ref, int ref_stride,
-                          unsigned int *sse, int *sum) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i vsum = _mm_setzero_si128();
-  __m128i vsse = _mm_setzero_si128();
-  int i;
-
-  for (i = 0; i < 16; ++i) {
-    const __m128i s = _mm_loadu_si128((const __m128i *)src);
-    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
-
-    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
-    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
-    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
-
-    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
-    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
-    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
-
-    vsum = _mm_add_epi16(vsum, diff0);
-    vsum = _mm_add_epi16(vsum, diff1);
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
-    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
-
-    src += src_stride;
-    ref += ref_stride;
-  }
-
-  // sum
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
-  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
-  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
-             (int16_t)_mm_extract_epi16(vsum, 1);
-
-  // sse
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
-  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
-  *sse = _mm_cvtsi128_si32(vsse);
-}
-
-
-static void variance_sse2(const unsigned char *src, int src_stride,
-                          const unsigned char *ref, int ref_stride,
-                          int w, int h, unsigned int *sse, int *sum,
-                          variance_fn_t var_fn, int block_size) {
-  int i, j;
-
-  *sse = 0;
-  *sum = 0;
-
-  for (i = 0; i < h; i += block_size) {
-    for (j = 0; j < w; j += block_size) {
-      unsigned int sse0;
-      int sum0;
-      var_fn(src + src_stride * i + j, src_stride,
-             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
-      *sse += sse0;
-      *sum += sum0;
-    }
-  }
-}
-
-unsigned int vp9_variance4x4_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 4);
-}
-
-unsigned int vp9_variance8x4_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
-                sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
-}
-
-unsigned int vp9_variance4x8_sse2(const uint8_t *src, int src_stride,
-                                  const uint8_t *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
-                sse, &sum, get4x4var_sse2, 4);
-  return *sse - (((unsigned int)sum * sum) >> 5);
-}
-
-unsigned int vp9_variance8x8_sse2(const unsigned char *src, int src_stride,
-                                  const unsigned char *ref, int ref_stride,
-                                  unsigned int *sse) {
-  int sum;
-  vp9_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 6);
-}
-
-unsigned int vp9_variance16x8_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
-                sse, &sum, vp9_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-unsigned int vp9_variance8x16_sse2(const unsigned char *src, int src_stride,
-                                   const unsigned char *ref, int ref_stride,
-                                   unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
-                sse, &sum, vp9_get8x8var_sse2, 8);
-  return *sse - (((unsigned int)sum * sum) >> 7);
-}
-
-unsigned int vp9_variance16x16_sse2(const unsigned char *src, int src_stride,
-                                    const unsigned char *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
-  return *sse - (((unsigned int)sum * sum) >> 8);
-}
-
-unsigned int vp9_variance32x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 10);
-}
-
-unsigned int vp9_variance32x16_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance16x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 9);
-}
-
-unsigned int vp9_variance64x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 12);
-}
-
-unsigned int vp9_variance64x32_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vp9_variance32x64_sse2(const uint8_t *src, int src_stride,
-                                    const uint8_t *ref, int ref_stride,
-                                    unsigned int *sse) {
-  int sum;
-  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
-                sse, &sum, vp9_get16x16var_sse2, 16);
-  return *sse - (((int64_t)sum * sum) >> 11);
-}
-
-unsigned int vp9_mse8x8_sse2(const uint8_t *src, int src_stride,
-                             const uint8_t *ref, int ref_stride,
-                             unsigned int *sse) {
-  vp9_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int vp9_mse8x16_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  vp9_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int vp9_mse16x8_sse2(const uint8_t *src, int src_stride,
-                              const uint8_t *ref, int ref_stride,
-                              unsigned int *sse) {
-  vp9_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
-unsigned int vp9_mse16x16_sse2(const uint8_t *src, int src_stride,
-                               const uint8_t *ref, int ref_stride,
-                               unsigned int *sse) {
-  vp9_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
-  return *sse;
-}
-
 // The 2 unused parameters are place holders for PIC enabled build.
 #define DECL(w, opt) \
 int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
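Editorial note (not part of the patch): the removed SSE2 kernels all end with
the same horizontal-reduction idiom: fold the high half of the register onto
the low half until the total sits in lane 0. A minimal sketch using only SSE2
intrinsics:

#include <emmintrin.h>

/* Sum the four 32-bit lanes of v into a scalar. _mm_srli_si128 shifts by
 * bytes, so a shift of 8 folds lanes {2,3} onto {0,1} and a shift of 4
 * folds lane 1 onto lane 0. */
static int horizontal_add_epi32(__m128i v) {
  v = _mm_add_epi32(v, _mm_srli_si128(v, 8));
  v = _mm_add_epi32(v, _mm_srli_si128(v, 4));
  return _mm_cvtsi128_si32(v);
}

The 16-bit sum accumulators above use one extra fold (shift by 2) before
extracting lane 0 with _mm_extract_epi16.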
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index f629d98..cbc0488 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -69,6 +69,7 @@
 VP9_COMMON_SRCS-yes += common/vp9_scan.c
 VP9_COMMON_SRCS-yes += common/vp9_scan.h
 
+VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/convolve.h
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
 VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
 VP9_COMMON_SRCS-$(HAVE_AVX2) += common/x86/vp9_loopfilter_intrin_avx2.c
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index bd0d18c..5415215 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -102,13 +102,11 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.c
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_impl_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_quantize_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c
 endif
diff --git a/vpx_dsp/arm/variance_media.asm b/vpx_dsp/arm/variance_media.asm
new file mode 100644
index 0000000..f7f9e14
--- /dev/null
+++ b/vpx_dsp/arm/variance_media.asm
@@ -0,0 +1,358 @@
+;
+;  Copyright (c) 2011 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_variance16x16_media|
+    EXPORT  |vpx_variance8x8_media|
+    EXPORT  |vpx_mse16x16_media|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance16x16_media| PROC
+
+    stmfd   sp!, {r4-r12, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r8, #0              ; initialize sum = 0
+    mov     r11, #0             ; initialize sse = 0
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+
+loop16x16
+    ; 1st 4 pixels
+    ldr     r4, [r0, #0]        ; load 4 src pixels
+    ldr     r5, [r2, #0]        ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r6, r4, r5          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+    ; calculate total sum
+    adds    r8, r8, r4          ; add positive differences to sum
+    subs    r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r4, [r0, #4]        ; load 4 src pixels
+    ldr     r5, [r2, #4]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r4, [r0, #8]        ; load 4 src pixels
+    ldr     r5, [r2, #8]        ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r4, [r0, #12]       ; load 4 src pixels
+    ldr     r5, [r2, #12]       ; load 4 ref pixels
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+    usub8   r6, r4, r5          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r6, lr          ; select bytes with positive difference
+    usub8   r9, r5, r4          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r6, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r4, r7, lr          ; calculate sum of positive differences
+    usad8   r5, r6, lr          ; calculate sum of negative differences
+    orr     r6, r6, r7          ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r8, r8, r4          ; add positive differences to sum
+    sub     r8, r8, r5          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r5, r6              ; byte (two pixels) to halfwords
+    uxtb16  r10, r6, ror #8     ; another two pixels to halfwords
+    smlad   r11, r5, r5, r11    ; dual signed multiply, add and accumulate (1)
+    smlad   r11, r10, r10, r11  ; dual signed multiply, add and accumulate (2)
+
+
+    subs    r12, r12, #1
+
+    bne     loop16x16
+
+    ; return stuff
+    ldr     r6, [sp, #40]       ; get address of sse
+    mul     r0, r8, r8          ; sum * sum
+    str     r11, [r6]           ; store sse
+    sub     r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8))
+
+    ldmfd   sp!, {r4-r12, pc}
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vpx_variance8x8_media| PROC
+
+    push    {r4-r10, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #8             ; set loop counter to 8 (=block height)
+    mov     r4, #0              ; initialize sum = 0
+    mov     r5, #0              ; initialize sse = 0
+
+loop8x8
+    ; 1st 4 pixels
+    ldr     r6, [r0, #0x0]      ; load 4 src pixels
+    ldr     r7, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r6, r7          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+    ; calculate total sum
+    add    r4, r4, r6           ; add positive differences to sum
+    sub    r4, r4, r7           ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r0, #0x4]      ; load 4 src pixels
+    ldr     r7, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r6, r7          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r10, r8, lr         ; select bytes with positive difference
+    usub8   r9, r7, r6          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r6, r10, lr         ; calculate sum of positive differences
+    usad8   r7, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r10         ; differences of all 4 pixels
+
+    ; calculate total sum
+    add     r4, r4, r6          ; add positive differences to sum
+    sub     r4, r4, r7          ; subtract negative differences from sum
+
+    ; calculate sse
+    uxtb16  r7, r8              ; byte (two pixels) to halfwords
+    uxtb16  r10, r8, ror #8     ; another two pixels to halfwords
+    smlad   r5, r7, r7, r5      ; dual signed multiply, add and accumulate (1)
+    subs    r12, r12, #1        ; next row
+    smlad   r5, r10, r10, r5    ; dual signed multiply, add and accumulate (2)
+
+    bne     loop8x8
+
+    ; return stuff
+    ldr     r8, [sp, #32]       ; get address of sse
+    mul     r1, r4, r4          ; sum * sum
+    str     r5, [r8]            ; store sse
+    sub     r0, r5, r1, ASR #6  ; return (sse - ((sum * sum) >> 6))
+
+    pop     {r4-r10, pc}
+
+    ENDP
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+;
+;note: Based on vpx_variance16x16_media. In this function, sum is never used,
+;      so that part of the calculation has been removed.
+
+|vpx_mse16x16_media| PROC
+
+    push    {r4-r9, lr}
+
+    pld     [r0, r1, lsl #0]
+    pld     [r2, r3, lsl #0]
+
+    mov     r12, #16            ; set loop counter to 16 (=block height)
+    mov     r4, #0              ; initialize sse = 0
+
+loopmse
+    ; 1st 4 pixels
+    ldr     r5, [r0, #0x0]      ; load 4 src pixels
+    ldr     r6, [r2, #0x0]      ; load 4 ref pixels
+
+    mov     lr, #0              ; constant zero
+
+    usub8   r8, r5, r6          ; calculate difference
+    pld     [r0, r1, lsl #1]
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    pld     [r2, r3, lsl #1]
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0x4]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 2nd 4 pixels
+    ldr     r6, [r2, #0x4]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+    ldr     r5, [r0, #0x8]      ; load 4 src pixels
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 3rd 4 pixels
+    ldr     r6, [r2, #0x8]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    ldr     r5, [r0, #0xc]      ; load 4 src pixels
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+
+    ; 4th 4 pixels
+    ldr     r6, [r2, #0xc]      ; load 4 ref pixels
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    usub8   r8, r5, r6          ; calculate difference
+    add     r0, r0, r1          ; set src_ptr to next row
+    sel     r7, r8, lr          ; select bytes with positive difference
+    usub8   r9, r6, r5          ; calculate difference with reversed operands
+    add     r2, r2, r3          ; set dst_ptr to next row
+    sel     r8, r9, lr          ; select bytes with negative difference
+
+    ; calculate partial sums
+    usad8   r5, r7, lr          ; calculate sum of positive differences
+    usad8   r6, r8, lr          ; calculate sum of negative differences
+    orr     r8, r8, r7          ; differences of all 4 pixels
+
+    subs    r12, r12, #1        ; next row
+
+    ; calculate sse
+    uxtb16  r6, r8              ; byte (two pixels) to halfwords
+    uxtb16  r7, r8, ror #8      ; another two pixels to halfwords
+    smlad   r4, r6, r6, r4      ; dual signed multiply, add and accumulate (1)
+    smlad   r4, r7, r7, r4      ; dual signed multiply, add and accumulate (2)
+
+    bne     loopmse
+
+    ; return stuff
+    ldr     r1, [sp, #28]       ; get address of sse
+    mov     r0, r4              ; return sse
+    str     r4, [r1]            ; store sse
+
+    pop     {r4-r9, pc}
+
+    ENDP
+
+    END
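Editorial note (not part of the patch): the usub8/sel/usad8 sequence above
computes per-byte absolute differences without widening to halfwords first.
A scalar C model of one 4-pixel step, illustrative only:

#include <stdint.h>

static void abs_diff4(const uint8_t src[4], const uint8_t ref[4],
                      int *sum, unsigned int *sse) {
  int i;
  for (i = 0; i < 4; ++i) {
    /* usub8 subtracts per byte and sets the GE flags; sel then keeps
     * src - ref where src >= ref and ref - src where ref > src. */
    const int pos = src[i] >= ref[i] ? src[i] - ref[i] : 0;
    const int neg = ref[i] > src[i] ? ref[i] - src[i] : 0;
    *sum += pos - neg;          /* usad8 totals each part; add/sub combine */
    *sse += (pos + neg) * (pos + neg);  /* orr merges |diff|; smlad squares */
  }
}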
diff --git a/vpx_dsp/arm/variance_neon.c b/vpx_dsp/arm/variance_neon.c
new file mode 100644
index 0000000..1a9792e
--- /dev/null
+++ b/vpx_dsp/arm/variance_neon.c
@@ -0,0 +1,417 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  const int32x4_t a = vpaddlq_s16(v_16x8);
+  const int64x2_t b = vpaddlq_s32(a);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  const int64x2_t b = vpaddlq_s32(v_32x4);
+  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+                               vreinterpret_s32_s64(vget_high_s64(b)));
+  return vget_lane_s32(c, 0);
+}
+
+// w * h must be less than 2048 or the 16-bit lanes of v_sum may overflow.
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, uint32_t *sse, int *sum) {
+  int i, j;
+  int16x8_t v_sum = vdupq_n_s16(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+      v_sum = vaddq_s16(v_sum, sv_diff);
+      v_sse_lo = vmlal_s16(v_sse_lo,
+                           vget_low_s16(sv_diff),
+                           vget_low_s16(sv_diff));
+      v_sse_hi = vmlal_s16(v_sse_hi,
+                           vget_high_s16(sv_diff),
+                           vget_high_s16(sv_diff));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s16x8(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vpx_get8x8var_neon(const uint8_t *a, int a_stride,
+                        const uint8_t *b, int b_stride,
+                        unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, sum);
+}
+
+void vpx_get16x16var_neon(const uint8_t *a, int a_stride,
+                          const uint8_t *b, int b_stride,
+                          unsigned int *sse, int *sum) {
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, sum);
+}
+
+unsigned int vpx_variance8x8_neon(const uint8_t *a, int a_stride,
+                                  const uint8_t *b, int b_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 6);  // >> 6 = / (8 * 8)
+}
+
+unsigned int vpx_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 16, 16, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 8);  // >> 8 = / (16 * 16)
+}
+
+unsigned int vpx_variance32x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, sse, &sum);
+  return *sse - (((int64_t)sum * sum) >> 10);  // >> 10 = / (32 * 32)
+}
+
+unsigned int vpx_variance32x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, 32, 32, &sse1, &sum1);
+  variance_neon_w8(a + (32 * a_stride), a_stride,
+                   b + (32 * b_stride), b_stride, 32, 32,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (32 * 64)
+}
+
+unsigned int vpx_variance64x32_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride,
+                   b + (16 * b_stride), b_stride, 64, 16,
+                   &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 11);  // >> 11 = / (64 * 32)
+}
+
+unsigned int vpx_variance64x64_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum1, sum2;
+  uint32_t sse1, sse2;
+
+  variance_neon_w8(a, a_stride, b, b_stride, 64, 16, &sse1, &sum1);
+  variance_neon_w8(a + (16 * a_stride), a_stride,
+                   b + (16 * b_stride), b_stride, 64, 16,
+                   &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 2 * a_stride), a_stride,
+                   b + (16 * 2 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);
+  sse1 += sse2;
+  sum1 += sum2;
+
+  variance_neon_w8(a + (16 * 3 * a_stride), a_stride,
+                   b + (16 * 3 * b_stride), b_stride,
+                   64, 16, &sse2, &sum2);
+  *sse = sse1 + sse2;
+  sum1 += sum2;
+  return *sse - (((int64_t)sum1 * sum1) >> 12);  // >> 12 = / (64 * 64)
+}
+
+unsigned int vpx_variance16x8_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 4; i++) {
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
+        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
+        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_variance8x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    uint8x8_t d0u8, d2u8, d4u8, d6u8;
+    int16x4_t d22s16, d23s16, d24s16, d25s16;
+    uint32x2_t d0u32, d10u32;
+    int64x1_t d0s64, d1s64;
+    uint16x8_t q11u16, q12u16;
+    int32x4_t q8s32, q9s32, q10s32;
+    int64x2_t q0s64, q1s64, q5s64;
+
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {
+        d0u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        d2u8 = vld1_u8(src_ptr);
+        src_ptr += source_stride;
+        __builtin_prefetch(src_ptr);
+
+        d4u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        d6u8 = vld1_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        __builtin_prefetch(ref_ptr);
+
+        q11u16 = vsubl_u8(d0u8, d4u8);
+        q12u16 = vsubl_u8(d2u8, d6u8);
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
+        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
+        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+    }
+
+    q10s32 = vaddq_s32(q10s32, q9s32);
+    q0s64 = vpaddlq_s32(q8s32);
+    q1s64 = vpaddlq_s32(q10s32);
+
+    d0s64 = vadd_s64(vget_low_s64(q0s64), vget_high_s64(q0s64));
+    d1s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
+                      vreinterpret_s32_s64(d0s64));
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
+
+    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 7);
+    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
+
+    return vget_lane_u32(d0u32, 0);
+}
+
+unsigned int vpx_mse16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    int64x1_t d0s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+
+    q7s32 = vdupq_n_s32(0);
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q10s32 = vaddq_s32(q7s32, q9s32);
+
+    q1s64 = vpaddlq_s32(q10s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int vpx_get4x4sse_cs_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride) {
+    int16x4_t d22s16, d24s16, d26s16, d28s16;
+    int64x1_t d0s64;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d4u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d1u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d5u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d6u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d3u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d7u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+
+    q11u16 = vsubl_u8(d0u8, d4u8);
+    q12u16 = vsubl_u8(d1u8, d5u8);
+    q13u16 = vsubl_u8(d2u8, d6u8);
+    q14u16 = vsubl_u8(d3u8, d7u8);
+
+    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+    q7s32 = vmull_s16(d22s16, d22s16);
+    q8s32 = vmull_s16(d24s16, d24s16);
+    q9s32 = vmull_s16(d26s16, d26s16);
+    q10s32 = vmull_s16(d28s16, d28s16);
+
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q9s32 = vaddq_s32(q7s32, q9s32);
+
+    q1s64 = vpaddlq_s32(q9s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
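Editorial note (not part of the patch): the w * h < 2048 restriction on
variance_neon_w8 follows from the int16x8_t accumulator: each of its eight
lanes sums (w * h) / 8 differences, each in [-255, 255]. A quick check of
that bound:

#include <assert.h>
#include <stdint.h>

static void check_lane_bound(int w, int h) {
  const int diffs_per_lane = (w * h) / 8;
  /* Worst case per lane: every difference is +255 (or every one -255). */
  assert(diffs_per_lane * 255 <= INT16_MAX);
}

check_lane_bound(32, 32) passes (128 * 255 = 32640 <= 32767), while a full
64x32 block would reach 256 * 255 = 65280 and overflow; hence the 64-wide
variants above call the helper on 16-row slices and combine the partials.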
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 9783e43..c0c3ff9 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -33,6 +33,7 @@
   return sad;
 }
 
+// TODO(johannkoenig): this moved to vpx_dsp, should be able to clean this up.
 /* Remove dependency on vp9 variance function by duplicating vp9_comp_avg_pred.
  * The function averages every corresponding element of the buffers and stores
  * the value in a third buffer, comp_pred.
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
new file mode 100644
index 0000000..084dd7b
--- /dev/null
+++ b/vpx_dsp/variance.c
@@ -0,0 +1,306 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int  a_stride,
+                                const unsigned char *b, int  b_stride) {
+  int distortion = 0;
+  int r, c;
+
+  for (r = 0; r < 4; r++) {
+    for (c = 0; c < 4; c++) {
+      int diff = a[c] - b[c];
+      distortion += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+
+  return distortion;
+}
+
+unsigned int vpx_get_mb_ss_c(const int16_t *a) {
+  unsigned int i, sum = 0;
+
+  for (i = 0; i < 256; ++i) {
+    sum += a[i] * a[i];
+  }
+
+  return sum;
+}
+
+static void variance(const uint8_t *a, int  a_stride,
+                     const uint8_t *b, int  b_stride,
+                     int  w, int  h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+#define VAR(W, H) \
+unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                       const uint8_t *b, int b_stride, \
+                                       unsigned int *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+/* Identical to the variance call except it takes an additional parameter,
+ * sum, and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / (w * h).
+ */
+#define GET_VAR(W, H) \
+void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
+                             const uint8_t *b, int b_stride, \
+                             unsigned int *sse, int *sum) { \
+  variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+}
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / (w * h) term and returns sse in addition to modifying the
+ * passed-in variable.
+ */
+#define MSE(W, H) \
+unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                  const uint8_t *b, int b_stride, \
+                                  unsigned int *sse) { \
+  int sum; \
+  variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+VAR(64, 64)
+VAR(64, 32)
+VAR(32, 64)
+VAR(32, 32)
+VAR(32, 16)
+VAR(16, 32)
+VAR(16, 16)
+VAR(16, 8)
+VAR(8, 16)
+VAR(8, 8)
+VAR(8, 4)
+VAR(4, 8)
+VAR(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
+
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
+                         int height, const uint8_t *ref, int ref_stride) {
+  int i, j;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_variance64(const uint8_t *a8, int  a_stride,
+                              const uint8_t *b8, int  b_stride,
+                              int w, int h, uint64_t *sse, uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void highbd_8_variance(const uint8_t *a8, int  a_stride,
+                              const uint8_t *b8, int  b_stride,
+                              int w, int h, unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
+
+static void highbd_10_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               int w, int h, unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
+}
+
+static void highbd_12_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               int w, int h, unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
+  *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
+}
+
+#define HIGHBD_VAR(W, H) \
+unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+                                                int a_stride, \
+                                                const uint8_t *b, \
+                                                int b_stride, \
+                                                unsigned int *sse) { \
+  int sum; \
+  highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+                                                 int a_stride, \
+                                                 const uint8_t *b, \
+                                                 int b_stride, \
+                                                 unsigned int *sse) { \
+  int sum; \
+  highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+                                                 int a_stride, \
+                                                 const uint8_t *b, \
+                                                 int b_stride, \
+                                                 unsigned int *sse) { \
+  int sum; \
+  highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGHBD_GET_VAR(S) \
+void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                    const uint8_t *ref, int ref_stride, \
+                                    unsigned int *sse, int *sum) { \
+  highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse, int *sum) { \
+  highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse, int *sum) { \
+  highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGHBD_MSE(W, H) \
+unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+                                         int src_stride, \
+                                         const uint8_t *ref, \
+                                         int ref_stride, \
+                                         unsigned int *sse) { \
+  int sum; \
+  highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+                                            int src_stride, \
+                                            const uint8_t *ref, \
+                                            int ref_stride, \
+                                            unsigned int *sse) { \
+  int sum; \
+  highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+                                            int src_stride, \
+                                            const uint8_t *ref, \
+                                            int ref_stride, \
+                                            unsigned int *sse) { \
+  int sum; \
+  highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+HIGHBD_GET_VAR(8)
+HIGHBD_GET_VAR(16)
+
+HIGHBD_MSE(16, 16)
+HIGHBD_MSE(16, 8)
+HIGHBD_MSE(8, 16)
+HIGHBD_MSE(8, 8)
+
+HIGHBD_VAR(64, 64)
+HIGHBD_VAR(64, 32)
+HIGHBD_VAR(32, 64)
+HIGHBD_VAR(32, 32)
+HIGHBD_VAR(32, 16)
+HIGHBD_VAR(16, 32)
+HIGHBD_VAR(16, 16)
+HIGHBD_VAR(16, 8)
+HIGHBD_VAR(8, 16)
+HIGHBD_VAR(8, 8)
+HIGHBD_VAR(8, 4)
+HIGHBD_VAR(4, 8)
+HIGHBD_VAR(4, 4)
+
+void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                              int width, int height, const uint8_t *ref8,
+                              int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
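Editorial note (not part of the patch): for readers unfamiliar with the
token-pasting macros, VAR(4, 4) above expands, conceptually, to:

unsigned int vpx_variance4x4_c(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               unsigned int *sse) {
  int sum;
  variance(a, a_stride, b, b_stride, 4, 4, sse, &sum);
  return *sse - (((int64_t)sum * sum) / (4 * 4));
}

The C reference writes the division out; since every supported W * H is a
power of two, the SIMD versions implement it as a right shift instead.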
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 606515d..f23534a 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -17,6 +17,7 @@
 DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
 
+
 DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
@@ -29,9 +30,28 @@
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
+
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS
 
+ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
+DSP_SRCS-yes            += variance.c
+
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
+
+DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
+DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
+DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
+endif  # CONFIG_VP9_HIGHBITDEPTH
+endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
 
 DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ebec9ec..55271cf 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -392,4 +392,212 @@
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_ENCODERS
 
+if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
+
+add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x64 sse2 avx2 neon/;
+
+add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance64x32 sse2 avx2 neon/;
+
+add_proto qw/unsigned int vpx_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x64 sse2 neon/;
+
+add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x32 sse2 avx2 neon/;
+
+add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance32x16 sse2 avx2/;
+
+add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x32 sse2/;
+
+add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x16 mmx sse2 avx2 media neon/;
+
+add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance16x8 mmx sse2 neon/;
+
+add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x16 mmx sse2 neon/;
+
+add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x8 mmx sse2 media neon/;
+
+add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance8x4 sse2/;
+
+add_proto qw/unsigned int vpx_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x8 sse2/;
+
+add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_variance4x4 mmx sse2/;
+
+
+add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get16x16var sse2 avx2 neon/;
+
+add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vpx_get8x8var mmx sse2 neon/;
+
+add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x16 mmx sse2 avx2 media neon/;
+
+add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse16x8 sse2/;
+
+add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x16 sse2/;
+
+add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_mse8x8 sse2/;
+
+add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
+  specialize qw/vpx_get_mb_ss mmx sse2/;
+
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
+  specialize qw/vpx_get4x4sse_cs neon/;
+
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance64x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x64 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance32x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x32 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance16x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_variance8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+
+  add_proto qw/void vpx_highbd_8_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_8_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/void vpx_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  add_proto qw/void vpx_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_8_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_8_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_8_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_10_mse8x8 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse16x16 sse2/;
+
+  add_proto qw/unsigned int vpx_highbd_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  add_proto qw/unsigned int vpx_highbd_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vpx_highbd_12_mse8x8 sse2/;
+
+  add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+}  # CONFIG_VP9_HIGHBITDEPTH
+}  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
+
 1;
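
Each add_proto above becomes a dispatch entry in the generated vpx_dsp_rtcd.h, and specialize lists the per-ISA candidates resolved at runtime from CPU flags. A minimal consumer sketch, assuming a build tree where the generated header and libvpx are available to link against, and the setup function is named vpx_dsp_rtcd() per the RTCD generator's convention:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "./vpx_dsp_rtcd.h"  /* generated from vpx_dsp_rtcd_defs.pl */

int main(void) {
  uint8_t src[16 * 16], ref[16 * 16];
  unsigned int sse, var;
  memset(src, 100, sizeof(src));
  memset(ref, 90, sizeof(ref));
  vpx_dsp_rtcd();  /* select mmx/sse2/avx2/neon/... variants for this CPU */
  /* Constant blocks differing by 10: sse = 256 * 10^2 = 25600, variance 0. */
  var = vpx_variance16x16(src, 16, ref, 16, &sse);
  printf("sse=%u var=%u\n", sse, var);
  return 0;
}
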
diff --git a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm b/vpx_dsp/x86/highbd_variance_impl_sse2.asm
similarity index 97%
rename from vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
rename to vpx_dsp/x86/highbd_variance_impl_sse2.asm
index 821dd06..923418a 100644
--- a/vp9/encoder/x86/vp9_highbd_variance_impl_sse2.asm
+++ b/vpx_dsp/x86/highbd_variance_impl_sse2.asm
@@ -11,7 +11,7 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;unsigned int vp9_highbd_calc16x16var_sse2
+;unsigned int vpx_highbd_calc16x16var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
 ;    int             source_stride,
@@ -20,8 +20,8 @@
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vp9_highbd_calc16x16var_sse2) PRIVATE
-sym(vp9_highbd_calc16x16var_sse2):
+global sym(vpx_highbd_calc16x16var_sse2) PRIVATE
+sym(vpx_highbd_calc16x16var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
@@ -164,7 +164,7 @@
     ret
 
 
-;unsigned int vp9_highbd_calc8x8var_sse2
+;unsigned int vpx_highbd_calc8x8var_sse2
 ;(
 ;    unsigned char   *  src_ptr,
 ;    int             source_stride,
@@ -173,8 +173,8 @@
 ;    unsigned int    *  SSE,
 ;    int             *  Sum
 ;)
-global sym(vp9_highbd_calc8x8var_sse2) PRIVATE
-sym(vp9_highbd_calc8x8var_sse2):
+global sym(vpx_highbd_calc8x8var_sse2) PRIVATE
+sym(vpx_highbd_calc8x8var_sse2):
     push        rbp
     mov         rbp, rsp
     SHADOW_ARGS_TO_STACK 6
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
new file mode 100644
index 0000000..343c047
--- /dev/null
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -0,0 +1,245 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_ports/mem.h"
+
+typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
+                                        const uint16_t *ref, int ref_stride,
+                                        uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc8x8var_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    uint32_t *sse, int *sum);
+
+uint32_t vpx_highbd_calc16x16var_sse2(const uint16_t *src, int src_stride,
+                                      const uint16_t *ref, int ref_stride,
+                                      uint32_t *sse, int *sum);
+
+static void highbd_8_variance_sse2(const uint16_t *src, int src_stride,
+                                   const uint16_t *ref, int ref_stride,
+                                   int w, int h, uint32_t *sse, int *sum,
+                                   high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+static void highbd_10_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+static void highbd_12_variance_sse2(const uint16_t *src, int src_stride,
+                                    const uint16_t *ref, int ref_stride,
+                                    int w, int h, uint32_t *sse, int *sum,
+                                    high_variance_fn_t var_fn, int block_size) {
+  int i, j;
+  uint64_t sse_long = 0;
+  int64_t sum_long = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      sse_long += sse0;
+      sum_long += sum0;
+    }
+  }
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+
+#define HIGH_GET_VAR(S) \
+void vpx_highbd_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                       const uint8_t *ref8, int ref_stride, \
+                                       uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+} \
+\
+void vpx_highbd_10_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 2); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 4); \
+} \
+\
+void vpx_highbd_12_get##S##x##S##var_sse2(const uint8_t *src8, int src_stride, \
+                                          const uint8_t *ref8, int ref_stride, \
+                                          uint32_t *sse, int *sum) { \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  vpx_highbd_calc##S##x##S##var_sse2(src, src_stride, ref, ref_stride, \
+                                     sse, sum); \
+  *sum = ROUND_POWER_OF_TWO(*sum, 4); \
+  *sse = ROUND_POWER_OF_TWO(*sse, 8); \
+}
+
+HIGH_GET_VAR(16);
+HIGH_GET_VAR(8);
+
+#undef HIGH_GET_VAR
+
+#define VAR_FN(w, h, block_size, shift) \
+uint32_t vpx_highbd_8_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+                         vpx_highbd_calc##block_size##x##block_size##var_sse2, \
+                         block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_10_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_10_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+} \
+\
+uint32_t vpx_highbd_12_variance##w##x##h##_sse2( \
+    const uint8_t *src8, int src_stride, \
+    const uint8_t *ref8, int ref_stride, uint32_t *sse) { \
+  int sum; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); \
+  highbd_12_variance_sse2( \
+      src, src_stride, ref, ref_stride, w, h, sse, &sum, \
+      vpx_highbd_calc##block_size##x##block_size##var_sse2, block_size); \
+  return *sse - (((int64_t)sum * sum) >> shift); \
+}
+
+VAR_FN(64, 64, 16, 12);
+VAR_FN(64, 32, 16, 11);
+VAR_FN(32, 64, 16, 11);
+VAR_FN(32, 32, 16, 10);
+VAR_FN(32, 16, 16, 9);
+VAR_FN(16, 32, 16, 9);
+VAR_FN(16, 16, 16, 8);
+VAR_FN(16, 8, 8, 7);
+VAR_FN(8, 16, 8, 7);
+VAR_FN(8, 8, 8, 6);
+
+#undef VAR_FN
+
+unsigned int vpx_highbd_8_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                      const uint8_t *ref8, int ref_stride,
+                                      unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                         sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_10_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_12_mse16x16_sse2(const uint8_t *src8, int src_stride,
+                                         const uint8_t *ref8, int ref_stride,
+                                         unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 16, 16,
+                          sse, &sum, vpx_highbd_calc16x16var_sse2, 16);
+  return *sse;
+}
+
+unsigned int vpx_highbd_8_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                    const uint8_t *ref8, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_8_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                         sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vpx_highbd_10_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_10_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
+
+unsigned int vpx_highbd_12_mse8x8_sse2(const uint8_t *src8, int src_stride,
+                                       const uint8_t *ref8, int ref_stride,
+                                       unsigned int *sse) {
+  int sum;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  highbd_12_variance_sse2(src, src_stride, ref, ref_stride, 8, 8,
+                          sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
+  return *sse;
+}
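
The 10- and 12-bit drivers in this file accumulate the per-block results in 64-bit integers before rounding, unlike the 8-bit one. A quick worked bound (a sketch, maxima computed inline) shows why: the worst-case SSE of a 12-bit 64x64 block overflows 32 bits.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint64_t max8  = 255ULL * 255ULL;    /* worst per-pixel sq. error */
  const uint64_t max12 = 4095ULL * 4095ULL;
  const uint64_t sse8  = max8 * 64 * 64;     /* ~2.7e8: fits in 32 bits */
  const uint64_t sse12 = max12 * 64 * 64;    /* ~6.9e10: needs 64 bits */
  printf("8-bit  64x64 max sse: %llu\n", (unsigned long long)sse8);
  printf("12-bit 64x64 max sse: %llu (UINT32_MAX is %llu)\n",
         (unsigned long long)sse12, (unsigned long long)UINT32_MAX);
  return 0;
}
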
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
new file mode 100644
index 0000000..82cef4a
--- /dev/null
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);
+
+void vpx_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
+
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int  ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          get_var_avx2 var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += 16) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(&src[src_stride * i + j], src_stride,
+             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
+
+
+unsigned int vpx_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vpx_get16x16var_avx2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
+unsigned int vpx_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vpx_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
+}
+
+unsigned int vpx_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 10);
+}
+
+unsigned int vpx_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vpx_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
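
Every return in this file computes variance = sse - sum^2/N with N = w*h a power of two, so the division becomes the final shift: 8 for 16x16 up through 12 for 64x64. The 16x16 case can square in 32 bits (|sum| <= 255*256, whose square still fits in an unsigned int); the larger blocks promote to int64_t first. A small sketch confirming the shift choice:

#include <stdio.h>

static int log2_int(int n) {  /* n is a power of two here */
  int s = 0;
  while (n > 1) { n >>= 1; ++s; }
  return s;
}

int main(void) {
  const int sizes[][2] = { {16, 16}, {32, 16}, {32, 32}, {64, 32}, {64, 64} };
  int i;
  for (i = 0; i < 5; ++i) {
    const int w = sizes[i][0], h = sizes[i][1];
    /* Prints shifts 8, 9, 10, 11, 12 -- matching the returns above. */
    printf("%dx%d -> shift %d\n", w, h, log2_int(w * h));
  }
  return 0;
}
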
diff --git a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c
similarity index 97%
rename from vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
rename to vpx_dsp/x86/variance_impl_avx2.c
index ee76a31..0e40959 100644
--- a/vp9/encoder/x86/vp9_variance_impl_intrin_avx2.c
+++ b/vpx_dsp/x86/variance_impl_avx2.c
@@ -10,9 +10,9 @@
 
 #include <immintrin.h>  // AVX2
 
-#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
-void vp9_get16x16var_avx2(const unsigned char *src_ptr,
+void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                           int source_stride,
                           const unsigned char *ref_ptr,
                           int recon_stride,
@@ -123,7 +123,7 @@
     }
 }
 
-void vp9_get32x32var_avx2(const unsigned char *src_ptr,
+void vpx_get32x32var_avx2(const unsigned char *src_ptr,
                           int source_stride,
                           const unsigned char *ref_ptr,
                           int recon_stride,
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
new file mode 100644
index 0000000..a8d7d99
--- /dev/null
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -0,0 +1,424 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
+global sym(vpx_get_mb_ss_mmx) PRIVATE
+sym(vpx_get_mb_ss_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 8
+    ; end prolog
+
+        mov         rax, arg(0) ;src_ptr
+        mov         rcx, 16
+        pxor        mm4, mm4
+
+.NEXTROW:
+        movq        mm0, [rax]
+        movq        mm1, [rax+8]
+        movq        mm2, [rax+16]
+        movq        mm3, [rax+24]
+        pmaddwd     mm0, mm0
+        pmaddwd     mm1, mm1
+        pmaddwd     mm2, mm2
+        pmaddwd     mm3, mm3
+
+        paddd       mm4, mm0
+        paddd       mm4, mm1
+        paddd       mm4, mm2
+        paddd       mm4, mm3
+
+        add         rax, 32
+        dec         rcx
+        ja          .NEXTROW
+        movq        QWORD PTR [rsp], mm4
+
+        ;return sum[0]+sum[1];
+        movsxd      rax, dword ptr [rsp]
+        movsxd      rcx, dword ptr [rsp+4]
+        add         rax, rcx
+
+
+    ; begin epilog
+    add rsp, 8
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+;void vpx_get8x8var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
+global sym(vpx_get8x8var_mmx) PRIVATE
+sym(vpx_get8x8var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7

+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+
+        ; Row 2
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 3
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 4
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 5
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        ;              movq        mm4, [rbx + rdx]
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 6
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 7
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movq        mm1, [rbx]                  ; Copy eight bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Row 8
+        movq        mm0, [rax]                  ; Copy eight bytes to mm0
+        movq        mm2, mm0                    ; Take copies
+        movq        mm3, mm1                    ; Take copies
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        punpckhbw   mm2, mm6                    ; unpack to higher precision
+        punpckhbw   mm3, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        psubsw      mm2, mm3                    ; A-B (high order) to MM2
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        paddw       mm5, mm2                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        pmaddwd     mm2, mm2                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        paddd       mm7, mm0                    ; accumulate in mm7
+        paddd       mm7, mm2                    ; accumulate in mm7
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+
+
+;void
+;vpx_get4x4var_mmx
+;(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *SSE,
+;    int *Sum
+;)
+global sym(vpx_get4x4var_mmx) PRIVATE
+sym(vpx_get4x4var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    push rsi
+    push rdi
+    push rbx
+    sub         rsp, 16
+    ; end prolog
+
+
+        pxor        mm5, mm5                    ; Blank mm5
+        pxor        mm6, mm6                    ; Blank mm6
+        pxor        mm7, mm7                    ; Blank mm7
+
+        mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
+        mov         rbx, arg(2) ;[ref_ptr]
+        movsxd      rcx, dword ptr arg(1) ;[source_stride]
+        movsxd      rdx, dword ptr arg(3) ;[recon_stride]
+
+        ; Row 1
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Row 2
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 3
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        add         rbx,rdx                     ; Inc pointer into ref data
+        add         rax,rcx                     ; Inc pointer into the new data
+        movd        mm1, [rbx]                  ; Copy four bytes to mm1
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+        ; Row 4
+        movd        mm0, [rax]                  ; Copy four bytes to mm0
+
+        punpcklbw   mm0, mm6                    ; unpack to higher precision
+        punpcklbw   mm1, mm6
+        psubsw      mm0, mm1                    ; A-B (low order) to MM0
+
+        paddw       mm5, mm0                    ; accumulate differences in mm5
+
+        pmaddwd     mm0, mm0                    ; square and accumulate
+        paddd       mm7, mm0                    ; accumulate in mm7
+
+
+        ; Now accumulate the final results.
+        movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
+        movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
+        movsx       rdx, WORD PTR [rsp+8]
+        movsx       rcx, WORD PTR [rsp+10]
+        movsx       rbx, WORD PTR [rsp+12]
+        movsx       rax, WORD PTR [rsp+14]
+        add         rdx, rcx
+        add         rbx, rax
+        add         rdx, rbx    ;XSum
+        movsxd      rax, DWORD PTR [rsp]
+        movsxd      rcx, DWORD PTR [rsp+4]
+        add         rax, rcx    ;XXSum
+        mov         rsi, arg(4) ;SSE
+        mov         rdi, arg(5) ;Sum
+        mov         dword ptr [rsi], eax
+        mov         dword ptr [rdi], edx
+        xor         rax, rax    ; return 0
+
+
+    ; begin epilog
+    add rsp, 16
+    pop rbx
+    pop rdi
+    pop rsi
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
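
The assembly above reports Sum and SSE through the last two pointer arguments. A scalar reference model of the same contract (names local to this sketch) is handy for spot-checking the assembly against the plain C in variance.c:

#include <stdint.h>

/* Plain-C model of vpx_get8x8var_mmx's outputs: the sum of differences
 * and the sum of squared differences over an 8x8 block. */
static void get8x8var_ref(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          unsigned int *sse, int *sum) {
  int i, j;
  *sse = 0;
  *sum = 0;
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      const int diff = src[j] - ref[j];
      *sum += diff;
      *sse += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
}
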
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
new file mode 100644
index 0000000..99dd741
--- /dev/null
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -0,0 +1,107 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+
+extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              unsigned int *sse, int *sum);
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int  a_stride,
+                                 const unsigned char *b, int  b_stride,
+                                 unsigned int *sse) {
+    unsigned int var;
+    int avg;
+
+    vpx_get4x4var_mmx(a, a_stride, b, b_stride, &var, &avg);
+    *sse = var;
+    return (var - (((unsigned int)avg * avg) >> 4));
+}
+
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int  a_stride,
+                                 const unsigned char *b, int  b_stride,
+                                 unsigned int *sse) {
+    unsigned int var;
+    int avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &var, &avg);
+    *sse = var;
+
+    return (var - (((unsigned int)avg * avg) >> 6));
+}
+
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int  a_stride,
+                              const unsigned char *b, int  b_stride,
+                              unsigned int *sse) {
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                      b + 8 * b_stride, b_stride, &sse2, &sum2);
+    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    *sse = var;
+    return var;
+}
+
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int  a_stride,
+                                   const unsigned char *b, int  b_stride,
+                                   unsigned int *sse) {
+    unsigned int sse0, sse1, sse2, sse3, var;
+    int sum0, sum1, sum2, sum3, avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                      b + 8 * b_stride, b_stride, &sse2, &sum2);
+    vpx_get8x8var_mmx(a + 8 * a_stride + 8, a_stride,
+                      b + 8 * b_stride + 8, b_stride, &sse3, &sum3);
+
+    var = sse0 + sse1 + sse2 + sse3;
+    avg = sum0 + sum1 + sum2 + sum3;
+    *sse = var;
+    return (var - (((unsigned int)avg * avg) >> 8));
+}
+
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int  a_stride,
+                                  const unsigned char *b, int  b_stride,
+                                  unsigned int *sse) {
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8, a_stride, b + 8, b_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+    return (var - (((unsigned int)avg * avg) >> 7));
+}
+
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int  a_stride,
+                                  const unsigned char *b, int  b_stride,
+                                  unsigned int *sse) {
+    unsigned int sse0, sse1, var;
+    int sum0, sum1, avg;
+
+    vpx_get8x8var_mmx(a, a_stride, b, b_stride, &sse0, &sum0);
+    vpx_get8x8var_mmx(a + 8 * a_stride, a_stride,
+                      b + 8 * b_stride, b_stride, &sse1, &sum1);
+
+    var = sse0 + sse1;
+    avg = sum0 + sum1;
+    *sse = var;
+
+    return (var - (((unsigned int)avg * avg) >> 7));
+}
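
The pattern throughout this file -- two or four get8x8var calls, then a single correction -- works because sums and SSEs are additive over disjoint sub-blocks, and the variance needs only those two statistics. With d_i = a_i - b_i over N = w*h pixels:

    \sum_i (d_i - \bar{d})^2 = \sum_i d_i^2 - \frac{(\sum_i d_i)^2}{N}

so no second pass over the pixels is required, and since N is a power of two the division is a shift: 8 for 16x16, 7 for 16x8 and 8x16, 6 for 8x8, 4 for 4x4.
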
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
new file mode 100644
index 0000000..6256bc5
--- /dev/null
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -0,0 +1,309 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <emmintrin.h>  // SSE2
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_ports/mem.h"
+
+typedef void (*getNxMvar_fn_t) (const unsigned char *src, int src_stride,
+                                const unsigned char *ref, int ref_stride,
+                                unsigned int *sse, int *sum);
+
+unsigned int vpx_get_mb_ss_sse2(const int16_t *src) {
+  __m128i vsum = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 32; ++i) {
+    const __m128i v = _mm_loadu_si128((const __m128i *)src);
+    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
+    src += 8;
+  }
+
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  return  _mm_cvtsi128_si32(vsum);
+}
+
+#define READ64(p, stride, i) \
+  _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const uint32_t *)(p + i * stride)), \
+      _mm_cvtsi32_si128(*(const uint32_t *)(p + (i + 1) * stride)))
+
+static void get4x4var_sse2(const uint8_t *src, int src_stride,
+                           const uint8_t *ref, int ref_stride,
+                           unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i src0 = _mm_unpacklo_epi8(READ64(src, src_stride, 0), zero);
+  const __m128i src1 = _mm_unpacklo_epi8(READ64(src, src_stride, 2), zero);
+  const __m128i ref0 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 0), zero);
+  const __m128i ref1 = _mm_unpacklo_epi8(READ64(ref, ref_stride, 2), zero);
+  const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+  const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+  // sum
+  __m128i vsum = _mm_add_epi16(diff0, diff1);
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsum = _mm_add_epi32(_mm_madd_epi16(diff0, diff0),
+                       _mm_madd_epi16(diff1, diff1));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
+  *sse = _mm_cvtsi128_si32(vsum);
+}
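+
+// With only 16 differences of magnitude <= 255, |sum| <= 4080, so the
+// three 16-bit shift-and-add folds above cannot overflow a lane.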
+
+void vpx_get8x8var_sse2(const uint8_t *src, int src_stride,
+                        const uint8_t *ref, int ref_stride,
+                        unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 8; i += 2) {
+    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + i * src_stride)), zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + i * ref_stride)), zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(src + (i + 1) * src_stride)), zero);
+    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
+        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
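+
+// Here 64 differences bound |sum| by 64 * 255 = 16320, still safely
+// within int16_t through the full three-step fold.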
+
+void vpx_get16x16var_sse2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i vsum = _mm_setzero_si128();
+  __m128i vsse = _mm_setzero_si128();
+  int i;
+
+  for (i = 0; i < 16; ++i) {
+    const __m128i s = _mm_loadu_si128((const __m128i *)src);
+    const __m128i r = _mm_loadu_si128((const __m128i *)ref);
+
+    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
+    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
+    const __m128i diff0 = _mm_sub_epi16(src0, ref0);
+
+    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
+    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
+    const __m128i diff1 = _mm_sub_epi16(src1, ref1);
+
+    vsum = _mm_add_epi16(vsum, diff0);
+    vsum = _mm_add_epi16(vsum, diff1);
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
+    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
+
+    src += src_stride;
+    ref += ref_stride;
+  }
+
+  // sum
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
+  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
+  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
+         (int16_t)_mm_extract_epi16(vsum, 1);
+
+  // sse
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
+  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
+  *sse = _mm_cvtsi128_si32(vsse);
+}
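+
+// 16x16 is the tight case: after two folds each 16-bit lane carries
+// up to 128 differences (|sum| <= 32640, just inside int16_t), but a
+// third fold would merge 256 (up to 65280) and overflow. Hence the
+// final two lanes are extracted and added in 32-bit scalar code.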
+
+static void variance_sse2(const unsigned char *src, int src_stride,
+                          const unsigned char *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          getNxMvar_fn_t var_fn, int block_size) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i += block_size) {
+    for (j = 0; j < w; j += block_size) {
+      unsigned int sse0;
+      int sum0;
+      var_fn(src + src_stride * i + j, src_stride,
+             ref + ref_stride * i + j, ref_stride, &sse0, &sum0);
+      *sse += sse0;
+      *sum += sum0;
+    }
+  }
+}
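+
+// variance_sse2() tiles an arbitrary w x h block with block_size x
+// block_size kernels, accumulating partial sse/sum; each wrapper below
+// then applies sse - sum^2 / (w * h), with the division written as a
+// shift by log2(w * h).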
+
+unsigned int vpx_variance4x4_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  get4x4var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 4);
+}
+
+unsigned int vpx_variance8x4_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 4,
+                sse, &sum, get4x4var_sse2, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance4x8_sse2(const uint8_t *src, int src_stride,
+                                  const uint8_t *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 4, 8,
+                sse, &sum, get4x4var_sse2, 4);
+  return *sse - (((unsigned int)sum * sum) >> 5);
+}
+
+unsigned int vpx_variance8x8_sse2(const unsigned char *src, int src_stride,
+                                  const unsigned char *ref, int ref_stride,
+                                  unsigned int *sse) {
+  int sum;
+  vpx_get8x8var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 6);
+}
+
+unsigned int vpx_variance16x8_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 8,
+                sse, &sum, vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance8x16_sse2(const unsigned char *src, int src_stride,
+                                   const unsigned char *ref, int ref_stride,
+                                   unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 8, 16,
+                sse, &sum, vpx_get8x8var_sse2, 8);
+  return *sse - (((unsigned int)sum * sum) >> 7);
+}
+
+unsigned int vpx_variance16x16_sse2(const unsigned char *src, int src_stride,
+                                    const unsigned char *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  vpx_get16x16var_sse2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse - (((unsigned int)sum * sum) >> 8);
+}
+
+unsigned int vpx_variance32x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 10);
+}
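+
+// For 32x32 and the shapes below, the (int64_t) cast is necessary: a
+// 1024-pixel block bounds |sum| by 255 * 1024 = 261120, and even the
+// 512-pixel shapes reach 130560, so sum * sum overflows 32 bits. At
+// 16x16 and smaller, sum^2 <= 65280^2 still fits in an unsigned int.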
+
+unsigned int vpx_variance32x16_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance16x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 16, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 9);
+}
+
+unsigned int vpx_variance64x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 12);
+}
+
+unsigned int vpx_variance64x32_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
+unsigned int vpx_variance32x64_sse2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_sse2(src, src_stride, ref, ref_stride, 32, 64,
+                sse, &sum, vpx_get16x16var_sse2, 16);
+  return *sse - (((int64_t)sum * sum) >> 11);
+}
+
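+// The vpx_mse* kernels return the raw sum of squared errors: each
+// forwards to the matching variance routine for its sse side effect
+// and skips the mean correction entirely.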
+unsigned int vpx_mse8x8_sse2(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse) {
+  vpx_variance8x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse8x16_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vpx_variance8x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse16x8_sse2(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
+                              unsigned int *sse) {
+  vpx_variance16x8_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
+
+unsigned int vpx_mse16x16_sse2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
+  return *sse;
+}
diff --git a/vpx_ports/msvc.h b/vpx_ports/msvc.h
new file mode 100644
index 0000000..43a36e7
--- /dev/null
+++ b/vpx_ports/msvc.h
@@ -0,0 +1,22 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_PORTS_MSVC_H_
+#define VPX_PORTS_MSVC_H_
+#ifdef _MSC_VER
+
+#include "./vpx_config.h"
+
+# if _MSC_VER < 1900  // VS2015 provides snprintf
+#  define snprintf _snprintf
+# endif  // _MSC_VER < 1900
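+
+// Unlike C99 snprintf, _snprintf does not guarantee NUL termination
+// when the output is truncated, so callers must not rely on a
+// trailing '\0' at the buffer boundary.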
+
+#endif  // _MSC_VER
+#endif  // VPX_PORTS_MSVC_H_
diff --git a/vpx_ports/vpx_ports.mk b/vpx_ports/vpx_ports.mk
index dfc75ab..ab7fc4a 100644
--- a/vpx_ports/vpx_ports.mk
+++ b/vpx_ports/vpx_ports.mk
@@ -12,6 +12,7 @@
 PORTS_SRCS-yes += vpx_ports.mk
 
 PORTS_SRCS-yes += mem.h
+PORTS_SRCS-yes += msvc.h
 PORTS_SRCS-yes += vpx_timer.h
 
 ifeq ($(ARCH_X86)$(ARCH_X86_64),yes)