Merge "VP9_LPF_VERTICAL_16_DUAL_SSE2 optimization"
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index 484deb5..ee7de6b 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -692,6 +692,7 @@
     vpx_codec_control(&codec, VP9E_SET_FRAME_PERIODIC_BOOST, 0);
     vpx_codec_control(&codec, VP9E_SET_NOISE_SENSITIVITY, 0);
     vpx_codec_control(&codec, VP8E_SET_STATIC_THRESHOLD, 0);
+    vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, (cfg.g_threads >> 1));
     if (vpx_codec_control(&codec, VP9E_SET_SVC, layering_mode > 0 ? 1: 0))
       die_codec(&codec, "Failed to set SVC");
diff --git a/libs.mk b/libs.mk
index 1650028..1dc28e2 100644
--- a/libs.mk
+++ b/libs.mk
@@ -50,8 +50,8 @@
 include $(SRC_PATH_BARE)/vpx_dsp/vpx_dsp.mk
 CODEC_SRCS-yes += $(addprefix vpx_dsp/,$(call enabled,DSP_SRCS))
 
-include $(SRC_PATH_BARE)/vpx_thread/vpx_thread.mk
-CODEC_SRCS-yes += $(addprefix vpx_thread/,$(call enabled,THREAD_SRCS))
+include $(SRC_PATH_BARE)/vpx_util/vpx_util.mk
+CODEC_SRCS-yes += $(addprefix vpx_util/,$(call enabled,UTIL_SRCS))
 
 ifneq ($(CONFIG_VP8_ENCODER)$(CONFIG_VP8_DECODER),)
   VP8_PREFIX=vp8/
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 327dab7..db7e961 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -647,7 +647,7 @@
 TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) {
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpKernel *filters =
-        vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
     for (int i = 0; i < kNumFilters; i++) {
       const int p0 = filters[i][0] + filters[i][1];
       const int p1 = filters[i][2] + filters[i][3];
@@ -685,9 +685,9 @@
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpKernel *filters =
-        vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
     const InterpKernel *const eighttap_smooth =
-        vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
+        vp9_filter_kernels[EIGHTTAP_SMOOTH];
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -764,9 +764,9 @@
 
   for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
     const InterpKernel *filters =
-        vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+        vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
     const InterpKernel *const eighttap_smooth =
-        vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
+        vp9_filter_kernels[EIGHTTAP_SMOOTH];
 
     for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
       for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
@@ -863,9 +863,9 @@
 
       for (int filter_bank = 0; filter_bank < kNumFilterBanks; ++filter_bank) {
         const InterpKernel *filters =
-            vp9_get_interp_kernel(static_cast<INTERP_FILTER>(filter_bank));
+            vp9_filter_kernels[static_cast<INTERP_FILTER>(filter_bank)];
         const InterpKernel *const eighttap_smooth =
-            vp9_get_interp_kernel(EIGHTTAP_SMOOTH);
+            vp9_filter_kernels[EIGHTTAP_SMOOTH];
         for (int filter_x = 0; filter_x < kNumFilters; ++filter_x) {
           for (int filter_y = 0; filter_y < kNumFilters; ++filter_y) {
             wrapper_filter_block2d_8_c(in, kInputStride,
@@ -1017,7 +1017,7 @@
 TEST_P(ConvolveTest, CheckScalingFiltering) {
   uint8_t* const in = input();
   uint8_t* const out = output();
-  const InterpKernel *const eighttap = vp9_get_interp_kernel(EIGHTTAP);
+  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
 
   SetConstantInput(127);
 
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 2d1ad16..61bfe50 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -684,8 +684,6 @@
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
         make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
 #else
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
@@ -696,8 +694,6 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
     NEON, FwdTrans8x8DCT,
     ::testing::Values(
@@ -716,8 +712,6 @@
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
-// returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8DCT,
     ::testing::Values(
@@ -746,8 +740,6 @@
         make_tuple(&vp9_highbd_fdct8x8_sse2,
                    &idct8x8_64_add_12_sse2, 12, VPX_BITS_12)));
 
-// TODO(jingning): re-enable after these handle the expanded range [0, 65535]
-// returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(
@@ -773,8 +765,6 @@
 
 #if HAVE_SSSE3 && CONFIG_USE_X86INC && ARCH_X86_64 && \
     !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
-// TODO(jingning): re-enable after this handles the expanded range [0, 65535]
-// returned from Rand16().
 INSTANTIATE_TEST_CASE_P(
     SSSE3, FwdTrans8x8DCT,
     ::testing::Values(
diff --git a/test/idct_test.cc b/test/idct_test.cc
index 2ff9e64..34b721f 100644
--- a/test/idct_test.cc
+++ b/test/idct_test.cc
@@ -113,4 +113,8 @@
 INSTANTIATE_TEST_CASE_P(MMX, IDCTTest,
                         ::testing::Values(vp8_short_idct4x4llm_mmx));
 #endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, IDCTTest,
+                        ::testing::Values(vp8_short_idct4x4llm_msa));
+#endif
 }
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
deleted file mode 100644
index ff42725..0000000
--- a/test/subtract_test.cc
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "./vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "vp8/common/blockd.h"
-#include "vp8/encoder/block.h"
-#include "vpx_mem/vpx_mem.h"
-
-typedef void (*SubtractBlockFunc)(BLOCK *be, BLOCKD *bd, int pitch);
-
-namespace {
-
-class SubtractBlockTest : public ::testing::TestWithParam<SubtractBlockFunc> {
- public:
-  virtual void TearDown() {
-    libvpx_test::ClearSystemState();
-  }
-};
-
-using libvpx_test::ACMRandom;
-
-TEST_P(SubtractBlockTest, SimpleSubtract) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  BLOCK be;
-  BLOCKD bd;
-  // in libvpx, this stride is always 16
-  const int kDiffPredStride = 16;
-  const int kSrcStride[] = {32, 16, 8, 4, 0};
-  const int kBlockWidth = 4;
-  const int kBlockHeight = 4;
-
-  // Allocate... align to 16 for mmx/sse tests
-  uint8_t *source = reinterpret_cast<uint8_t*>(
-      vpx_memalign(16, kBlockHeight * kSrcStride[0] * sizeof(*source)));
-  be.src_diff = reinterpret_cast<int16_t*>(
-      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*be.src_diff)));
-  bd.predictor = reinterpret_cast<unsigned char*>(
-      vpx_memalign(16, kBlockHeight * kDiffPredStride * sizeof(*bd.predictor)));
-
-  for (int i = 0; kSrcStride[i] > 0; ++i) {
-    // start at block0
-    be.src = 0;
-    be.base_src = &source;
-    be.src_stride = kSrcStride[i];
-
-    // set difference
-    int16_t *src_diff = be.src_diff;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        src_diff[c] = static_cast<int16_t>(0xa5a5u);
-      }
-      src_diff += kDiffPredStride;
-    }
-
-    // set destination
-    uint8_t *base_src = *be.base_src;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        base_src[c] = rnd.Rand8();
-      }
-      base_src += be.src_stride;
-    }
-
-    // set predictor
-    uint8_t *predictor = bd.predictor;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        predictor[c] = rnd.Rand8();
-      }
-      predictor += kDiffPredStride;
-    }
-
-    ASM_REGISTER_STATE_CHECK(GetParam()(&be, &bd, kDiffPredStride));
-
-    base_src = *be.base_src;
-    src_diff = be.src_diff;
-    predictor = bd.predictor;
-    for (int r = 0; r < kBlockHeight; ++r) {
-      for (int c = 0; c < kBlockWidth; ++c) {
-        EXPECT_EQ(base_src[c], (src_diff[c] + predictor[c])) << "r = " << r
-                                                             << ", c = " << c;
-      }
-      src_diff += kDiffPredStride;
-      predictor += kDiffPredStride;
-      base_src += be.src_stride;
-    }
-  }
-  vpx_free(be.src_diff);
-  vpx_free(source);
-  vpx_free(bd.predictor);
-}
-
-INSTANTIATE_TEST_CASE_P(C, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_c));
-
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_neon));
-#endif
-
-#if HAVE_MMX
-INSTANTIATE_TEST_CASE_P(MMX, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_mmx));
-#endif
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, SubtractBlockTest,
-                        ::testing::Values(vp8_subtract_b_sse2));
-#endif
-
-}  // namespace
diff --git a/test/test.mk b/test/test.mk
index 4d63a63..a8a365e 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -104,7 +104,6 @@
 LIBVPX_TEST_SRCS-$(CONFIG_POSTPROC)    += pp_filter_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += quantize_test.cc
@@ -159,6 +158,7 @@
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_VP9_TEMPORAL_DENOISING),yesyes)
 LIBVPX_TEST_SRCS-$(HAVE_SSE2) += vp9_denoiser_sse2_test.cc
 endif
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_arf_freq_test.cc
 
 endif # VP9
 
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 614c4d9..64095bc 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -21,13 +21,6 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
-#if CONFIG_VP8_ENCODER
-# include "./vp8_rtcd.h"
-#endif  // CONFIG_VP8_ENCODER
-#if CONFIG_VP9_ENCODER
-# include "./vp9_rtcd.h"
-# include "vp9/encoder/vp9_variance.h"
-#endif  // CONFIG_VP9_ENCODER
 #include "./vpx_dsp_rtcd.h"
 
 namespace {
@@ -39,8 +32,15 @@
                                          int xoffset, int yoffset,
                                          const uint8_t *b, int b_stride,
                                          unsigned int *sse);
+typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
+                                            int xoffset, int yoffset,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse,
+                                            const uint8_t *second_pred);
 typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride);
+typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
+
 
 using ::std::tr1::get;
 using ::std::tr1::make_tuple;
@@ -166,8 +166,6 @@
                                 (l2w + l2h)));
 }
 
-typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
-
 class SumOfSquaresTest : public ::testing::TestWithParam<SumOfSquaresFunction> {
  public:
   SumOfSquaresTest() : func_(GetParam()) {}
@@ -687,9 +685,8 @@
   }
 }
 
-#if CONFIG_VP9_ENCODER
 template<>
-void SubpelVarianceTest<vp9_subp_avg_variance_fn_t>::RefTest() {
+void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
   for (int x = 0; x < 8; ++x) {
     for (int y = 0; y < 8; ++y) {
       if (!use_high_bit_depth_) {
@@ -726,11 +723,12 @@
     }
   }
 }
-#endif  // CONFIG_VP9_ENCODER
 
 typedef MseTest<Get4x4SseFunc> VpxSseTest;
 typedef MseTest<VarianceMxNFunc> VpxMseTest;
 typedef VarianceTest<VarianceMxNFunc> VpxVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
 
 TEST_P(VpxSseTest, Ref_sse) { RefTest_sse(); }
 TEST_P(VpxSseTest, Max_sse) { MaxTest_sse(); }
@@ -742,6 +740,9 @@
 TEST_P(VpxVarianceTest, OneQuarter) { OneQuarterTest(); }
 TEST_P(SumOfSquaresTest, Const) { ConstTest(); }
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 INSTANTIATE_TEST_CASE_P(C, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_c));
@@ -773,7 +774,6 @@
 const VarianceMxNFunc variance8x4_c = vpx_variance8x4_c;
 const VarianceMxNFunc variance4x8_c = vpx_variance4x8_c;
 const VarianceMxNFunc variance4x4_c = vpx_variance4x4_c;
-
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
     ::testing::Values(make_tuple(6, 6, variance64x64_c, 0),
@@ -790,9 +790,79 @@
                       make_tuple(2, 3, variance4x8_c, 0),
                       make_tuple(2, 2, variance4x4_c, 0)));
 
+const SubpixVarMxNFunc subpel_var64x64_c = vpx_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc subpel_var64x32_c = vpx_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc subpel_var32x64_c = vpx_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc subpel_var32x32_c = vpx_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc subpel_var32x16_c = vpx_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc subpel_var16x32_c = vpx_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc subpel_var16x16_c = vpx_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc subpel_var16x8_c = vpx_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc subpel_var8x16_c = vpx_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc subpel_var8x8_c = vpx_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc subpel_var8x4_c = vpx_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc subpel_var4x8_c = vpx_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc subpel_var4x4_c = vpx_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_var64x64_c, 0),
+                      make_tuple(6, 5, subpel_var64x32_c, 0),
+                      make_tuple(5, 6, subpel_var32x64_c, 0),
+                      make_tuple(5, 5, subpel_var32x32_c, 0),
+                      make_tuple(5, 4, subpel_var32x16_c, 0),
+                      make_tuple(4, 5, subpel_var16x32_c, 0),
+                      make_tuple(4, 4, subpel_var16x16_c, 0),
+                      make_tuple(4, 3, subpel_var16x8_c, 0),
+                      make_tuple(3, 4, subpel_var8x16_c, 0),
+                      make_tuple(3, 3, subpel_var8x8_c, 0),
+                      make_tuple(3, 2, subpel_var8x4_c, 0),
+                      make_tuple(2, 3, subpel_var4x8_c, 0),
+                      make_tuple(2, 2, subpel_var4x4_c, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_var64x64_c =
+    vpx_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var64x32_c =
+    vpx_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x64_c =
+    vpx_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x32_c =
+    vpx_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var32x16_c =
+    vpx_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x32_c =
+    vpx_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x16_c =
+    vpx_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var16x8_c =
+    vpx_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x16_c =
+    vpx_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x8_c = vpx_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var8x4_c = vpx_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x8_c = vpx_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc subpel_avg_var4x4_c = vpx_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_var64x64_c, 0),
+                      make_tuple(6, 5, subpel_avg_var64x32_c, 0),
+                      make_tuple(5, 6, subpel_avg_var32x64_c, 0),
+                      make_tuple(5, 5, subpel_avg_var32x32_c, 0),
+                      make_tuple(5, 4, subpel_avg_var32x16_c, 0),
+                      make_tuple(4, 5, subpel_avg_var16x32_c, 0),
+                      make_tuple(4, 4, subpel_avg_var16x16_c, 0),
+                      make_tuple(4, 3, subpel_avg_var16x8_c, 0),
+                      make_tuple(3, 4, subpel_avg_var8x16_c, 0),
+                      make_tuple(3, 3, subpel_avg_var8x8_c, 0),
+                      make_tuple(3, 2, subpel_avg_var8x4_c, 0),
+                      make_tuple(2, 3, subpel_avg_var4x8_c, 0),
+                      make_tuple(2, 2, subpel_avg_var4x4_c, 0)));
+
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
 typedef VarianceTest<VarianceMxNFunc> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<SubpixAvgVarMxNFunc>
+    VpxHBDSubpelAvgVarianceTest;
 
 TEST_P(VpxHBDMseTest, Ref_mse) { RefTest_mse(); }
 TEST_P(VpxHBDMseTest, Max_mse) { MaxTest_mse(); }
@@ -800,6 +870,9 @@
 TEST_P(VpxHBDVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxHBDVarianceTest, RefStride) { RefStrideTest(); }
 TEST_P(VpxHBDVarianceTest, OneQuarter) { OneQuarterTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, Ref) { RefTest(); }
+TEST_P(VpxHBDSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxHBDSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 /* TODO(debargha): This test does not support the highbd version
 const VarianceMxNFunc highbd_12_mse16x16_c = vpx_highbd_12_mse16x16_c;
@@ -844,7 +917,6 @@
 const VarianceMxNFunc highbd_12_variance8x4_c = vpx_highbd_12_variance8x4_c;
 const VarianceMxNFunc highbd_12_variance4x8_c = vpx_highbd_12_variance4x8_c;
 const VarianceMxNFunc highbd_12_variance4x4_c = vpx_highbd_12_variance4x4_c;
-
 const VarianceMxNFunc highbd_10_variance64x64_c = vpx_highbd_10_variance64x64_c;
 const VarianceMxNFunc highbd_10_variance64x32_c = vpx_highbd_10_variance64x32_c;
 const VarianceMxNFunc highbd_10_variance32x64_c = vpx_highbd_10_variance32x64_c;
@@ -858,7 +930,6 @@
 const VarianceMxNFunc highbd_10_variance8x4_c = vpx_highbd_10_variance8x4_c;
 const VarianceMxNFunc highbd_10_variance4x8_c = vpx_highbd_10_variance4x8_c;
 const VarianceMxNFunc highbd_10_variance4x4_c = vpx_highbd_10_variance4x4_c;
-
 const VarianceMxNFunc highbd_8_variance64x64_c = vpx_highbd_8_variance64x64_c;
 const VarianceMxNFunc highbd_8_variance64x32_c = vpx_highbd_8_variance64x32_c;
 const VarianceMxNFunc highbd_8_variance32x64_c = vpx_highbd_8_variance32x64_c;
@@ -913,6 +984,247 @@
                       make_tuple(3, 2, highbd_8_variance8x4_c, 8),
                       make_tuple(2, 3, highbd_8_variance4x8_c, 8),
                       make_tuple(2, 2, highbd_8_variance4x4_c, 8)));
+
+const SubpixVarMxNFunc highbd_8_subpel_var64x64_c =
+    vpx_highbd_8_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var64x32_c =
+    vpx_highbd_8_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x64_c =
+    vpx_highbd_8_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x32_c =
+    vpx_highbd_8_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var32x16_c =
+    vpx_highbd_8_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x32_c =
+    vpx_highbd_8_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x16_c =
+    vpx_highbd_8_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var16x8_c =
+    vpx_highbd_8_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x16_c =
+    vpx_highbd_8_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x8_c =
+    vpx_highbd_8_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var8x4_c =
+    vpx_highbd_8_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x8_c =
+    vpx_highbd_8_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_8_subpel_var4x4_c =
+    vpx_highbd_8_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x64_c =
+    vpx_highbd_10_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var64x32_c =
+    vpx_highbd_10_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x64_c =
+    vpx_highbd_10_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x32_c =
+    vpx_highbd_10_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var32x16_c =
+    vpx_highbd_10_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x32_c =
+    vpx_highbd_10_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x16_c =
+    vpx_highbd_10_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var16x8_c =
+    vpx_highbd_10_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x16_c =
+    vpx_highbd_10_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x8_c =
+    vpx_highbd_10_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var8x4_c =
+    vpx_highbd_10_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x8_c =
+    vpx_highbd_10_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_10_subpel_var4x4_c =
+    vpx_highbd_10_sub_pixel_variance4x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x64_c =
+    vpx_highbd_12_sub_pixel_variance64x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var64x32_c =
+    vpx_highbd_12_sub_pixel_variance64x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x64_c =
+    vpx_highbd_12_sub_pixel_variance32x64_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x32_c =
+    vpx_highbd_12_sub_pixel_variance32x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var32x16_c =
+    vpx_highbd_12_sub_pixel_variance32x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x32_c =
+    vpx_highbd_12_sub_pixel_variance16x32_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x16_c =
+    vpx_highbd_12_sub_pixel_variance16x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var16x8_c =
+    vpx_highbd_12_sub_pixel_variance16x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x16_c =
+    vpx_highbd_12_sub_pixel_variance8x16_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x8_c =
+    vpx_highbd_12_sub_pixel_variance8x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var8x4_c =
+    vpx_highbd_12_sub_pixel_variance8x4_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x8_c =
+    vpx_highbd_12_sub_pixel_variance4x8_c;
+const SubpixVarMxNFunc highbd_12_subpel_var4x4_c =
+    vpx_highbd_12_sub_pixel_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_8_subpel_var64x64_c, 8),
+                      make_tuple(6, 5, highbd_8_subpel_var64x32_c, 8),
+                      make_tuple(5, 6, highbd_8_subpel_var32x64_c, 8),
+                      make_tuple(5, 5, highbd_8_subpel_var32x32_c, 8),
+                      make_tuple(5, 4, highbd_8_subpel_var32x16_c, 8),
+                      make_tuple(4, 5, highbd_8_subpel_var16x32_c, 8),
+                      make_tuple(4, 4, highbd_8_subpel_var16x16_c, 8),
+                      make_tuple(4, 3, highbd_8_subpel_var16x8_c, 8),
+                      make_tuple(3, 4, highbd_8_subpel_var8x16_c, 8),
+                      make_tuple(3, 3, highbd_8_subpel_var8x8_c, 8),
+                      make_tuple(3, 2, highbd_8_subpel_var8x4_c, 8),
+                      make_tuple(2, 3, highbd_8_subpel_var4x8_c, 8),
+                      make_tuple(2, 2, highbd_8_subpel_var4x4_c, 8),
+                      make_tuple(6, 6, highbd_10_subpel_var64x64_c, 10),
+                      make_tuple(6, 5, highbd_10_subpel_var64x32_c, 10),
+                      make_tuple(5, 6, highbd_10_subpel_var32x64_c, 10),
+                      make_tuple(5, 5, highbd_10_subpel_var32x32_c, 10),
+                      make_tuple(5, 4, highbd_10_subpel_var32x16_c, 10),
+                      make_tuple(4, 5, highbd_10_subpel_var16x32_c, 10),
+                      make_tuple(4, 4, highbd_10_subpel_var16x16_c, 10),
+                      make_tuple(4, 3, highbd_10_subpel_var16x8_c, 10),
+                      make_tuple(3, 4, highbd_10_subpel_var8x16_c, 10),
+                      make_tuple(3, 3, highbd_10_subpel_var8x8_c, 10),
+                      make_tuple(3, 2, highbd_10_subpel_var8x4_c, 10),
+                      make_tuple(2, 3, highbd_10_subpel_var4x8_c, 10),
+                      make_tuple(2, 2, highbd_10_subpel_var4x4_c, 10),
+                      make_tuple(6, 6, highbd_12_subpel_var64x64_c, 12),
+                      make_tuple(6, 5, highbd_12_subpel_var64x32_c, 12),
+                      make_tuple(5, 6, highbd_12_subpel_var32x64_c, 12),
+                      make_tuple(5, 5, highbd_12_subpel_var32x32_c, 12),
+                      make_tuple(5, 4, highbd_12_subpel_var32x16_c, 12),
+                      make_tuple(4, 5, highbd_12_subpel_var16x32_c, 12),
+                      make_tuple(4, 4, highbd_12_subpel_var16x16_c, 12),
+                      make_tuple(4, 3, highbd_12_subpel_var16x8_c, 12),
+                      make_tuple(3, 4, highbd_12_subpel_var8x16_c, 12),
+                      make_tuple(3, 3, highbd_12_subpel_var8x8_c, 12),
+                      make_tuple(3, 2, highbd_12_subpel_var8x4_c, 12),
+                      make_tuple(2, 3, highbd_12_subpel_var4x8_c, 12),
+                      make_tuple(2, 2, highbd_12_subpel_var4x4_c, 12)));
+
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x64_c =
+    vpx_highbd_8_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var64x32_c =
+    vpx_highbd_8_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x64_c =
+    vpx_highbd_8_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x32_c =
+    vpx_highbd_8_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var32x16_c =
+    vpx_highbd_8_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x32_c =
+    vpx_highbd_8_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x16_c =
+    vpx_highbd_8_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var16x8_c =
+    vpx_highbd_8_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x16_c =
+    vpx_highbd_8_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x8_c =
+    vpx_highbd_8_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var8x4_c =
+    vpx_highbd_8_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x8_c =
+    vpx_highbd_8_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_var4x4_c =
+    vpx_highbd_8_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x64_c =
+    vpx_highbd_10_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var64x32_c =
+    vpx_highbd_10_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x64_c =
+    vpx_highbd_10_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x32_c =
+    vpx_highbd_10_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var32x16_c =
+    vpx_highbd_10_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x32_c =
+    vpx_highbd_10_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x16_c =
+    vpx_highbd_10_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var16x8_c =
+    vpx_highbd_10_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x16_c =
+    vpx_highbd_10_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x8_c =
+    vpx_highbd_10_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var8x4_c =
+    vpx_highbd_10_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x8_c =
+    vpx_highbd_10_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_var4x4_c =
+    vpx_highbd_10_sub_pixel_avg_variance4x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x64_c =
+    vpx_highbd_12_sub_pixel_avg_variance64x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var64x32_c =
+    vpx_highbd_12_sub_pixel_avg_variance64x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x64_c =
+    vpx_highbd_12_sub_pixel_avg_variance32x64_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x32_c =
+    vpx_highbd_12_sub_pixel_avg_variance32x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var32x16_c =
+    vpx_highbd_12_sub_pixel_avg_variance32x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x32_c =
+    vpx_highbd_12_sub_pixel_avg_variance16x32_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x16_c =
+    vpx_highbd_12_sub_pixel_avg_variance16x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var16x8_c =
+    vpx_highbd_12_sub_pixel_avg_variance16x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x16_c =
+    vpx_highbd_12_sub_pixel_avg_variance8x16_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x8_c =
+    vpx_highbd_12_sub_pixel_avg_variance8x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var8x4_c =
+    vpx_highbd_12_sub_pixel_avg_variance8x4_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x8_c =
+    vpx_highbd_12_sub_pixel_avg_variance4x8_c;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_var4x4_c =
+    vpx_highbd_12_sub_pixel_avg_variance4x4_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VpxHBDSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(6, 6, highbd_8_subpel_avg_var64x64_c, 8),
+        make_tuple(6, 5, highbd_8_subpel_avg_var64x32_c, 8),
+        make_tuple(5, 6, highbd_8_subpel_avg_var32x64_c, 8),
+        make_tuple(5, 5, highbd_8_subpel_avg_var32x32_c, 8),
+        make_tuple(5, 4, highbd_8_subpel_avg_var32x16_c, 8),
+        make_tuple(4, 5, highbd_8_subpel_avg_var16x32_c, 8),
+        make_tuple(4, 4, highbd_8_subpel_avg_var16x16_c, 8),
+        make_tuple(4, 3, highbd_8_subpel_avg_var16x8_c, 8),
+        make_tuple(3, 4, highbd_8_subpel_avg_var8x16_c, 8),
+        make_tuple(3, 3, highbd_8_subpel_avg_var8x8_c, 8),
+        make_tuple(3, 2, highbd_8_subpel_avg_var8x4_c, 8),
+        make_tuple(2, 3, highbd_8_subpel_avg_var4x8_c, 8),
+        make_tuple(2, 2, highbd_8_subpel_avg_var4x4_c, 8),
+        make_tuple(6, 6, highbd_10_subpel_avg_var64x64_c, 10),
+        make_tuple(6, 5, highbd_10_subpel_avg_var64x32_c, 10),
+        make_tuple(5, 6, highbd_10_subpel_avg_var32x64_c, 10),
+        make_tuple(5, 5, highbd_10_subpel_avg_var32x32_c, 10),
+        make_tuple(5, 4, highbd_10_subpel_avg_var32x16_c, 10),
+        make_tuple(4, 5, highbd_10_subpel_avg_var16x32_c, 10),
+        make_tuple(4, 4, highbd_10_subpel_avg_var16x16_c, 10),
+        make_tuple(4, 3, highbd_10_subpel_avg_var16x8_c, 10),
+        make_tuple(3, 4, highbd_10_subpel_avg_var8x16_c, 10),
+        make_tuple(3, 3, highbd_10_subpel_avg_var8x8_c, 10),
+        make_tuple(3, 2, highbd_10_subpel_avg_var8x4_c, 10),
+        make_tuple(2, 3, highbd_10_subpel_avg_var4x8_c, 10),
+        make_tuple(2, 2, highbd_10_subpel_avg_var4x4_c, 10),
+        make_tuple(6, 6, highbd_12_subpel_avg_var64x64_c, 12),
+        make_tuple(6, 5, highbd_12_subpel_avg_var64x32_c, 12),
+        make_tuple(5, 6, highbd_12_subpel_avg_var32x64_c, 12),
+        make_tuple(5, 5, highbd_12_subpel_avg_var32x32_c, 12),
+        make_tuple(5, 4, highbd_12_subpel_avg_var32x16_c, 12),
+        make_tuple(4, 5, highbd_12_subpel_avg_var16x32_c, 12),
+        make_tuple(4, 4, highbd_12_subpel_avg_var16x16_c, 12),
+        make_tuple(4, 3, highbd_12_subpel_avg_var16x8_c, 12),
+        make_tuple(3, 4, highbd_12_subpel_avg_var8x16_c, 12),
+        make_tuple(3, 3, highbd_12_subpel_avg_var8x8_c, 12),
+        make_tuple(3, 2, highbd_12_subpel_avg_var8x4_c, 12),
+        make_tuple(2, 3, highbd_12_subpel_avg_var4x8_c, 12),
+        make_tuple(2, 2, highbd_12_subpel_avg_var4x4_c, 12)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_MMX
@@ -935,6 +1247,19 @@
                       make_tuple(3, 4, variance8x16_mmx, 0),
                       make_tuple(3, 3, variance8x8_mmx, 0),
                       make_tuple(2, 2, variance4x4_mmx, 0)));
+
+const SubpixVarMxNFunc subpel_var16x16_mmx = vpx_sub_pixel_variance16x16_mmx;
+const SubpixVarMxNFunc subpel_var16x8_mmx = vpx_sub_pixel_variance16x8_mmx;
+const SubpixVarMxNFunc subpel_var8x16_mmx = vpx_sub_pixel_variance8x16_mmx;
+const SubpixVarMxNFunc subpel_var8x8_mmx = vpx_sub_pixel_variance8x8_mmx;
+const SubpixVarMxNFunc subpel_var4x4_mmx = vpx_sub_pixel_variance4x4_mmx;
+INSTANTIATE_TEST_CASE_P(
+    MMX, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, subpel_var16x16_mmx, 0),
+                      make_tuple(4, 3, subpel_var16x8_mmx, 0),
+                      make_tuple(3, 4, subpel_var8x16_mmx, 0),
+                      make_tuple(3, 3, subpel_var8x8_mmx, 0),
+                      make_tuple(2, 2, subpel_var4x4_mmx, 0)));
 #endif  // HAVE_MMX
 
 #if HAVE_SSE2
@@ -979,6 +1304,90 @@
                       make_tuple(3, 2, variance8x4_sse2, 0),
                       make_tuple(2, 3, variance4x8_sse2, 0),
                       make_tuple(2, 2, variance4x4_sse2, 0)));
+
+#if CONFIG_USE_X86INC
+const SubpixVarMxNFunc subpel_variance64x64_sse2 =
+    vpx_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc subpel_variance64x32_sse2 =
+    vpx_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x64_sse2 =
+    vpx_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc subpel_variance32x32_sse2 =
+    vpx_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc subpel_variance32x16_sse2 =
+    vpx_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x32_sse2 =
+    vpx_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc subpel_variance16x16_sse2 =
+    vpx_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc subpel_variance16x8_sse2 =
+    vpx_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x16_sse2 =
+    vpx_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc subpel_variance8x8_sse2 = vpx_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc subpel_variance8x4_sse2 = vpx_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc subpel_variance4x8_sse = vpx_sub_pixel_variance4x8_sse;
+const SubpixVarMxNFunc subpel_variance4x4_sse = vpx_sub_pixel_variance4x4_sse;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_sse2, 0),
+                      make_tuple(6, 5, subpel_variance64x32_sse2, 0),
+                      make_tuple(5, 6, subpel_variance32x64_sse2, 0),
+                      make_tuple(5, 5, subpel_variance32x32_sse2, 0),
+                      make_tuple(5, 4, subpel_variance32x16_sse2, 0),
+                      make_tuple(4, 5, subpel_variance16x32_sse2, 0),
+                      make_tuple(4, 4, subpel_variance16x16_sse2, 0),
+                      make_tuple(4, 3, subpel_variance16x8_sse2, 0),
+                      make_tuple(3, 4, subpel_variance8x16_sse2, 0),
+                      make_tuple(3, 3, subpel_variance8x8_sse2, 0),
+                      make_tuple(3, 2, subpel_variance8x4_sse2, 0),
+                      make_tuple(2, 3, subpel_variance4x8_sse, 0),
+                      make_tuple(2, 2, subpel_variance4x4_sse, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_sse2 =
+    vpx_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_sse2 =
+    vpx_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_sse2 =
+    vpx_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_sse2 =
+    vpx_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_sse2 =
+    vpx_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_sse2 =
+    vpx_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_sse2 =
+    vpx_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_sse2 =
+    vpx_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_sse2 =
+    vpx_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_sse2 =
+    vpx_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_sse2 =
+    vpx_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_sse =
+    vpx_sub_pixel_avg_variance4x8_sse;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_sse =
+    vpx_sub_pixel_avg_variance4x4_sse;
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+                      make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0),
+                      make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
+                      make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
+                      make_tuple(2, 2, subpel_avg_variance4x4_sse, 0)));
+#endif  // CONFIG_USE_X86INC
+
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
 const VarianceMxNFunc highbd_12_mse16x16_sse2 = vpx_highbd_12_mse16x16_sse2;
@@ -1103,794 +1512,303 @@
                       make_tuple(4, 3, highbd_8_variance16x8_sse2, 8),
                       make_tuple(3, 4, highbd_8_variance8x16_sse2, 8),
                       make_tuple(3, 3, highbd_8_variance8x8_sse2, 8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // HAVE_SSE2
 
-#if CONFIG_VP8_ENCODER
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP8SubpelVarianceTest;
-
-TEST_P(VP8SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP8SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t> VP9SubpelAvgVarianceTest;
-
-TEST_P(VP9SubpelVarianceTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(VP9SubpelAvgVarianceTest, Ref) { RefTest(); }
-
-#if CONFIG_VP9_HIGHBITDEPTH
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VP9SubpelVarianceHighTest;
-typedef SubpelVarianceTest<vp9_subp_avg_variance_fn_t>
-    VP9SubpelAvgVarianceHighTest;
-
-TEST_P(VP9SubpelVarianceHighTest, Ref) { RefTest(); }
-TEST_P(VP9SubpelVarianceHighTest, ExtremeRef) { ExtremeRefTest(); }
-TEST_P(VP9SubpelAvgVarianceHighTest, Ref) { RefTest(); }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-const SubpixVarMxNFunc subpel_variance4x4_c = vp9_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc subpel_variance4x8_c = vp9_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc subpel_variance8x4_c = vp9_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc subpel_variance8x8_c = vp9_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc subpel_variance8x16_c = vp9_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc subpel_variance16x8_c = vp9_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc subpel_variance16x16_c = vp9_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc subpel_variance16x32_c = vp9_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc subpel_variance32x16_c = vp9_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc subpel_variance32x32_c = vp9_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc subpel_variance32x64_c = vp9_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc subpel_variance64x32_c = vp9_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc subpel_variance64x64_c = vp9_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_c, 0),
-                      make_tuple(2, 3, subpel_variance4x8_c, 0),
-                      make_tuple(3, 2, subpel_variance8x4_c, 0),
-                      make_tuple(3, 3, subpel_variance8x8_c, 0),
-                      make_tuple(3, 4, subpel_variance8x16_c, 0),
-                      make_tuple(4, 3, subpel_variance16x8_c, 0),
-                      make_tuple(4, 4, subpel_variance16x16_c, 0),
-                      make_tuple(4, 5, subpel_variance16x32_c, 0),
-                      make_tuple(5, 4, subpel_variance32x16_c, 0),
-                      make_tuple(5, 5, subpel_variance32x32_c, 0),
-                      make_tuple(5, 6, subpel_variance32x64_c, 0),
-                      make_tuple(6, 5, subpel_variance64x32_c, 0),
-                      make_tuple(6, 6, subpel_variance64x64_c, 0)));
-
-#if CONFIG_VP8_ENCODER
-const SubpixVarMxNFunc vp8_subpel_variance16x16_c =
-    vp8_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_c = vp8_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc vp8_subpel_variance8x16_c = vp8_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc vp8_subpel_variance8x8_c = vp8_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc vp8_subpel_variance4x4_c = vp8_sub_pixel_variance4x4_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP8SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_c, 0),
-                      make_tuple(3, 3, vp8_subpel_variance8x8_c, 0),
-                      make_tuple(3, 4, vp8_subpel_variance8x16_c, 0),
-                      make_tuple(4, 3, vp8_subpel_variance16x8_c, 0),
-                      make_tuple(4, 4, vp8_subpel_variance16x16_c, 0)));
-#endif  // CONFIG_VP8_ENCODER
-
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_c =
-    vp9_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_c =
-    vp9_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_c =
-    vp9_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_c =
-    vp9_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_c =
-    vp9_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_c =
-    vp9_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_c =
-    vp9_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_c =
-    vp9_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_c =
-    vp9_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_c =
-    vp9_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_c =
-    vp9_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_c =
-    vp9_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_c =
-    vp9_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_c, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_c, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_c, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_c, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_c, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_c, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_c, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_c, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_c, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_c, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_c, 0),
-                      make_tuple(6, 5, subpel_avg_variance64x32_c, 0),
-                      make_tuple(6, 6, subpel_avg_variance64x64_c, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const SubpixVarMxNFunc highbd_10_subpel_variance4x4_c =
-    vp9_highbd_10_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance4x8_c =
-    vp9_highbd_10_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_c =
-    vp9_highbd_10_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_c =
-    vp9_highbd_10_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_c =
-    vp9_highbd_10_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_c =
-    vp9_highbd_10_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_c =
-    vp9_highbd_10_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_c =
-    vp9_highbd_10_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_c =
-    vp9_highbd_10_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_c =
-    vp9_highbd_10_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_c =
-    vp9_highbd_10_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_c =
-    vp9_highbd_10_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_c =
-    vp9_highbd_10_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance4x4_c =
-    vp9_highbd_12_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance4x8_c =
-    vp9_highbd_12_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_c =
-    vp9_highbd_12_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_c =
-    vp9_highbd_12_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_c =
-    vp9_highbd_12_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_c =
-    vp9_highbd_12_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_c =
-    vp9_highbd_12_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_c =
-    vp9_highbd_12_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_c =
-    vp9_highbd_12_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_c =
-    vp9_highbd_12_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_c =
-    vp9_highbd_12_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_c =
-    vp9_highbd_12_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x64_c =
-    vp9_highbd_12_sub_pixel_variance64x64_c;
-const SubpixVarMxNFunc highbd_subpel_variance4x4_c =
-    vp9_highbd_sub_pixel_variance4x4_c;
-const SubpixVarMxNFunc highbd_subpel_variance4x8_c =
-    vp9_highbd_sub_pixel_variance4x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x4_c =
-    vp9_highbd_sub_pixel_variance8x4_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x8_c =
-    vp9_highbd_sub_pixel_variance8x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance8x16_c =
-    vp9_highbd_sub_pixel_variance8x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x8_c =
-    vp9_highbd_sub_pixel_variance16x8_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x16_c =
-    vp9_highbd_sub_pixel_variance16x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance16x32_c =
-    vp9_highbd_sub_pixel_variance16x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x16_c =
-    vp9_highbd_sub_pixel_variance32x16_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x32_c =
-    vp9_highbd_sub_pixel_variance32x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance32x64_c =
-    vp9_highbd_sub_pixel_variance32x64_c;
-const SubpixVarMxNFunc highbd_subpel_variance64x32_c =
-    vp9_highbd_sub_pixel_variance64x32_c;
-const SubpixVarMxNFunc highbd_subpel_variance64x64_c =
-    vp9_highbd_sub_pixel_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9SubpelVarianceHighTest,
-    ::testing::Values(make_tuple(2, 2, highbd_10_subpel_variance4x4_c, 10),
-                      make_tuple(2, 3, highbd_10_subpel_variance4x8_c, 10),
-                      make_tuple(3, 2, highbd_10_subpel_variance8x4_c, 10),
-                      make_tuple(3, 3, highbd_10_subpel_variance8x8_c, 10),
-                      make_tuple(3, 4, highbd_10_subpel_variance8x16_c, 10),
-                      make_tuple(4, 3, highbd_10_subpel_variance16x8_c, 10),
-                      make_tuple(4, 4, highbd_10_subpel_variance16x16_c, 10),
-                      make_tuple(4, 5, highbd_10_subpel_variance16x32_c, 10),
-                      make_tuple(5, 4, highbd_10_subpel_variance32x16_c, 10),
-                      make_tuple(5, 5, highbd_10_subpel_variance32x32_c, 10),
-                      make_tuple(5, 6, highbd_10_subpel_variance32x64_c, 10),
-                      make_tuple(6, 5, highbd_10_subpel_variance64x32_c, 10),
-                      make_tuple(6, 6, highbd_10_subpel_variance64x64_c, 10),
-                      make_tuple(2, 2, highbd_12_subpel_variance4x4_c, 12),
-                      make_tuple(2, 3, highbd_12_subpel_variance4x8_c, 12),
-                      make_tuple(3, 2, highbd_12_subpel_variance8x4_c, 12),
-                      make_tuple(3, 3, highbd_12_subpel_variance8x8_c, 12),
-                      make_tuple(3, 4, highbd_12_subpel_variance8x16_c, 12),
-                      make_tuple(4, 3, highbd_12_subpel_variance16x8_c, 12),
-                      make_tuple(4, 4, highbd_12_subpel_variance16x16_c, 12),
-                      make_tuple(4, 5, highbd_12_subpel_variance16x32_c, 12),
-                      make_tuple(5, 4, highbd_12_subpel_variance32x16_c, 12),
-                      make_tuple(5, 5, highbd_12_subpel_variance32x32_c, 12),
-                      make_tuple(5, 6, highbd_12_subpel_variance32x64_c, 12),
-                      make_tuple(6, 5, highbd_12_subpel_variance64x32_c, 12),
-                      make_tuple(6, 6, highbd_12_subpel_variance64x64_c, 12),
-                      make_tuple(2, 2, highbd_subpel_variance4x4_c, 8),
-                      make_tuple(2, 3, highbd_subpel_variance4x8_c, 8),
-                      make_tuple(3, 2, highbd_subpel_variance8x4_c, 8),
-                      make_tuple(3, 3, highbd_subpel_variance8x8_c, 8),
-                      make_tuple(3, 4, highbd_subpel_variance8x16_c, 8),
-                      make_tuple(4, 3, highbd_subpel_variance16x8_c, 8),
-                      make_tuple(4, 4, highbd_subpel_variance16x16_c, 8),
-                      make_tuple(4, 5, highbd_subpel_variance16x32_c, 8),
-                      make_tuple(5, 4, highbd_subpel_variance32x16_c, 8),
-                      make_tuple(5, 5, highbd_subpel_variance32x32_c, 8),
-                      make_tuple(5, 6, highbd_subpel_variance32x64_c, 8),
-                      make_tuple(6, 5, highbd_subpel_variance64x32_c, 8),
-                      make_tuple(6, 6, highbd_subpel_variance64x64_c, 8)));
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x4_c =
-    vp9_highbd_10_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance4x8_c =
-    vp9_highbd_10_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_c =
-    vp9_highbd_10_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_c =
-    vp9_highbd_10_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_c =
-    vp9_highbd_10_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_c =
-    vp9_highbd_10_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_c =
-    vp9_highbd_10_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_c =
-    vp9_highbd_10_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_c =
-    vp9_highbd_10_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_c =
-    vp9_highbd_10_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_c =
-    vp9_highbd_10_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_c =
-    vp9_highbd_10_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_c =
-    vp9_highbd_10_sub_pixel_avg_variance64x64_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x4_c =
-    vp9_highbd_12_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance4x8_c =
-    vp9_highbd_12_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_c =
-    vp9_highbd_12_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_c =
-    vp9_highbd_12_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_c =
-    vp9_highbd_12_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_c =
-    vp9_highbd_12_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_c =
-    vp9_highbd_12_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_c =
-    vp9_highbd_12_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_c =
-    vp9_highbd_12_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_c =
-    vp9_highbd_12_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_c =
-    vp9_highbd_12_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_c =
-    vp9_highbd_12_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_c =
-    vp9_highbd_12_sub_pixel_avg_variance64x64_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x4_c =
-    vp9_highbd_sub_pixel_avg_variance4x4_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance4x8_c =
-    vp9_highbd_sub_pixel_avg_variance4x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_c =
-    vp9_highbd_sub_pixel_avg_variance8x4_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_c =
-    vp9_highbd_sub_pixel_avg_variance8x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_c =
-    vp9_highbd_sub_pixel_avg_variance8x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_c =
-    vp9_highbd_sub_pixel_avg_variance16x8_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_c =
-    vp9_highbd_sub_pixel_avg_variance16x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_c =
-    vp9_highbd_sub_pixel_avg_variance16x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_c =
-    vp9_highbd_sub_pixel_avg_variance32x16_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_c =
-    vp9_highbd_sub_pixel_avg_variance32x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_c =
-    vp9_highbd_sub_pixel_avg_variance32x64_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_c =
-    vp9_highbd_sub_pixel_avg_variance64x32_c;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_c =
-    vp9_highbd_sub_pixel_avg_variance64x64_c;
-INSTANTIATE_TEST_CASE_P(
-    C, VP9SubpelAvgVarianceHighTest,
-    ::testing::Values(
-        make_tuple(2, 2, highbd_10_subpel_avg_variance4x4_c, 10),
-        make_tuple(2, 3, highbd_10_subpel_avg_variance4x8_c, 10),
-        make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_c, 10),
-        make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_c, 10),
-        make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_c, 10),
-        make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_c, 10),
-        make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_c, 10),
-        make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_c, 10),
-        make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_c, 10),
-        make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_c, 10),
-        make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_c, 10),
-        make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_c, 10),
-        make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_c, 10),
-        make_tuple(2, 2, highbd_12_subpel_avg_variance4x4_c, 12),
-        make_tuple(2, 3, highbd_12_subpel_avg_variance4x8_c, 12),
-        make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_c, 12),
-        make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_c, 12),
-        make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_c, 12),
-        make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_c, 12),
-        make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_c, 12),
-        make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_c, 12),
-        make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_c, 12),
-        make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_c, 12),
-        make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_c, 12),
-        make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_c, 12),
-        make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_c, 12),
-        make_tuple(2, 2, highbd_subpel_avg_variance4x4_c, 8),
-        make_tuple(2, 3, highbd_subpel_avg_variance4x8_c, 8),
-        make_tuple(3, 2, highbd_subpel_avg_variance8x4_c, 8),
-        make_tuple(3, 3, highbd_subpel_avg_variance8x8_c, 8),
-        make_tuple(3, 4, highbd_subpel_avg_variance8x16_c, 8),
-        make_tuple(4, 3, highbd_subpel_avg_variance16x8_c, 8),
-        make_tuple(4, 4, highbd_subpel_avg_variance16x16_c, 8),
-        make_tuple(4, 5, highbd_subpel_avg_variance16x32_c, 8),
-        make_tuple(5, 4, highbd_subpel_avg_variance32x16_c, 8),
-        make_tuple(5, 5, highbd_subpel_avg_variance32x32_c, 8),
-        make_tuple(5, 6, highbd_subpel_avg_variance32x64_c, 8),
-        make_tuple(6, 5, highbd_subpel_avg_variance64x32_c, 8),
-        make_tuple(6, 6, highbd_subpel_avg_variance64x64_c, 8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // CONFIG_VP9_ENCODER
-
-#if CONFIG_VP8_ENCODER
-#if HAVE_MMX
-const SubpixVarMxNFunc subpel_variance16x16_mmx =
-    vp8_sub_pixel_variance16x16_mmx;
-const SubpixVarMxNFunc subpel_variance16x8_mmx = vp8_sub_pixel_variance16x8_mmx;
-const SubpixVarMxNFunc subpel_variance8x16_mmx = vp8_sub_pixel_variance8x16_mmx;
-const SubpixVarMxNFunc subpel_variance8x8_mmx = vp8_sub_pixel_variance8x8_mmx;
-const SubpixVarMxNFunc subpel_variance4x4_mmx = vp8_sub_pixel_variance4x4_mmx;
-INSTANTIATE_TEST_CASE_P(
-    MMX, VP8SubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, subpel_variance16x16_mmx, 0),
-                      make_tuple(4, 3, subpel_variance16x8_mmx, 0),
-                      make_tuple(3, 4, subpel_variance8x16_mmx, 0),
-                      make_tuple(3, 3, subpel_variance8x8_mmx, 0),
-                      make_tuple(2, 2, subpel_variance4x4_mmx, 0)));
-#endif  // HAVE_MMX
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-#if HAVE_SSE2
 #if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance4x4_sse = vp9_sub_pixel_variance4x4_sse;
-const SubpixVarMxNFunc subpel_variance4x8_sse = vp9_sub_pixel_variance4x8_sse;
-const SubpixVarMxNFunc subpel_variance8x4_sse2 = vp9_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc subpel_variance8x8_sse2 = vp9_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc subpel_variance8x16_sse2 =
-    vp9_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x8_sse2 =
-    vp9_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc subpel_variance16x16_sse2 =
-    vp9_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc subpel_variance16x32_sse2 =
-    vp9_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x16_sse2 =
-    vp9_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc subpel_variance32x32_sse2 =
-    vp9_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc subpel_variance32x64_sse2 =
-    vp9_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc subpel_variance64x32_sse2 =
-    vp9_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc subpel_variance64x64_sse2 =
-    vp9_sub_pixel_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_sse, 0),
-                      make_tuple(2, 3, subpel_variance4x8_sse, 0),
-                      make_tuple(3, 2, subpel_variance8x4_sse2, 0),
-                      make_tuple(3, 3, subpel_variance8x8_sse2, 0),
-                      make_tuple(3, 4, subpel_variance8x16_sse2, 0),
-                      make_tuple(4, 3, subpel_variance16x8_sse2, 0),
-                      make_tuple(4, 4, subpel_variance16x16_sse2, 0),
-                      make_tuple(4, 5, subpel_variance16x32_sse2, 0),
-                      make_tuple(5, 4, subpel_variance32x16_sse2, 0),
-                      make_tuple(5, 5, subpel_variance32x32_sse2, 0),
-                      make_tuple(5, 6, subpel_variance32x64_sse2, 0),
-                      make_tuple(6, 5, subpel_variance64x32_sse2, 0),
-                      make_tuple(6, 6, subpel_variance64x64_sse2, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_sse =
-    vp9_sub_pixel_avg_variance4x4_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_sse =
-    vp9_sub_pixel_avg_variance4x8_sse;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_sse2 =
-    vp9_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_sse2 =
-    vp9_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_sse2 =
-    vp9_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_sse2 =
-    vp9_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_sse2 =
-    vp9_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_sse2 =
-    vp9_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_sse2 =
-    vp9_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_sse2 =
-    vp9_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_sse2 =
-    vp9_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_sse2 =
-    vp9_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_sse2 =
-    vp9_sub_pixel_avg_variance64x64_sse2;
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_sse, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_sse, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_sse2, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_sse2, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_sse2, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_sse2, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_sse2, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_sse2, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_sse2, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_sse2, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_sse2, 0),
-                      make_tuple(6, 5, subpel_avg_variance64x32_sse2, 0),
-                      make_tuple(6, 6, subpel_avg_variance64x64_sse2, 0)));
-#if CONFIG_VP9_HIGHBITDEPTH
-const SubpixVarMxNFunc highbd_subpel_variance8x4_sse2 =
-    vp9_highbd_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance8x8_sse2 =
-    vp9_highbd_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance8x16_sse2 =
-    vp9_highbd_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x8_sse2 =
-    vp9_highbd_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x16_sse2 =
-    vp9_highbd_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance16x32_sse2 =
-    vp9_highbd_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x16_sse2 =
-    vp9_highbd_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x32_sse2 =
-    vp9_highbd_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance32x64_sse2 =
-    vp9_highbd_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance64x32_sse2 =
-    vp9_highbd_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_subpel_variance64x64_sse2 =
-    vp9_highbd_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
-    vp9_highbd_10_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
-    vp9_highbd_10_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
-    vp9_highbd_10_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
-    vp9_highbd_10_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
-    vp9_highbd_10_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
-    vp9_highbd_10_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
-    vp9_highbd_10_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
-    vp9_highbd_10_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
-    vp9_highbd_10_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
-    vp9_highbd_10_sub_pixel_variance64x32_sse2;
-const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
-    vp9_highbd_10_sub_pixel_variance64x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
-    vp9_highbd_12_sub_pixel_variance8x4_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
-    vp9_highbd_12_sub_pixel_variance8x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
-    vp9_highbd_12_sub_pixel_variance8x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
-    vp9_highbd_12_sub_pixel_variance16x8_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
-    vp9_highbd_12_sub_pixel_variance16x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
-    vp9_highbd_12_sub_pixel_variance16x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
-    vp9_highbd_12_sub_pixel_variance32x16_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
-    vp9_highbd_12_sub_pixel_variance32x32_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
-    vp9_highbd_12_sub_pixel_variance32x64_sse2;
-const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
-    vp9_highbd_12_sub_pixel_variance64x32_sse2;
 const SubpixVarMxNFunc highbd_12_subpel_variance64x64_sse2 =
-    vp9_highbd_12_sub_pixel_variance64x64_sse2;
+    vpx_highbd_12_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance64x32_sse2 =
+    vpx_highbd_12_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x64_sse2 =
+    vpx_highbd_12_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x32_sse2 =
+    vpx_highbd_12_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance32x16_sse2 =
+    vpx_highbd_12_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x32_sse2 =
+    vpx_highbd_12_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x16_sse2 =
+    vpx_highbd_12_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance16x8_sse2 =
+    vpx_highbd_12_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x16_sse2 =
+    vpx_highbd_12_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x8_sse2 =
+    vpx_highbd_12_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_12_subpel_variance8x4_sse2 =
+    vpx_highbd_12_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x64_sse2 =
+    vpx_highbd_10_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance64x32_sse2 =
+    vpx_highbd_10_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x64_sse2 =
+    vpx_highbd_10_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x32_sse2 =
+    vpx_highbd_10_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance32x16_sse2 =
+    vpx_highbd_10_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x32_sse2 =
+    vpx_highbd_10_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x16_sse2 =
+    vpx_highbd_10_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance16x8_sse2 =
+    vpx_highbd_10_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x16_sse2 =
+    vpx_highbd_10_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x8_sse2 =
+    vpx_highbd_10_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_10_subpel_variance8x4_sse2 =
+    vpx_highbd_10_sub_pixel_variance8x4_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x64_sse2 =
+    vpx_highbd_8_sub_pixel_variance64x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance64x32_sse2 =
+    vpx_highbd_8_sub_pixel_variance64x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x64_sse2 =
+    vpx_highbd_8_sub_pixel_variance32x64_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x32_sse2 =
+    vpx_highbd_8_sub_pixel_variance32x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance32x16_sse2 =
+    vpx_highbd_8_sub_pixel_variance32x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x32_sse2 =
+    vpx_highbd_8_sub_pixel_variance16x32_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x16_sse2 =
+    vpx_highbd_8_sub_pixel_variance16x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance16x8_sse2 =
+    vpx_highbd_8_sub_pixel_variance16x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x16_sse2 =
+    vpx_highbd_8_sub_pixel_variance8x16_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x8_sse2 =
+    vpx_highbd_8_sub_pixel_variance8x8_sse2;
+const SubpixVarMxNFunc highbd_8_subpel_variance8x4_sse2 =
+    vpx_highbd_8_sub_pixel_variance8x4_sse2;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9SubpelVarianceHighTest,
-    ::testing::Values(make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
-                      make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
-                      make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
-                      make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
-                      make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
-                      make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
-                      make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
-                      make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
-                      make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
-                      make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
-                      make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
-                      make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
-                      make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
-                      make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
-                      make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
-                      make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
-                      make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
-                      make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
-                      make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
-                      make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+    SSE2, VpxHBDSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
                       make_tuple(6, 5, highbd_12_subpel_variance64x32_sse2, 12),
-                      make_tuple(6, 6, highbd_12_subpel_variance64x64_sse2, 12),
-                      make_tuple(3, 2, highbd_subpel_variance8x4_sse2, 8),
-                      make_tuple(3, 3, highbd_subpel_variance8x8_sse2, 8),
-                      make_tuple(3, 4, highbd_subpel_variance8x16_sse2, 8),
-                      make_tuple(4, 3, highbd_subpel_variance16x8_sse2, 8),
-                      make_tuple(4, 4, highbd_subpel_variance16x16_sse2, 8),
-                      make_tuple(4, 5, highbd_subpel_variance16x32_sse2, 8),
-                      make_tuple(5, 4, highbd_subpel_variance32x16_sse2, 8),
-                      make_tuple(5, 5, highbd_subpel_variance32x32_sse2, 8),
-                      make_tuple(5, 6, highbd_subpel_variance32x64_sse2, 8),
-                      make_tuple(6, 5, highbd_subpel_variance64x32_sse2, 8),
-                      make_tuple(6, 6, highbd_subpel_variance64x64_sse2, 8)));
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x4_sse2 =
-    vp9_highbd_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x8_sse2 =
-    vp9_highbd_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance8x16_sse2 =
-    vp9_highbd_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x8_sse2 =
-    vp9_highbd_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x16_sse2 =
-    vp9_highbd_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance16x32_sse2 =
-    vp9_highbd_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x16_sse2 =
-    vp9_highbd_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x32_sse2 =
-    vp9_highbd_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance32x64_sse2 =
-    vp9_highbd_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x32_sse2 =
-    vp9_highbd_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_subpel_avg_variance64x64_sse2 =
-    vp9_highbd_sub_pixel_avg_variance64x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x4_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x8_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance8x16_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x8_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x16_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance16x32_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x16_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x32_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance32x64_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x32_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_10_subpel_avg_variance64x64_sse2 =
-    vp9_highbd_10_sub_pixel_avg_variance64x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x4_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance8x4_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x8_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance8x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance8x16_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance8x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x8_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance16x8_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x16_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance16x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance16x32_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance16x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x16_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance32x16_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x32_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance32x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance32x64_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance32x64_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x32_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance64x32_sse2;
-const vp9_subp_avg_variance_fn_t highbd_12_subpel_avg_variance64x64_sse2 =
-    vp9_highbd_12_sub_pixel_avg_variance64x64_sse2;
+                      make_tuple(5, 6, highbd_12_subpel_variance32x64_sse2, 12),
+                      make_tuple(5, 5, highbd_12_subpel_variance32x32_sse2, 12),
+                      make_tuple(5, 4, highbd_12_subpel_variance32x16_sse2, 12),
+                      make_tuple(4, 5, highbd_12_subpel_variance16x32_sse2, 12),
+                      make_tuple(4, 4, highbd_12_subpel_variance16x16_sse2, 12),
+                      make_tuple(4, 3, highbd_12_subpel_variance16x8_sse2, 12),
+                      make_tuple(3, 4, highbd_12_subpel_variance8x16_sse2, 12),
+                      make_tuple(3, 3, highbd_12_subpel_variance8x8_sse2, 12),
+                      make_tuple(3, 2, highbd_12_subpel_variance8x4_sse2, 12),
+                      make_tuple(6, 6, highbd_10_subpel_variance64x64_sse2, 10),
+                      make_tuple(6, 5, highbd_10_subpel_variance64x32_sse2, 10),
+                      make_tuple(5, 6, highbd_10_subpel_variance32x64_sse2, 10),
+                      make_tuple(5, 5, highbd_10_subpel_variance32x32_sse2, 10),
+                      make_tuple(5, 4, highbd_10_subpel_variance32x16_sse2, 10),
+                      make_tuple(4, 5, highbd_10_subpel_variance16x32_sse2, 10),
+                      make_tuple(4, 4, highbd_10_subpel_variance16x16_sse2, 10),
+                      make_tuple(4, 3, highbd_10_subpel_variance16x8_sse2, 10),
+                      make_tuple(3, 4, highbd_10_subpel_variance8x16_sse2, 10),
+                      make_tuple(3, 3, highbd_10_subpel_variance8x8_sse2, 10),
+                      make_tuple(3, 2, highbd_10_subpel_variance8x4_sse2, 10),
+                      make_tuple(6, 6, highbd_8_subpel_variance64x64_sse2, 8),
+                      make_tuple(6, 5, highbd_8_subpel_variance64x32_sse2, 8),
+                      make_tuple(5, 6, highbd_8_subpel_variance32x64_sse2, 8),
+                      make_tuple(5, 5, highbd_8_subpel_variance32x32_sse2, 8),
+                      make_tuple(5, 4, highbd_8_subpel_variance32x16_sse2, 8),
+                      make_tuple(4, 5, highbd_8_subpel_variance16x32_sse2, 8),
+                      make_tuple(4, 4, highbd_8_subpel_variance16x16_sse2, 8),
+                      make_tuple(4, 3, highbd_8_subpel_variance16x8_sse2, 8),
+                      make_tuple(3, 4, highbd_8_subpel_variance8x16_sse2, 8),
+                      make_tuple(3, 3, highbd_8_subpel_variance8x8_sse2, 8),
+                      make_tuple(3, 2, highbd_8_subpel_variance8x4_sse2, 8)));
+
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x64_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance64x32_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x64_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x32_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance32x16_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x32_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x16_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance16x8_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x16_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x8_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_12_subpel_avg_variance8x4_sse2 =
+    vpx_highbd_12_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x64_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance64x32_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x64_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x32_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance32x16_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x32_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x16_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance16x8_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x16_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x8_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_10_subpel_avg_variance8x4_sse2 =
+    vpx_highbd_10_sub_pixel_avg_variance8x4_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x64_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance64x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance64x32_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance64x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x64_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance32x64_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x32_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance32x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance32x16_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance32x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x32_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance16x32_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x16_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance16x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance16x8_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance16x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x16_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance8x16_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x8_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance8x8_sse2;
+const SubpixAvgVarMxNFunc highbd_8_subpel_avg_variance8x4_sse2 =
+    vpx_highbd_8_sub_pixel_avg_variance8x4_sse2;
 INSTANTIATE_TEST_CASE_P(
-    SSE2, VP9SubpelAvgVarianceHighTest,
+    SSE2, VpxHBDSubpelAvgVarianceTest,
     ::testing::Values(
-                  make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
-                  make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
-                  make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
-                  make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
-                  make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
-                  make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
-                  make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
-                  make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
-                  make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
-                  make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
-                  make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
-                  make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
-                  make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
-                  make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
-                  make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
-                  make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
-                  make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
-                  make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
-                  make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
-                  make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
-                  make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
-                  make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
-                  make_tuple(3, 2, highbd_subpel_avg_variance8x4_sse2, 8),
-                  make_tuple(3, 3, highbd_subpel_avg_variance8x8_sse2, 8),
-                  make_tuple(3, 4, highbd_subpel_avg_variance8x16_sse2, 8),
-                  make_tuple(4, 3, highbd_subpel_avg_variance16x8_sse2, 8),
-                  make_tuple(4, 4, highbd_subpel_avg_variance16x16_sse2, 8),
-                  make_tuple(4, 5, highbd_subpel_avg_variance16x32_sse2, 8),
-                  make_tuple(5, 4, highbd_subpel_avg_variance32x16_sse2, 8),
-                  make_tuple(5, 5, highbd_subpel_avg_variance32x32_sse2, 8),
-                  make_tuple(5, 6, highbd_subpel_avg_variance32x64_sse2, 8),
-                  make_tuple(6, 5, highbd_subpel_avg_variance64x32_sse2, 8),
-                  make_tuple(6, 6, highbd_subpel_avg_variance64x64_sse2, 8)));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+        make_tuple(6, 6, highbd_12_subpel_avg_variance64x64_sse2, 12),
+        make_tuple(6, 5, highbd_12_subpel_avg_variance64x32_sse2, 12),
+        make_tuple(5, 6, highbd_12_subpel_avg_variance32x64_sse2, 12),
+        make_tuple(5, 5, highbd_12_subpel_avg_variance32x32_sse2, 12),
+        make_tuple(5, 4, highbd_12_subpel_avg_variance32x16_sse2, 12),
+        make_tuple(4, 5, highbd_12_subpel_avg_variance16x32_sse2, 12),
+        make_tuple(4, 4, highbd_12_subpel_avg_variance16x16_sse2, 12),
+        make_tuple(4, 3, highbd_12_subpel_avg_variance16x8_sse2, 12),
+        make_tuple(3, 4, highbd_12_subpel_avg_variance8x16_sse2, 12),
+        make_tuple(3, 3, highbd_12_subpel_avg_variance8x8_sse2, 12),
+        make_tuple(3, 2, highbd_12_subpel_avg_variance8x4_sse2, 12),
+        make_tuple(6, 6, highbd_10_subpel_avg_variance64x64_sse2, 10),
+        make_tuple(6, 5, highbd_10_subpel_avg_variance64x32_sse2, 10),
+        make_tuple(5, 6, highbd_10_subpel_avg_variance32x64_sse2, 10),
+        make_tuple(5, 5, highbd_10_subpel_avg_variance32x32_sse2, 10),
+        make_tuple(5, 4, highbd_10_subpel_avg_variance32x16_sse2, 10),
+        make_tuple(4, 5, highbd_10_subpel_avg_variance16x32_sse2, 10),
+        make_tuple(4, 4, highbd_10_subpel_avg_variance16x16_sse2, 10),
+        make_tuple(4, 3, highbd_10_subpel_avg_variance16x8_sse2, 10),
+        make_tuple(3, 4, highbd_10_subpel_avg_variance8x16_sse2, 10),
+        make_tuple(3, 3, highbd_10_subpel_avg_variance8x8_sse2, 10),
+        make_tuple(3, 2, highbd_10_subpel_avg_variance8x4_sse2, 10),
+        make_tuple(6, 6, highbd_8_subpel_avg_variance64x64_sse2, 8),
+        make_tuple(6, 5, highbd_8_subpel_avg_variance64x32_sse2, 8),
+        make_tuple(5, 6, highbd_8_subpel_avg_variance32x64_sse2, 8),
+        make_tuple(5, 5, highbd_8_subpel_avg_variance32x32_sse2, 8),
+        make_tuple(5, 4, highbd_8_subpel_avg_variance32x16_sse2, 8),
+        make_tuple(4, 5, highbd_8_subpel_avg_variance16x32_sse2, 8),
+        make_tuple(4, 4, highbd_8_subpel_avg_variance16x16_sse2, 8),
+        make_tuple(4, 3, highbd_8_subpel_avg_variance16x8_sse2, 8),
+        make_tuple(3, 4, highbd_8_subpel_avg_variance8x16_sse2, 8),
+        make_tuple(3, 3, highbd_8_subpel_avg_variance8x8_sse2, 8),
+        make_tuple(3, 2, highbd_8_subpel_avg_variance8x4_sse2, 8)));
 #endif  // CONFIG_USE_X86INC
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_SSE2
-#endif  // CONFIG_VP9_ENCODER
 
-#if CONFIG_VP8_ENCODER
-#if HAVE_SSE2
-const SubpixVarMxNFunc vp8_subpel_variance16x16_sse2 =
-    vp8_sub_pixel_variance16x16_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_sse2 =
-    vp8_sub_pixel_variance16x8_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance8x16_sse2 =
-    vp8_sub_pixel_variance8x16_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance8x8_sse2 =
-    vp8_sub_pixel_variance8x8_wmt;
-const SubpixVarMxNFunc vp8_subpel_variance4x4_sse2 =
-    vp8_sub_pixel_variance4x4_wmt;
-INSTANTIATE_TEST_CASE_P(
-    SSE2, VP8SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, vp8_subpel_variance4x4_sse2, 0),
-                      make_tuple(3, 3, vp8_subpel_variance8x8_sse2, 0),
-                      make_tuple(3, 4, vp8_subpel_variance8x16_sse2, 0),
-                      make_tuple(4, 3, vp8_subpel_variance16x8_sse2, 0),
-                      make_tuple(4, 4, vp8_subpel_variance16x16_sse2, 0)));
-#endif  // HAVE_SSE2
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
 #if HAVE_SSSE3
 #if CONFIG_USE_X86INC
-const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
-    vp9_sub_pixel_variance4x4_ssse3;
-const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
-    vp9_sub_pixel_variance4x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
-    vp9_sub_pixel_variance8x4_ssse3;
-const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
-    vp9_sub_pixel_variance8x8_ssse3;
-const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
-    vp9_sub_pixel_variance8x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
-    vp9_sub_pixel_variance16x8_ssse3;
-const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
-    vp9_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
-    vp9_sub_pixel_variance16x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
-    vp9_sub_pixel_variance32x16_ssse3;
-const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
-    vp9_sub_pixel_variance32x32_ssse3;
-const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
-    vp9_sub_pixel_variance32x64_ssse3;
-const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
-    vp9_sub_pixel_variance64x32_ssse3;
 const SubpixVarMxNFunc subpel_variance64x64_ssse3 =
-    vp9_sub_pixel_variance64x64_ssse3;
+    vpx_sub_pixel_variance64x64_ssse3;
+const SubpixVarMxNFunc subpel_variance64x32_ssse3 =
+    vpx_sub_pixel_variance64x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x64_ssse3 =
+    vpx_sub_pixel_variance32x64_ssse3;
+const SubpixVarMxNFunc subpel_variance32x32_ssse3 =
+    vpx_sub_pixel_variance32x32_ssse3;
+const SubpixVarMxNFunc subpel_variance32x16_ssse3 =
+    vpx_sub_pixel_variance32x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x32_ssse3 =
+    vpx_sub_pixel_variance16x32_ssse3;
+const SubpixVarMxNFunc subpel_variance16x16_ssse3 =
+    vpx_sub_pixel_variance16x16_ssse3;
+const SubpixVarMxNFunc subpel_variance16x8_ssse3 =
+    vpx_sub_pixel_variance16x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x16_ssse3 =
+    vpx_sub_pixel_variance8x16_ssse3;
+const SubpixVarMxNFunc subpel_variance8x8_ssse3 =
+    vpx_sub_pixel_variance8x8_ssse3;
+const SubpixVarMxNFunc subpel_variance8x4_ssse3 =
+    vpx_sub_pixel_variance8x4_ssse3;
+const SubpixVarMxNFunc subpel_variance4x8_ssse3 =
+    vpx_sub_pixel_variance4x8_ssse3;
+const SubpixVarMxNFunc subpel_variance4x4_ssse3 =
+    vpx_sub_pixel_variance4x4_ssse3;
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_variance4x4_ssse3, 0),
-                      make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
-                      make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
-                      make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
-                      make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
-                      make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
-                      make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
-                      make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
-                      make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
-                      make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
-                      make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+    SSSE3, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_ssse3, 0),
                       make_tuple(6, 5, subpel_variance64x32_ssse3, 0),
-                      make_tuple(6, 6, subpel_variance64x64_ssse3, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_ssse3 =
-    vp9_sub_pixel_avg_variance4x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_ssse3 =
-    vp9_sub_pixel_avg_variance4x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_ssse3 =
-    vp9_sub_pixel_avg_variance8x4_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_ssse3 =
-    vp9_sub_pixel_avg_variance8x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_ssse3 =
-    vp9_sub_pixel_avg_variance8x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_ssse3 =
-    vp9_sub_pixel_avg_variance16x8_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_ssse3 =
-    vp9_sub_pixel_avg_variance16x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_ssse3 =
-    vp9_sub_pixel_avg_variance16x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_ssse3 =
-    vp9_sub_pixel_avg_variance32x16_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_ssse3 =
-    vp9_sub_pixel_avg_variance32x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_ssse3 =
-    vp9_sub_pixel_avg_variance32x64_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_ssse3 =
-    vp9_sub_pixel_avg_variance64x32_ssse3;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_ssse3 =
-    vp9_sub_pixel_avg_variance64x64_ssse3;
+                      make_tuple(5, 6, subpel_variance32x64_ssse3, 0),
+                      make_tuple(5, 5, subpel_variance32x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_variance32x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_variance16x32_ssse3, 0),
+                      make_tuple(4, 4, subpel_variance16x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_variance16x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_variance8x16_ssse3, 0),
+                      make_tuple(3, 3, subpel_variance8x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_variance8x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_variance4x8_ssse3, 0),
+                      make_tuple(2, 2, subpel_variance4x4_ssse3, 0)));
+
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_ssse3 =
+    vpx_sub_pixel_avg_variance64x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x32_ssse3 =
+    vpx_sub_pixel_avg_variance64x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x64_ssse3 =
+    vpx_sub_pixel_avg_variance32x64_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_ssse3 =
+    vpx_sub_pixel_avg_variance32x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x16_ssse3 =
+    vpx_sub_pixel_avg_variance32x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x32_ssse3 =
+    vpx_sub_pixel_avg_variance16x32_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x16_ssse3 =
+    vpx_sub_pixel_avg_variance16x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance16x8_ssse3 =
+    vpx_sub_pixel_avg_variance16x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x16_ssse3 =
+    vpx_sub_pixel_avg_variance8x16_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x8_ssse3 =
+    vpx_sub_pixel_avg_variance8x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance8x4_ssse3 =
+    vpx_sub_pixel_avg_variance8x4_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x8_ssse3 =
+    vpx_sub_pixel_avg_variance4x8_ssse3;
+const SubpixAvgVarMxNFunc subpel_avg_variance4x4_ssse3 =
+    vpx_sub_pixel_avg_variance4x4_ssse3;
 INSTANTIATE_TEST_CASE_P(
-    SSSE3, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+    SSSE3, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0),
                       make_tuple(6, 5, subpel_avg_variance64x32_ssse3, 0),
-                      make_tuple(6, 6, subpel_avg_variance64x64_ssse3, 0)));
+                      make_tuple(5, 6, subpel_avg_variance32x64_ssse3, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_ssse3, 0),
+                      make_tuple(5, 4, subpel_avg_variance32x16_ssse3, 0),
+                      make_tuple(4, 5, subpel_avg_variance16x32_ssse3, 0),
+                      make_tuple(4, 4, subpel_avg_variance16x16_ssse3, 0),
+                      make_tuple(4, 3, subpel_avg_variance16x8_ssse3, 0),
+                      make_tuple(3, 4, subpel_avg_variance8x16_ssse3, 0),
+                      make_tuple(3, 3, subpel_avg_variance8x8_ssse3, 0),
+                      make_tuple(3, 2, subpel_avg_variance8x4_ssse3, 0),
+                      make_tuple(2, 3, subpel_avg_variance4x8_ssse3, 0),
+                      make_tuple(2, 2, subpel_avg_variance4x4_ssse3, 0)));
 #endif  // CONFIG_USE_X86INC
 #endif  // HAVE_SSSE3
-#endif  // CONFIG_VP9_ENCODER
-
-#if CONFIG_VP8_ENCODER
-#if HAVE_SSSE3
-const SubpixVarMxNFunc vp8_subpel_variance16x16_ssse3 =
-    vp8_sub_pixel_variance16x16_ssse3;
-const SubpixVarMxNFunc vp8_subpel_variance16x8_ssse3 =
-    vp8_sub_pixel_variance16x8_ssse3;
-INSTANTIATE_TEST_CASE_P(
-    SSSE3, VP8SubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 3, vp8_subpel_variance16x8_ssse3, 0),
-                      make_tuple(4, 4, vp8_subpel_variance16x16_ssse3, 0)));
-#endif  // HAVE_SSSE3
-#endif  // CONFIG_VP8_ENCODER
 
 #if HAVE_AVX2
 const VarianceMxNFunc mse16x16_avx2 = vpx_mse16x16_avx2;
@@ -1910,39 +1828,46 @@
                       make_tuple(5, 4, variance32x16_avx2, 0),
                       make_tuple(4, 4, variance16x16_avx2, 0)));
 
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance32x32_avx2 =
-    vp9_sub_pixel_variance32x32_avx2;
 const SubpixVarMxNFunc subpel_variance64x64_avx2 =
-    vp9_sub_pixel_variance64x64_avx2;
+    vpx_sub_pixel_variance64x64_avx2;
+const SubpixVarMxNFunc subpel_variance32x32_avx2 =
+    vpx_sub_pixel_variance32x32_avx2;
 INSTANTIATE_TEST_CASE_P(
-    AVX2, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_variance32x32_avx2, 0),
-                      make_tuple(6, 6, subpel_variance64x64_avx2, 0)));
+    AVX2, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_avx2, 0),
+                      make_tuple(5, 5, subpel_variance32x32_avx2, 0)));
 
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_avx2 =
-    vp9_sub_pixel_avg_variance32x32_avx2;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_avx2 =
-    vp9_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance64x64_avx2 =
+    vpx_sub_pixel_avg_variance64x64_avx2;
+const SubpixAvgVarMxNFunc subpel_avg_variance32x32_avx2 =
+    vpx_sub_pixel_avg_variance32x32_avx2;
 INSTANTIATE_TEST_CASE_P(
-    AVX2, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0),
-                      make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0)));
-#endif  // CONFIG_VP9_ENCODER
+    AVX2, VpxSubpelAvgVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_avg_variance64x64_avx2, 0),
+                      make_tuple(5, 5, subpel_avg_variance32x32_avx2, 0)));
 #endif  // HAVE_AVX2
 
-#if CONFIG_VP8_ENCODER
 #if HAVE_MEDIA
-const SubpixVarMxNFunc subpel_variance16x16_media =
-    vp8_sub_pixel_variance16x16_armv6;
-const SubpixVarMxNFunc subpel_variance8x8_media =
-    vp8_sub_pixel_variance8x8_armv6;
+const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
+INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
+                        ::testing::Values(make_tuple(4, 4, mse16x16_media)));
+
+const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
+const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
 INSTANTIATE_TEST_CASE_P(
-    MEDIA, VP8SubpelVarianceTest,
-    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_media, 0),
-                      make_tuple(4, 4, subpel_variance16x16_media, 0)));
+    MEDIA, VpxVarianceTest,
+    ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
+                      make_tuple(3, 3, variance8x8_media, 0)));
+
+const SubpixVarMxNFunc subpel_variance16x16_media =
+    vpx_sub_pixel_variance16x16_media;
+const SubpixVarMxNFunc subpel_variance8x8_media =
+    vpx_sub_pixel_variance8x8_media;
+INSTANTIATE_TEST_CASE_P(
+    MEDIA, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(4, 4, subpel_variance16x16_media, 0),
+                      make_tuple(3, 3, subpel_variance8x8_media, 0)));
 #endif  // HAVE_MEDIA
-#endif  // CONFIG_VP8_ENCODER
 
 #if HAVE_NEON
 const Get4x4SseFunc get4x4sse_cs_neon = vpx_get4x4sse_cs_neon;
@@ -1972,46 +1897,21 @@
                       make_tuple(3, 4, variance8x16_neon, 0),
                       make_tuple(3, 3, variance8x8_neon, 0)));
 
-#if CONFIG_VP8_ENCODER
-#if HAVE_NEON_ASM
-const SubpixVarMxNFunc vp8_subpel_variance16x16_neon =
-    vp8_sub_pixel_variance16x16_neon;
-INSTANTIATE_TEST_CASE_P(
-    NEON, VP8SubpelVarianceTest,
-    ::testing::Values(make_tuple(4, 4, vp8_subpel_variance16x16_neon, 0)));
-#endif  // HAVE_NEON_ASM
-#endif  // CONFIG_VP8_ENCODER
-
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance8x8_neon = vp9_sub_pixel_variance8x8_neon;
-const SubpixVarMxNFunc subpel_variance16x16_neon =
-    vp9_sub_pixel_variance16x16_neon;
-const SubpixVarMxNFunc subpel_variance32x32_neon =
-    vp9_sub_pixel_variance32x32_neon;
 const SubpixVarMxNFunc subpel_variance64x64_neon =
-    vp9_sub_pixel_variance64x64_neon;
+    vpx_sub_pixel_variance64x64_neon;
+const SubpixVarMxNFunc subpel_variance32x32_neon =
+    vpx_sub_pixel_variance32x32_neon;
+const SubpixVarMxNFunc subpel_variance16x16_neon =
+    vpx_sub_pixel_variance16x16_neon;
+const SubpixVarMxNFunc subpel_variance8x8_neon = vpx_sub_pixel_variance8x8_neon;
 INSTANTIATE_TEST_CASE_P(
-    NEON, VP9SubpelVarianceTest,
-    ::testing::Values(make_tuple(3, 3, subpel_variance8x8_neon, 0),
-                      make_tuple(4, 4, subpel_variance16x16_neon, 0),
+    NEON, VpxSubpelVarianceTest,
+    ::testing::Values(make_tuple(6, 6, subpel_variance64x64_neon, 0),
                       make_tuple(5, 5, subpel_variance32x32_neon, 0),
-                      make_tuple(6, 6, subpel_variance64x64_neon, 0)));
-#endif  // CONFIG_VP9_ENCODER
+                      make_tuple(4, 4, subpel_variance16x16_neon, 0),
+                      make_tuple(3, 3, subpel_variance8x8_neon, 0)));
 #endif  // HAVE_NEON
 
-#if HAVE_MEDIA
-const VarianceMxNFunc mse16x16_media = vpx_mse16x16_media;
-INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest,
-                        ::testing::Values(make_tuple(4, 4, mse16x16_media)));
-
-const VarianceMxNFunc variance16x16_media = vpx_variance16x16_media;
-const VarianceMxNFunc variance8x8_media = vpx_variance8x8_media;
-INSTANTIATE_TEST_CASE_P(
-    MEDIA, VpxVarianceTest,
-    ::testing::Values(make_tuple(4, 4, variance16x16_media, 0),
-                      make_tuple(3, 3, variance8x8_media, 0)));
-#endif  // HAVE_MEDIA
-
 #if HAVE_MSA
 INSTANTIATE_TEST_CASE_P(MSA, SumOfSquaresTest,
                         ::testing::Values(vpx_get_mb_ss_msa));
@@ -2059,29 +1959,28 @@
                       make_tuple(2, 3, variance4x8_msa, 0),
                       make_tuple(2, 2, variance4x4_msa, 0)));
 
-#if CONFIG_VP9_ENCODER
-const SubpixVarMxNFunc subpel_variance4x4_msa = vp9_sub_pixel_variance4x4_msa;
-const SubpixVarMxNFunc subpel_variance4x8_msa = vp9_sub_pixel_variance4x8_msa;
-const SubpixVarMxNFunc subpel_variance8x4_msa = vp9_sub_pixel_variance8x4_msa;
-const SubpixVarMxNFunc subpel_variance8x8_msa = vp9_sub_pixel_variance8x8_msa;
-const SubpixVarMxNFunc subpel_variance8x16_msa = vp9_sub_pixel_variance8x16_msa;
-const SubpixVarMxNFunc subpel_variance16x8_msa = vp9_sub_pixel_variance16x8_msa;
+const SubpixVarMxNFunc subpel_variance4x4_msa = vpx_sub_pixel_variance4x4_msa;
+const SubpixVarMxNFunc subpel_variance4x8_msa = vpx_sub_pixel_variance4x8_msa;
+const SubpixVarMxNFunc subpel_variance8x4_msa = vpx_sub_pixel_variance8x4_msa;
+const SubpixVarMxNFunc subpel_variance8x8_msa = vpx_sub_pixel_variance8x8_msa;
+const SubpixVarMxNFunc subpel_variance8x16_msa = vpx_sub_pixel_variance8x16_msa;
+const SubpixVarMxNFunc subpel_variance16x8_msa = vpx_sub_pixel_variance16x8_msa;
 const SubpixVarMxNFunc subpel_variance16x16_msa =
-    vp9_sub_pixel_variance16x16_msa;
+    vpx_sub_pixel_variance16x16_msa;
 const SubpixVarMxNFunc subpel_variance16x32_msa =
-    vp9_sub_pixel_variance16x32_msa;
+    vpx_sub_pixel_variance16x32_msa;
 const SubpixVarMxNFunc subpel_variance32x16_msa =
-    vp9_sub_pixel_variance32x16_msa;
+    vpx_sub_pixel_variance32x16_msa;
 const SubpixVarMxNFunc subpel_variance32x32_msa =
-    vp9_sub_pixel_variance32x32_msa;
+    vpx_sub_pixel_variance32x32_msa;
 const SubpixVarMxNFunc subpel_variance32x64_msa =
-    vp9_sub_pixel_variance32x64_msa;
+    vpx_sub_pixel_variance32x64_msa;
 const SubpixVarMxNFunc subpel_variance64x32_msa =
-    vp9_sub_pixel_variance64x32_msa;
+    vpx_sub_pixel_variance64x32_msa;
 const SubpixVarMxNFunc subpel_variance64x64_msa =
-    vp9_sub_pixel_variance64x64_msa;
+    vpx_sub_pixel_variance64x64_msa;
 INSTANTIATE_TEST_CASE_P(
-    MSA, VP9SubpelVarianceTest,
+    MSA, VpxSubpelVarianceTest,
     ::testing::Values(make_tuple(2, 2, subpel_variance4x4_msa, 0),
                       make_tuple(2, 3, subpel_variance4x8_msa, 0),
                       make_tuple(3, 2, subpel_variance8x4_msa, 0),
@@ -2095,47 +1994,5 @@
                       make_tuple(5, 6, subpel_variance32x64_msa, 0),
                       make_tuple(6, 5, subpel_variance64x32_msa, 0),
                       make_tuple(6, 6, subpel_variance64x64_msa, 0)));
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x4_msa =
-    vp9_sub_pixel_avg_variance4x4_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance4x8_msa =
-    vp9_sub_pixel_avg_variance4x8_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x4_msa =
-    vp9_sub_pixel_avg_variance8x4_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x8_msa =
-    vp9_sub_pixel_avg_variance8x8_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance8x16_msa =
-    vp9_sub_pixel_avg_variance8x16_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x8_msa =
-    vp9_sub_pixel_avg_variance16x8_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x16_msa =
-    vp9_sub_pixel_avg_variance16x16_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance16x32_msa =
-    vp9_sub_pixel_avg_variance16x32_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x16_msa =
-    vp9_sub_pixel_avg_variance32x16_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x32_msa =
-    vp9_sub_pixel_avg_variance32x32_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance32x64_msa =
-    vp9_sub_pixel_avg_variance32x64_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x32_msa =
-    vp9_sub_pixel_avg_variance64x32_msa;
-const vp9_subp_avg_variance_fn_t subpel_avg_variance64x64_msa =
-    vp9_sub_pixel_avg_variance64x64_msa;
-INSTANTIATE_TEST_CASE_P(
-    MSA, VP9SubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(2, 2, subpel_avg_variance4x4_msa, 0),
-                      make_tuple(2, 3, subpel_avg_variance4x8_msa, 0),
-                      make_tuple(3, 2, subpel_avg_variance8x4_msa, 0),
-                      make_tuple(3, 3, subpel_avg_variance8x8_msa, 0),
-                      make_tuple(3, 4, subpel_avg_variance8x16_msa, 0),
-                      make_tuple(4, 3, subpel_avg_variance16x8_msa, 0),
-                      make_tuple(4, 4, subpel_avg_variance16x16_msa, 0),
-                      make_tuple(4, 5, subpel_avg_variance16x32_msa, 0),
-                      make_tuple(5, 4, subpel_avg_variance32x16_msa, 0),
-                      make_tuple(5, 5, subpel_avg_variance32x32_msa, 0),
-                      make_tuple(5, 6, subpel_avg_variance32x64_msa, 0),
-                      make_tuple(6, 5, subpel_avg_variance64x32_msa, 0),
-                      make_tuple(6, 6, subpel_avg_variance64x64_msa, 0)));
-#endif  // CONFIG_VP9_ENCODER
 #endif  // HAVE_MSA
 }  // namespace
diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
new file mode 100644
index 0000000..92c236f
--- /dev/null
+++ b/test/vp9_arf_freq_test.cc
@@ -0,0 +1,230 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "test/util.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "vp9/encoder/vp9_ratectrl.h"
+
+namespace {
+
+const unsigned int kFrames = 100;
+const int kBitrate = 500;
+
+#define ARF_NOT_SEEN   1000001
+#define ARF_SEEN_ONCE  1000000
+
+typedef struct {
+  const char *filename;
+  unsigned int width;
+  unsigned int height;
+  unsigned int framerate_num;
+  unsigned int framerate_den;
+  unsigned int input_bit_depth;
+  vpx_img_fmt fmt;
+  vpx_bit_depth_t bit_depth;
+  unsigned int profile;
+} TestVideoParam;
+
+typedef struct {
+  libvpx_test::TestMode mode;
+  int cpu_used;
+} TestEncodeParam;
+
+const TestVideoParam kTestVectors[] = {
+  // artificially increase framerate to trigger default check
+  {"hantro_collage_w352h288.yuv", 352, 288, 5000, 1,
+    8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"hantro_collage_w352h288.yuv", 352, 288, 30, 1,
+    8, VPX_IMG_FMT_I420, VPX_BITS_8, 0},
+  {"rush_hour_444.y4m", 352, 288, 30, 1,
+    8, VPX_IMG_FMT_I444, VPX_BITS_8, 1},
+#if CONFIG_VP9_HIGHBITDEPTH
+  // Add list of profile 2/3 test videos here ...
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+};
+
+const TestEncodeParam kEncodeVectors[] = {
+  {::libvpx_test::kOnePassGood, 2},
+  {::libvpx_test::kOnePassGood, 5},
+  {::libvpx_test::kTwoPassGood, 1},
+  {::libvpx_test::kTwoPassGood, 2},
+  {::libvpx_test::kTwoPassGood, 5},
+  {::libvpx_test::kRealTime, 5},
+};
+
+const int kMinArfVectors[] = {
+  // NOTE: 0 refers to the default built-in logic in:
+  //       vp9_rc_get_default_min_gf_interval(...)
+  0, 4, 8, 12, 15
+};
+
+int is_extension_y4m(const char *filename) {
+  const char *dot = strrchr(filename, '.');
+  if (!dot || dot == filename)
+    return 0;
+  else
+    return !strcmp(dot, ".y4m");
+}
+
+class ArfFreqTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith3Params<TestVideoParam, \
+                                                 TestEncodeParam, int> {
+ protected:
+  ArfFreqTest()
+      : EncoderTest(GET_PARAM(0)),
+        test_video_param_(GET_PARAM(1)),
+        test_encode_param_(GET_PARAM(2)),
+        min_arf_requested_(GET_PARAM(3)) {
+  }
+
+  virtual ~ArfFreqTest() {}
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(test_encode_param_.mode);
+    if (test_encode_param_.mode != ::libvpx_test::kRealTime) {
+      cfg_.g_lag_in_frames = 25;
+      cfg_.rc_end_usage = VPX_VBR;
+    } else {
+      cfg_.g_lag_in_frames = 0;
+      cfg_.rc_end_usage = VPX_CBR;
+      cfg_.rc_buf_sz = 1000;
+      cfg_.rc_buf_initial_sz = 500;
+      cfg_.rc_buf_optimal_sz = 600;
+    }
+    dec_cfg_.threads = 4;
+  }
+
+  virtual void BeginPassHook(unsigned int) {
+    min_arf_ = ARF_NOT_SEEN;
+    run_of_visible_frames_ = 0;
+  }
+
+  int GetNumFramesInPkt(const vpx_codec_cx_pkt_t *pkt) {
+    const uint8_t *buffer = reinterpret_cast<uint8_t*>(pkt->data.frame.buf);
+    const uint8_t marker = buffer[pkt->data.frame.sz - 1];
+    const int mag = ((marker >> 3) & 3) + 1;
+    int frames = (marker & 0x7) + 1;
+    const unsigned int index_sz = 2 + mag  * frames;
+    // Check for superframe or not.
+    // Assume superframe has only one visible frame, the rest being
+    // invisible. If superframe index is not found, then there is only
+    // one frame.
+    if (!((marker & 0xe0) == 0xc0 &&
+          pkt->data.frame.sz >= index_sz &&
+          buffer[pkt->data.frame.sz - index_sz] == marker)) {
+      frames = 1;
+    }
+    return frames;
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
+      return;
+    const int frames = GetNumFramesInPkt(pkt);
+    if (frames == 1) {
+      run_of_visible_frames_++;
+    } else if (frames == 2) {
+      if (min_arf_ == ARF_NOT_SEEN) {
+        min_arf_ = ARF_SEEN_ONCE;
+      } else if (min_arf_ == ARF_SEEN_ONCE ||
+                 run_of_visible_frames_ < min_arf_) {
+        min_arf_ = run_of_visible_frames_;
+      }
+      run_of_visible_frames_ = 1;
+    } else {
+      min_arf_ = 0;
+      run_of_visible_frames_ = 1;
+    }
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
+      encoder->Control(VP8E_SET_CPUUSED, test_encode_param_.cpu_used);
+      encoder->Control(VP9E_SET_MIN_GF_INTERVAL, min_arf_requested_);
+      if (test_encode_param_.mode != ::libvpx_test::kRealTime) {
+        encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+        encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
+        encoder->Control(VP8E_SET_ARNR_STRENGTH, 5);
+        encoder->Control(VP8E_SET_ARNR_TYPE, 3);
+      }
+    }
+  }
+
+  int GetMinArfDistance() const {
+    return min_arf_;
+  }
+
+  int GetMinArfDistanceRequested() const {
+    if (min_arf_requested_)
+      return min_arf_requested_;
+    else
+      return vp9_rc_get_default_min_gf_interval(
+          test_video_param_.width, test_video_param_.height,
+          (double)test_video_param_.framerate_num /
+          test_video_param_.framerate_den);
+  }
+
+  TestVideoParam test_video_param_;
+  TestEncodeParam test_encode_param_;
+
+ private:
+  int min_arf_requested_;
+  int min_arf_;
+  int run_of_visible_frames_;
+};
+
+TEST_P(ArfFreqTest, MinArfFreqTest) {
+  cfg_.rc_target_bitrate = kBitrate;
+  cfg_.g_error_resilient = 0;
+  cfg_.g_profile = test_video_param_.profile;
+  cfg_.g_input_bit_depth = test_video_param_.input_bit_depth;
+  cfg_.g_bit_depth = test_video_param_.bit_depth;
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  if (cfg_.g_bit_depth > 8)
+    init_flags_ |= VPX_CODEC_USE_HIGHBITDEPTH;
+
+  libvpx_test::VideoSource *video;
+  if (is_extension_y4m(test_video_param_.filename)) {
+    video = new libvpx_test::Y4mVideoSource(test_video_param_.filename,
+                                            0, kFrames);
+  } else {
+    video = new libvpx_test::YUVVideoSource(test_video_param_.filename,
+                                            test_video_param_.fmt,
+                                            test_video_param_.width,
+                                            test_video_param_.height,
+                                            test_video_param_.framerate_num,
+                                            test_video_param_.framerate_den,
+                                            0, kFrames);
+  }
+
+  ASSERT_NO_FATAL_FAILURE(RunLoop(video));
+  const int min_arf_dist = GetMinArfDistance();
+  const int min_arf_dist_requested = GetMinArfDistanceRequested();
+  if (min_arf_dist != ARF_NOT_SEEN && min_arf_dist != ARF_SEEN_ONCE) {
+    EXPECT_GE(min_arf_dist, min_arf_dist_requested);
+  }
+  delete(video);
+}
+
+VP9_INSTANTIATE_TEST_CASE(
+    ArfFreqTest,
+    ::testing::ValuesIn(kTestVectors),
+    ::testing::ValuesIn(kEncodeVectors),
+    ::testing::ValuesIn(kMinArfVectors));
+}  // namespace
diff --git a/test/vp9_avg_test.cc b/test/vp9_avg_test.cc
index 139c412..856b4c1 100644
--- a/test/vp9_avg_test.cc
+++ b/test/vp9_avg_test.cc
@@ -286,6 +286,11 @@
         make_tuple(16, 16, 5, 8, &vp9_avg_8x8_neon),
         make_tuple(32, 32, 15, 8, &vp9_avg_8x8_neon)));
 
+INSTANTIATE_TEST_CASE_P(
+    NEON, IntProRowTest, ::testing::Values(
+        make_tuple(16, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+        make_tuple(32, &vp9_int_pro_row_neon, &vp9_int_pro_row_c),
+        make_tuple(64, &vp9_int_pro_row_neon, &vp9_int_pro_row_c)));
 #endif
 
 #if HAVE_MSA
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index fabb438..a1798d7 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -14,6 +14,7 @@
 #include "test/register_state_check.h"
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "vp9/common/vp9_blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
@@ -89,15 +90,15 @@
 }
 
 INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_c));
+                        ::testing::Values(vpx_subtract_block_c));
 
 #if HAVE_SSE2 && CONFIG_USE_X86INC
 INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_sse2));
+                        ::testing::Values(vpx_subtract_block_sse2));
 #endif
 #if HAVE_NEON
 INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vp9_subtract_block_neon));
+                        ::testing::Values(vpx_subtract_block_neon));
 #endif
 
 }  // namespace vp9
diff --git a/test/vp9_thread_test.cc b/test/vp9_thread_test.cc
index bde00ea..233e1b1 100644
--- a/test/vp9_thread_test.cc
+++ b/test/vp9_thread_test.cc
@@ -18,7 +18,7 @@
 #if CONFIG_WEBM_IO
 #include "test/webm_video_source.h"
 #endif
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 namespace {
 
diff --git a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c b/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
deleted file mode 100644
index 3c8ed11..0000000
--- a/vp8/common/arm/neon/vp8_subpixelvariance_neon.c
+++ /dev/null
@@ -1,1017 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-static const uint8_t bilinear_taps_coeff[8][2] = {
-    {128,   0},
-    {112,  16},
-    { 96,  32},
-    { 80,  48},
-    { 64,  64},
-    { 48,  80},
-    { 32,  96},
-    { 16, 112}
-};
-
-unsigned int vp8_sub_pixel_variance16x16_neon_func(
-        const unsigned char *src_ptr,
-        int src_pixels_per_line,
-        int xoffset,
-        int yoffset,
-        const unsigned char *dst_ptr,
-        int dst_pixels_per_line,
-        unsigned int *sse) {
-    int i;
-    DECLARE_ALIGNED(16, unsigned char, tmp[528]);
-    unsigned char *tmpp;
-    unsigned char *tmpp2;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
-    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
-    uint8x8_t d19u8, d20u8, d21u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64, d2s64, d3s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
-    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
-    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
-    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    tmpp2 = tmp + 272;
-    tmpp = tmp;
-    if (xoffset == 0) {  // secondpass_bfilter16x16_only
-        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
-        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
-
-        q11u8 = vld1q_u8(src_ptr);
-        src_ptr += src_pixels_per_line;
-        for (i = 4; i > 0; i--) {
-            q12u8 = vld1q_u8(src_ptr);
-            src_ptr += src_pixels_per_line;
-            q13u8 = vld1q_u8(src_ptr);
-            src_ptr += src_pixels_per_line;
-            q14u8 = vld1q_u8(src_ptr);
-            src_ptr += src_pixels_per_line;
-            q15u8 = vld1q_u8(src_ptr);
-            src_ptr += src_pixels_per_line;
-
-            __builtin_prefetch(src_ptr);
-            __builtin_prefetch(src_ptr + src_pixels_per_line);
-            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
-
-            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
-            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
-            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
-            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
-            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
-            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
-            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
-            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
-
-            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
-            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
-            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
-            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
-            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
-            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
-            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
-            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
-
-            d2u8 = vqrshrn_n_u16(q1u16, 7);
-            d3u8 = vqrshrn_n_u16(q2u16, 7);
-            d4u8 = vqrshrn_n_u16(q3u16, 7);
-            d5u8 = vqrshrn_n_u16(q4u16, 7);
-            d6u8 = vqrshrn_n_u16(q5u16, 7);
-            d7u8 = vqrshrn_n_u16(q6u16, 7);
-            d8u8 = vqrshrn_n_u16(q7u16, 7);
-            d9u8 = vqrshrn_n_u16(q8u16, 7);
-
-            q1u8 = vcombine_u8(d2u8, d3u8);
-            q2u8 = vcombine_u8(d4u8, d5u8);
-            q3u8 = vcombine_u8(d6u8, d7u8);
-            q4u8 = vcombine_u8(d8u8, d9u8);
-
-            q11u8 = q15u8;
-
-            vst1q_u8((uint8_t *)tmpp2, q1u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q2u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q3u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q4u8);
-            tmpp2 += 16;
-        }
-    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
-        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
-
-        for (i = 4; i > 0 ; i--) {
-            d2u8 = vld1_u8(src_ptr);
-            d3u8 = vld1_u8(src_ptr + 8);
-            d4u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-            d5u8 = vld1_u8(src_ptr);
-            d6u8 = vld1_u8(src_ptr + 8);
-            d7u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-            d8u8 = vld1_u8(src_ptr);
-            d9u8 = vld1_u8(src_ptr + 8);
-            d10u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-            d11u8 = vld1_u8(src_ptr);
-            d12u8 = vld1_u8(src_ptr + 8);
-            d13u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-
-            __builtin_prefetch(src_ptr);
-            __builtin_prefetch(src_ptr + src_pixels_per_line);
-            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);
-
-            q7u16  = vmull_u8(d2u8, d0u8);
-            q8u16  = vmull_u8(d3u8, d0u8);
-            q9u16  = vmull_u8(d5u8, d0u8);
-            q10u16 = vmull_u8(d6u8, d0u8);
-            q11u16 = vmull_u8(d8u8, d0u8);
-            q12u16 = vmull_u8(d9u8, d0u8);
-            q13u16 = vmull_u8(d11u8, d0u8);
-            q14u16 = vmull_u8(d12u8, d0u8);
-
-            d2u8  = vext_u8(d2u8, d3u8, 1);
-            d5u8  = vext_u8(d5u8, d6u8, 1);
-            d8u8  = vext_u8(d8u8, d9u8, 1);
-            d11u8 = vext_u8(d11u8, d12u8, 1);
-
-            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
-            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
-            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
-            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
-
-            d3u8  = vext_u8(d3u8, d4u8, 1);
-            d6u8  = vext_u8(d6u8, d7u8, 1);
-            d9u8  = vext_u8(d9u8, d10u8, 1);
-            d12u8 = vext_u8(d12u8, d13u8, 1);
-
-            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
-            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
-            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
-            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
-
-            d14u8 = vqrshrn_n_u16(q7u16, 7);
-            d15u8 = vqrshrn_n_u16(q8u16, 7);
-            d16u8 = vqrshrn_n_u16(q9u16, 7);
-            d17u8 = vqrshrn_n_u16(q10u16, 7);
-            d18u8 = vqrshrn_n_u16(q11u16, 7);
-            d19u8 = vqrshrn_n_u16(q12u16, 7);
-            d20u8 = vqrshrn_n_u16(q13u16, 7);
-            d21u8 = vqrshrn_n_u16(q14u16, 7);
-
-            q7u8  = vcombine_u8(d14u8, d15u8);
-            q8u8  = vcombine_u8(d16u8, d17u8);
-            q9u8  = vcombine_u8(d18u8, d19u8);
-            q10u8 = vcombine_u8(d20u8, d21u8);
-
-            vst1q_u8((uint8_t *)tmpp2, q7u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q8u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q9u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q10u8);
-            tmpp2 += 16;
-        }
-    } else {
-        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
-        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);
-
-        d2u8 = vld1_u8(src_ptr);
-        d3u8 = vld1_u8(src_ptr + 8);
-        d4u8 = vld1_u8(src_ptr + 16);
-        src_ptr += src_pixels_per_line;
-        d5u8 = vld1_u8(src_ptr);
-        d6u8 = vld1_u8(src_ptr + 8);
-        d7u8 = vld1_u8(src_ptr + 16);
-        src_ptr += src_pixels_per_line;
-        d8u8 = vld1_u8(src_ptr);
-        d9u8 = vld1_u8(src_ptr + 8);
-        d10u8 = vld1_u8(src_ptr + 16);
-        src_ptr += src_pixels_per_line;
-        d11u8 = vld1_u8(src_ptr);
-        d12u8 = vld1_u8(src_ptr + 8);
-        d13u8 = vld1_u8(src_ptr + 16);
-        src_ptr += src_pixels_per_line;
-
-        // First Pass: output_height lines x output_width columns (17x16)
-        for (i = 3; i > 0; i--) {
-            q7u16  = vmull_u8(d2u8, d0u8);
-            q8u16  = vmull_u8(d3u8, d0u8);
-            q9u16  = vmull_u8(d5u8, d0u8);
-            q10u16 = vmull_u8(d6u8, d0u8);
-            q11u16 = vmull_u8(d8u8, d0u8);
-            q12u16 = vmull_u8(d9u8, d0u8);
-            q13u16 = vmull_u8(d11u8, d0u8);
-            q14u16 = vmull_u8(d12u8, d0u8);
-
-            d2u8  = vext_u8(d2u8, d3u8, 1);
-            d5u8  = vext_u8(d5u8, d6u8, 1);
-            d8u8  = vext_u8(d8u8, d9u8, 1);
-            d11u8 = vext_u8(d11u8, d12u8, 1);
-
-            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
-            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
-            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
-            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);
-
-            d3u8  = vext_u8(d3u8, d4u8, 1);
-            d6u8  = vext_u8(d6u8, d7u8, 1);
-            d9u8  = vext_u8(d9u8, d10u8, 1);
-            d12u8 = vext_u8(d12u8, d13u8, 1);
-
-            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
-            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
-            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
-            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);
-
-            d14u8 = vqrshrn_n_u16(q7u16, 7);
-            d15u8 = vqrshrn_n_u16(q8u16, 7);
-            d16u8 = vqrshrn_n_u16(q9u16, 7);
-            d17u8 = vqrshrn_n_u16(q10u16, 7);
-            d18u8 = vqrshrn_n_u16(q11u16, 7);
-            d19u8 = vqrshrn_n_u16(q12u16, 7);
-            d20u8 = vqrshrn_n_u16(q13u16, 7);
-            d21u8 = vqrshrn_n_u16(q14u16, 7);
-
-            d2u8 = vld1_u8(src_ptr);
-            d3u8 = vld1_u8(src_ptr + 8);
-            d4u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-            d5u8 = vld1_u8(src_ptr);
-            d6u8 = vld1_u8(src_ptr + 8);
-            d7u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-            d8u8 = vld1_u8(src_ptr);
-            d9u8 = vld1_u8(src_ptr + 8);
-            d10u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-            d11u8 = vld1_u8(src_ptr);
-            d12u8 = vld1_u8(src_ptr + 8);
-            d13u8 = vld1_u8(src_ptr + 16);
-            src_ptr += src_pixels_per_line;
-
-            q7u8 = vcombine_u8(d14u8, d15u8);
-            q8u8 = vcombine_u8(d16u8, d17u8);
-            q9u8 = vcombine_u8(d18u8, d19u8);
-            q10u8 = vcombine_u8(d20u8, d21u8);
-
-            vst1q_u8((uint8_t *)tmpp, q7u8);
-            tmpp += 16;
-            vst1q_u8((uint8_t *)tmpp, q8u8);
-            tmpp += 16;
-            vst1q_u8((uint8_t *)tmpp, q9u8);
-            tmpp += 16;
-            vst1q_u8((uint8_t *)tmpp, q10u8);
-            tmpp += 16;
-        }
-
-        // First-pass filtering for rest 5 lines
-        d14u8 = vld1_u8(src_ptr);
-        d15u8 = vld1_u8(src_ptr + 8);
-        d16u8 = vld1_u8(src_ptr + 16);
-        src_ptr += src_pixels_per_line;
-
-        q9u16  = vmull_u8(d2u8, d0u8);
-        q10u16 = vmull_u8(d3u8, d0u8);
-        q11u16 = vmull_u8(d5u8, d0u8);
-        q12u16 = vmull_u8(d6u8, d0u8);
-        q13u16 = vmull_u8(d8u8, d0u8);
-        q14u16 = vmull_u8(d9u8, d0u8);
-
-        d2u8  = vext_u8(d2u8, d3u8, 1);
-        d5u8  = vext_u8(d5u8, d6u8, 1);
-        d8u8  = vext_u8(d8u8, d9u8, 1);
-
-        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
-        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
-        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);
-
-        d3u8  = vext_u8(d3u8, d4u8, 1);
-        d6u8  = vext_u8(d6u8, d7u8, 1);
-        d9u8  = vext_u8(d9u8, d10u8, 1);
-
-        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
-        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
-        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);
-
-        q1u16 = vmull_u8(d11u8, d0u8);
-        q2u16 = vmull_u8(d12u8, d0u8);
-        q3u16 = vmull_u8(d14u8, d0u8);
-        q4u16 = vmull_u8(d15u8, d0u8);
-
-        d11u8 = vext_u8(d11u8, d12u8, 1);
-        d14u8 = vext_u8(d14u8, d15u8, 1);
-
-        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
-        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);
-
-        d12u8 = vext_u8(d12u8, d13u8, 1);
-        d15u8 = vext_u8(d15u8, d16u8, 1);
-
-        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
-        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);
-
-        d10u8 = vqrshrn_n_u16(q9u16, 7);
-        d11u8 = vqrshrn_n_u16(q10u16, 7);
-        d12u8 = vqrshrn_n_u16(q11u16, 7);
-        d13u8 = vqrshrn_n_u16(q12u16, 7);
-        d14u8 = vqrshrn_n_u16(q13u16, 7);
-        d15u8 = vqrshrn_n_u16(q14u16, 7);
-        d16u8 = vqrshrn_n_u16(q1u16, 7);
-        d17u8 = vqrshrn_n_u16(q2u16, 7);
-        d18u8 = vqrshrn_n_u16(q3u16, 7);
-        d19u8 = vqrshrn_n_u16(q4u16, 7);
-
-        q5u8 = vcombine_u8(d10u8, d11u8);
-        q6u8 = vcombine_u8(d12u8, d13u8);
-        q7u8 = vcombine_u8(d14u8, d15u8);
-        q8u8 = vcombine_u8(d16u8, d17u8);
-        q9u8 = vcombine_u8(d18u8, d19u8);
-
-        vst1q_u8((uint8_t *)tmpp, q5u8);
-        tmpp += 16;
-        vst1q_u8((uint8_t *)tmpp, q6u8);
-        tmpp += 16;
-        vst1q_u8((uint8_t *)tmpp, q7u8);
-        tmpp += 16;
-        vst1q_u8((uint8_t *)tmpp, q8u8);
-        tmpp += 16;
-        vst1q_u8((uint8_t *)tmpp, q9u8);
-
-        // secondpass_filter
-        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
-        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);
-
-        tmpp = tmp;
-        tmpp2 = tmpp + 272;
-        q11u8 = vld1q_u8(tmpp);
-        tmpp += 16;
-        for (i = 4; i > 0; i--) {
-            q12u8 = vld1q_u8(tmpp);
-            tmpp += 16;
-            q13u8 = vld1q_u8(tmpp);
-            tmpp += 16;
-            q14u8 = vld1q_u8(tmpp);
-            tmpp += 16;
-            q15u8 = vld1q_u8(tmpp);
-            tmpp += 16;
-
-            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
-            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
-            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
-            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
-            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
-            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
-            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
-            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);
-
-            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
-            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
-            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
-            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
-            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
-            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
-            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
-            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);
-
-            d2u8 = vqrshrn_n_u16(q1u16, 7);
-            d3u8 = vqrshrn_n_u16(q2u16, 7);
-            d4u8 = vqrshrn_n_u16(q3u16, 7);
-            d5u8 = vqrshrn_n_u16(q4u16, 7);
-            d6u8 = vqrshrn_n_u16(q5u16, 7);
-            d7u8 = vqrshrn_n_u16(q6u16, 7);
-            d8u8 = vqrshrn_n_u16(q7u16, 7);
-            d9u8 = vqrshrn_n_u16(q8u16, 7);
-
-            q1u8 = vcombine_u8(d2u8, d3u8);
-            q2u8 = vcombine_u8(d4u8, d5u8);
-            q3u8 = vcombine_u8(d6u8, d7u8);
-            q4u8 = vcombine_u8(d8u8, d9u8);
-
-            q11u8 = q15u8;
-
-            vst1q_u8((uint8_t *)tmpp2, q1u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q2u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q3u8);
-            tmpp2 += 16;
-            vst1q_u8((uint8_t *)tmpp2, q4u8);
-            tmpp2 += 16;
-        }
-    }
-
-    // sub_pixel_variance16x16_neon
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    tmpp = tmp + 272;
-    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
-        q0u8 = vld1q_u8(tmpp);
-        tmpp += 16;
-        q1u8 = vld1q_u8(tmpp);
-        tmpp += 16;
-        q2u8 = vld1q_u8(dst_ptr);
-        dst_ptr += dst_pixels_per_line;
-        q3u8 = vld1q_u8(dst_ptr);
-        dst_ptr += dst_pixels_per_line;
-
-        d0u8 = vget_low_u8(q0u8);
-        d1u8 = vget_high_u8(q0u8);
-        d2u8 = vget_low_u8(q1u8);
-        d3u8 = vget_high_u8(q1u8);
-
-        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
-        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
-        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vget_low_s64(q0s64);
-    d1s64 = vget_high_s64(q0s64);
-    d2s64 = vget_low_s64(q1s64);
-    d3s64 = vget_high_s64(q1s64);
-    d0s64 = vadd_s64(d0s64, d1s64);
-    d1s64 = vadd_s64(d2s64, d3s64);
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_h_neon(
-        const unsigned char *src_ptr,
-        int  source_stride,
-        const unsigned char *ref_ptr,
-        int  recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-    int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64, d2s64, d3s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8;
-    uint8x16_t q7u8, q11u8, q12u8, q13u8, q14u8;
-    uint16x8_t q0u16, q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
-        q0u8 = vld1q_u8(src_ptr);
-        q1u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-        q2u8 = vld1q_u8(src_ptr);
-        q3u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-        q4u8 = vld1q_u8(src_ptr);
-        q5u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-        q6u8 = vld1q_u8(src_ptr);
-        q7u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-
-        q11u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q12u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q13u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q14u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q1u8 = vextq_u8(q0u8, q1u8, 1);
-        q3u8 = vextq_u8(q2u8, q3u8, 1);
-        q5u8 = vextq_u8(q4u8, q5u8, 1);
-        q7u8 = vextq_u8(q6u8, q7u8, 1);
-
-        q0u8 = vrhaddq_u8(q0u8, q1u8);
-        q1u8 = vrhaddq_u8(q2u8, q3u8);
-        q2u8 = vrhaddq_u8(q4u8, q5u8);
-        q3u8 = vrhaddq_u8(q6u8, q7u8);
-
-        d0u8 = vget_low_u8(q0u8);
-        d1u8 = vget_high_u8(q0u8);
-        d2u8 = vget_low_u8(q1u8);
-        d3u8 = vget_high_u8(q1u8);
-        d4u8 = vget_low_u8(q2u8);
-        d5u8 = vget_high_u8(q2u8);
-        d6u8 = vget_low_u8(q3u8);
-        d7u8 = vget_high_u8(q3u8);
-
-        q4u16 = vsubl_u8(d0u8, vget_low_u8(q11u8));
-        q5u16 = vsubl_u8(d1u8, vget_high_u8(q11u8));
-        q6u16 = vsubl_u8(d2u8, vget_low_u8(q12u8));
-        q7u16 = vsubl_u8(d3u8, vget_high_u8(q12u8));
-        q0u16 = vsubl_u8(d4u8, vget_low_u8(q13u8));
-        q1u16 = vsubl_u8(d5u8, vget_high_u8(q13u8));
-        q2u16 = vsubl_u8(d6u8, vget_low_u8(q14u8));
-        q3u16 = vsubl_u8(d7u8, vget_high_u8(q14u8));
-
-        d8s16 = vreinterpret_s16_u16(vget_low_u16(q4u16));
-        d9s16 = vreinterpret_s16_u16(vget_high_u16(q4u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q4u16));
-        q9s32 = vmlal_s16(q9s32, d8s16, d8s16);
-        q10s32 = vmlal_s16(q10s32, d9s16, d9s16);
-        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
-        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q5u16));
-        q9s32 = vmlal_s16(q9s32, d10s16, d10s16);
-        q10s32 = vmlal_s16(q10s32, d11s16, d11s16);
-        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
-        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q6u16));
-        q9s32 = vmlal_s16(q9s32, d12s16, d12s16);
-        q10s32 = vmlal_s16(q10s32, d13s16, d13s16);
-        d14s16 = vreinterpret_s16_u16(vget_low_u16(q7u16));
-        d15s16 = vreinterpret_s16_u16(vget_high_u16(q7u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q7u16));
-        q9s32 = vmlal_s16(q9s32, d14s16, d14s16);
-        q10s32 = vmlal_s16(q10s32, d15s16, d15s16);
-        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
-        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
-        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
-        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
-        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
-        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
-        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
-        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
-        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
-        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
-        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
-        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
-        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
-        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
-        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
-        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vget_low_s64(q0s64);
-    d1s64 = vget_high_s64(q0s64);
-    d2s64 = vget_low_s64(q1s64);
-    d3s64 = vget_high_s64(q1s64);
-    d0s64 = vadd_s64(d0s64, d1s64);
-    d1s64 = vadd_s64(d2s64, d3s64);
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_v_neon(
-        const unsigned char *src_ptr,
-        int  source_stride,
-        const unsigned char *ref_ptr,
-        int  recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d4u8, d5u8, d8u8, d9u8, d12u8, d13u8;
-    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
-    int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64, d2s64, d3s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q15u8;
-    uint16x8_t q0u16, q1u16, q2u16, q3u16, q11u16, q12u16, q13u16, q14u16;
-    int32x4_t q8s32, q9s32, q10s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q8s32 = vdupq_n_s32(0);
-    q9s32 = vdupq_n_s32(0);
-    q10s32 = vdupq_n_s32(0);
-
-    q0u8 = vld1q_u8(src_ptr);
-    src_ptr += source_stride;
-    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
-        q2u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q4u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q6u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-        q15u8 = vld1q_u8(src_ptr);
-        src_ptr += source_stride;
-
-        q1u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q3u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q5u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q7u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        q0u8 = vrhaddq_u8(q0u8, q2u8);
-        q2u8 = vrhaddq_u8(q2u8, q4u8);
-        q4u8 = vrhaddq_u8(q4u8, q6u8);
-        q6u8 = vrhaddq_u8(q6u8, q15u8);
-
-        d0u8  = vget_low_u8(q0u8);
-        d1u8  = vget_high_u8(q0u8);
-        d4u8  = vget_low_u8(q2u8);
-        d5u8  = vget_high_u8(q2u8);
-        d8u8  = vget_low_u8(q4u8);
-        d9u8  = vget_high_u8(q4u8);
-        d12u8 = vget_low_u8(q6u8);
-        d13u8 = vget_high_u8(q6u8);
-
-        q11u16 = vsubl_u8(d0u8, vget_low_u8(q1u8));
-        q12u16 = vsubl_u8(d1u8, vget_high_u8(q1u8));
-        q13u16 = vsubl_u8(d4u8, vget_low_u8(q3u8));
-        q14u16 = vsubl_u8(d5u8, vget_high_u8(q3u8));
-        q0u16  = vsubl_u8(d8u8, vget_low_u8(q5u8));
-        q1u16  = vsubl_u8(d9u8, vget_high_u8(q5u8));
-        q2u16  = vsubl_u8(d12u8, vget_low_u8(q7u8));
-        q3u16  = vsubl_u8(d13u8, vget_high_u8(q7u8));
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
-        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
-        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
-        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
-        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
-        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
-        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
-        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
-        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);
-        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
-        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
-        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
-        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
-        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
-        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q0u16));
-        q9s32 = vmlal_s16(q9s32, d0s16, d0s16);
-        q10s32 = vmlal_s16(q10s32, d1s16, d1s16);
-        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
-        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q1u16));
-        q9s32 = vmlal_s16(q9s32, d2s16, d2s16);
-        q10s32 = vmlal_s16(q10s32, d3s16, d3s16);
-        d4s16 = vreinterpret_s16_u16(vget_low_u16(q2u16));
-        d5s16 = vreinterpret_s16_u16(vget_high_u16(q2u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q2u16));
-        q9s32 = vmlal_s16(q9s32, d4s16, d4s16);
-        q10s32 = vmlal_s16(q10s32, d5s16, d5s16);
-        d6s16 = vreinterpret_s16_u16(vget_low_u16(q3u16));
-        d7s16 = vreinterpret_s16_u16(vget_high_u16(q3u16));
-        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q3u16));
-        q9s32 = vmlal_s16(q9s32, d6s16, d6s16);
-        q10s32 = vmlal_s16(q10s32, d7s16, d7s16);
-
-        q0u8 = q15u8;
-    }
-
-    q10s32 = vaddq_s32(q10s32, q9s32);
-    q0s64 = vpaddlq_s32(q8s32);
-    q1s64 = vpaddlq_s32(q10s32);
-
-    d0s64 = vget_low_s64(q0s64);
-    d1s64 = vget_high_s64(q0s64);
-    d2s64 = vget_low_s64(q1s64);
-    d3s64 = vget_high_s64(q1s64);
-    d0s64 = vadd_s64(d0s64, d1s64);
-    d1s64 = vadd_s64(d2s64, d3s64);
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-unsigned int vp8_variance_halfpixvar16x16_hv_neon(
-        const unsigned char *src_ptr,
-        int  source_stride,
-        const unsigned char *ref_ptr,
-        int  recon_stride,
-        unsigned int *sse) {
-    int i;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
-    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
-    uint32x2_t d0u32, d10u32;
-    int64x1_t d0s64, d1s64, d2s64, d3s64;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
-    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
-    int32x4_t q13s32, q14s32, q15s32;
-    int64x2_t q0s64, q1s64, q5s64;
-
-    q13s32 = vdupq_n_s32(0);
-    q14s32 = vdupq_n_s32(0);
-    q15s32 = vdupq_n_s32(0);
-
-    q0u8 = vld1q_u8(src_ptr);
-    q1u8 = vld1q_u8(src_ptr + 16);
-    src_ptr += source_stride;
-    q1u8 = vextq_u8(q0u8, q1u8, 1);
-    q0u8 = vrhaddq_u8(q0u8, q1u8);
-    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
-        q2u8 = vld1q_u8(src_ptr);
-        q3u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-        q4u8 = vld1q_u8(src_ptr);
-        q5u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-        q6u8 = vld1q_u8(src_ptr);
-        q7u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-        q8u8 = vld1q_u8(src_ptr);
-        q9u8 = vld1q_u8(src_ptr + 16);
-        src_ptr += source_stride;
-
-        q3u8 = vextq_u8(q2u8, q3u8, 1);
-        q5u8 = vextq_u8(q4u8, q5u8, 1);
-        q7u8 = vextq_u8(q6u8, q7u8, 1);
-        q9u8 = vextq_u8(q8u8, q9u8, 1);
-
-        q1u8 = vrhaddq_u8(q2u8, q3u8);
-        q2u8 = vrhaddq_u8(q4u8, q5u8);
-        q3u8 = vrhaddq_u8(q6u8, q7u8);
-        q4u8 = vrhaddq_u8(q8u8, q9u8);
-        q0u8 = vrhaddq_u8(q0u8, q1u8);
-        q1u8 = vrhaddq_u8(q1u8, q2u8);
-        q2u8 = vrhaddq_u8(q2u8, q3u8);
-        q3u8 = vrhaddq_u8(q3u8, q4u8);
-
-        q5u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q6u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q7u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-        q8u8 = vld1q_u8(ref_ptr);
-        ref_ptr += recon_stride;
-
-        d0u8 = vget_low_u8(q0u8);
-        d1u8 = vget_high_u8(q0u8);
-        d2u8 = vget_low_u8(q1u8);
-        d3u8 = vget_high_u8(q1u8);
-        d4u8 = vget_low_u8(q2u8);
-        d5u8 = vget_high_u8(q2u8);
-        d6u8 = vget_low_u8(q3u8);
-        d7u8 = vget_high_u8(q3u8);
-
-        q9u16  = vsubl_u8(d0u8, vget_low_u8(q5u8));
-        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
-        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
-        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
-        q0u16  = vsubl_u8(d4u8, vget_low_u8(q7u8));
-        q1u16  = vsubl_u8(d5u8, vget_high_u8(q7u8));
-        q5u16  = vsubl_u8(d6u8, vget_low_u8(q8u8));
-        q6u16  = vsubl_u8(d7u8, vget_high_u8(q8u8));
-
-        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
-        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
-        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
-        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);
-
-        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
-        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
-        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
-        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);
-
-        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
-        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
-        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
-        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);
-
-        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
-        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
-        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
-        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);
-
-        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
-        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
-        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
-        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);
-
-        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
-        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
-        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
-        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);
-
-        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
-        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
-        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
-        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);
-
-        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
-        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
-        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
-        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
-        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);
-
-        q0u8 = q4u8;
-    }
-
-    q15s32 = vaddq_s32(q14s32, q15s32);
-    q0s64 = vpaddlq_s32(q13s32);
-    q1s64 = vpaddlq_s32(q15s32);
-
-    d0s64 = vget_low_s64(q0s64);
-    d1s64 = vget_high_s64(q0s64);
-    d2s64 = vget_low_s64(q1s64);
-    d3s64 = vget_high_s64(q1s64);
-    d0s64 = vadd_s64(d0s64, d1s64);
-    d1s64 = vadd_s64(d2s64, d3s64);
-
-    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
-                      vreinterpret_s32_s64(d0s64));
-    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);
-
-    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
-    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);
-
-    return vget_lane_u32(d0u32, 0);
-}
-
-#define FILTER_BITS 7
-
-static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
-  const int32x4_t a = vpaddlq_s16(v_16x8);
-  const int64x2_t b = vpaddlq_s32(a);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
-  const int64x2_t b = vpaddlq_s32(v_32x4);
-  const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
-                               vreinterpret_s32_s64(vget_high_s64(b)));
-  return vget_lane_s32(c, 0);
-}
-
-static void variance_neon_w8(const uint8_t *a, int a_stride,
-                             const uint8_t *b, int b_stride,
-                             int w, int h, unsigned int *sse, int *sum) {
-  int i, j;
-  int16x8_t v_sum = vdupq_n_s16(0);
-  int32x4_t v_sse_lo = vdupq_n_s32(0);
-  int32x4_t v_sse_hi = vdupq_n_s32(0);
-
-  for (i = 0; i < h; ++i) {
-    for (j = 0; j < w; j += 8) {
-      const uint8x8_t v_a = vld1_u8(&a[j]);
-      const uint8x8_t v_b = vld1_u8(&b[j]);
-      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
-      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
-      v_sum = vaddq_s16(v_sum, sv_diff);
-      v_sse_lo = vmlal_s16(v_sse_lo,
-                           vget_low_s16(sv_diff),
-                           vget_low_s16(sv_diff));
-      v_sse_hi = vmlal_s16(v_sse_hi,
-                           vget_high_s16(sv_diff),
-                           vget_high_s16(sv_diff));
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-
-  *sum = horizontal_add_s16x8(v_sum);
-  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
-}
-
-static unsigned int variance8x8_neon(const uint8_t *a, int a_stride,
-                                     const uint8_t *b, int b_stride,
-                                     unsigned int *sse) {
-  int sum;
-  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, sse, &sum);
-  return *sse - (((int64_t)sum * sum) / (8 * 8));
-}
-
-static void var_filter_block2d_bil_w8(const uint8_t *src_ptr,
-                                      uint8_t *output_ptr,
-                                      unsigned int src_pixels_per_line,
-                                      int pixel_step,
-                                      unsigned int output_height,
-                                      unsigned int output_width,
-                                      const uint8_t *vpx_filter) {
-  const uint8x8_t f0 = vmov_n_u8(vpx_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(vpx_filter[1]);
-  unsigned int i;
-  for (i = 0; i < output_height; ++i) {
-    const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
-    const uint8x8_t src_1 = vld1_u8(&src_ptr[pixel_step]);
-    const uint16x8_t a = vmull_u8(src_0, f0);
-    const uint16x8_t b = vmlal_u8(a, src_1, f1);
-    const uint8x8_t out = vrshrn_n_u16(b, FILTER_BITS);
-    vst1_u8(&output_ptr[0], out);
-    // Next row...
-    src_ptr += src_pixels_per_line;
-    output_ptr += output_width;
-  }
-}
-
-unsigned int vp8_sub_pixel_variance8x8_neon(
-        const unsigned char *src,
-        int src_stride,
-        int xoffset,
-        int yoffset,
-        const unsigned char *dst,
-        int dst_stride,
-        unsigned int *sse) {
-  DECLARE_ALIGNED(16, uint8_t, temp2[9 * 8]);
-  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);
-  if (xoffset == 0) {
-    var_filter_block2d_bil_w8(src, temp2, src_stride, 8, 8,
-                              8, bilinear_taps_coeff[yoffset]);
-  } else if (yoffset == 0) {
-    var_filter_block2d_bil_w8(src, temp2, src_stride, 1,
-                              9, 8,
-                              bilinear_taps_coeff[xoffset]);
-  } else {
-    var_filter_block2d_bil_w8(src, fdata3, src_stride, 1,
-                              9, 8,
-                              bilinear_taps_coeff[xoffset]);
-    var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8,
-                              8, bilinear_taps_coeff[yoffset]);
-  }
-  return variance8x8_neon(temp2, 8, dst, dst_stride, sse);
-}
diff --git a/vp8/common/arm/variance_arm.c b/vp8/common/arm/variance_arm.c
deleted file mode 100644
index 0f293f0..0000000
--- a/vp8/common/arm/variance_arm.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "./vp8_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vp8/common/variance.h"
-#include "vp8/common/filter.h"
-
-// TODO(johannkoenig): Move this to vpx_dsp or vp8/encoder
-#if CONFIG_VP8_ENCODER
-
-#if HAVE_MEDIA
-#include "vp8/common/arm/bilinearfilter_arm.h"
-
-unsigned int vp8_sub_pixel_variance8x8_armv6
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short first_pass[10*8];
-    unsigned char  second_pass[8*8];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                            src_pixels_per_line,
-                                            9, 8, HFilter);
-    vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                             8, 8, 8, VFilter);
-
-    return vpx_variance8x8_media(second_pass, 8, dst_ptr,
-                                 dst_pixels_per_line, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_armv6
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short first_pass[36*16];
-    unsigned char  second_pass[20*16];
-    const short *HFilter, *VFilter;
-    unsigned int var;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        var = vp8_variance_halfpixvar16x16_h_armv6(src_ptr, src_pixels_per_line,
-                                                   dst_ptr, dst_pixels_per_line, sse);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        var = vp8_variance_halfpixvar16x16_v_armv6(src_ptr, src_pixels_per_line,
-                                                   dst_ptr, dst_pixels_per_line, sse);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        var = vp8_variance_halfpixvar16x16_hv_armv6(src_ptr, src_pixels_per_line,
-                                                   dst_ptr, dst_pixels_per_line, sse);
-    }
-    else
-    {
-        HFilter = vp8_bilinear_filters[xoffset];
-        VFilter = vp8_bilinear_filters[yoffset];
-
-        vp8_filter_block2d_bil_first_pass_armv6(src_ptr, first_pass,
-                                                src_pixels_per_line,
-                                                17, 16, HFilter);
-        vp8_filter_block2d_bil_second_pass_armv6(first_pass, second_pass,
-                                                 16, 16, 16, VFilter);
-
-        var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
-                                      dst_pixels_per_line, sse);
-    }
-    return var;
-}
-
-#endif  // HAVE_MEDIA
-
-
-#if HAVE_NEON
-
-extern unsigned int vp8_sub_pixel_variance16x16_neon_func
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-);
-
-unsigned int vp8_sub_pixel_variance16x16_neon
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-  if (xoffset == 4 && yoffset == 0)
-    return vp8_variance_halfpixvar16x16_h_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == 0 && yoffset == 4)
-    return vp8_variance_halfpixvar16x16_v_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else if (xoffset == 4 && yoffset == 4)
-    return vp8_variance_halfpixvar16x16_hv_neon(src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse);
-  else
-    return vp8_sub_pixel_variance16x16_neon_func(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
-}
-
-#endif  // HAVE_NEON
-#endif  // CONFIG_VP8_ENCODER
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index 5c0680f..2bfefb1 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -20,7 +20,7 @@
 #include "./vp8_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp8/common/postproc.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
 
diff --git a/vp8/common/mips/msa/idct_msa.c b/vp8/common/mips/msa/idct_msa.c
new file mode 100644
index 0000000..e537a3f
--- /dev/null
+++ b/vp8/common/mips/msa/idct_msa.c
@@ -0,0 +1,457 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vp8/common/mips/msa/vp8_macros_msa.h"
+
+static const int32_t cospi8sqrt2minus1 = 20091;
+static const int32_t sinpi8sqrt2 = 35468;
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                        \
+    v8i16 s4_m, s5_m, s6_m, s7_m;                                        \
+                                                                         \
+    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);      \
+    ILVR_D2_SH(s6_m, s4_m, s7_m, s5_m, out0, out2);                      \
+    out1 = (v8i16)__msa_ilvl_d((v2i64)s6_m, (v2i64)s4_m);                \
+    out3 = (v8i16)__msa_ilvl_d((v2i64)s7_m, (v2i64)s5_m);                \
+}
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in)     \
+({                                                        \
+    v8i16 out_m;                                          \
+    v8i16 zero_m = { 0 };                                 \
+    v4i32 tmp1_m, tmp2_m;                                 \
+    v4i32 sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);     \
+                                                          \
+    ILVRL_H2_SW(in, zero_m, tmp1_m, tmp2_m);              \
+    tmp1_m >>= 16;                                        \
+    tmp2_m >>= 16;                                        \
+    tmp1_m = (tmp1_m * sinpi8_sqrt2_m) >> 16;             \
+    tmp2_m = (tmp2_m * sinpi8_sqrt2_m) >> 16;             \
+    out_m = __msa_pckev_h((v8i16)tmp2_m, (v8i16)tmp1_m);  \
+                                                          \
+    out_m;                                                \
+})
+
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                  \
+    v8i16 a1_m, b1_m, c1_m, d1_m;                                  \
+    v8i16 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+    v8i16 const_cospi8sqrt2minus1_m;                               \
+                                                                   \
+    const_cospi8sqrt2minus1_m = __msa_fill_h(cospi8sqrt2minus1);   \
+    a1_m = in0 + in2;                                              \
+    b1_m = in0 - in2;                                              \
+    c_tmp1_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1);     \
+    c_tmp2_m = __msa_mul_q_h(in3, const_cospi8sqrt2minus1_m);      \
+    c_tmp2_m = c_tmp2_m >> 1;                                      \
+    c_tmp2_m = in3 + c_tmp2_m;                                     \
+    c1_m = c_tmp1_m - c_tmp2_m;                                    \
+    d_tmp1_m = __msa_mul_q_h(in1, const_cospi8sqrt2minus1_m);      \
+    d_tmp1_m = d_tmp1_m >> 1;                                      \
+    d_tmp1_m = in1 + d_tmp1_m;                                     \
+    d_tmp2_m = EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3);     \
+    d1_m = d_tmp1_m + d_tmp2_m;                                    \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                  \
+    v4i32 a1_m, b1_m, c1_m, d1_m;                                  \
+    v4i32 c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                  \
+    v4i32 const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;               \
+                                                                   \
+    const_cospi8sqrt2minus1_m = __msa_fill_w(cospi8sqrt2minus1);   \
+    sinpi8_sqrt2_m = __msa_fill_w(sinpi8sqrt2);                    \
+    a1_m = in0 + in2;                                              \
+    b1_m = in0 - in2;                                              \
+    c_tmp1_m = (in1 * sinpi8_sqrt2_m) >> 16;                       \
+    c_tmp2_m = in3 + ((in3 * const_cospi8sqrt2minus1_m) >> 16);    \
+    c1_m = c_tmp1_m - c_tmp2_m;                                    \
+    d_tmp1_m = in1 + ((in1 * const_cospi8sqrt2minus1_m) >> 16);    \
+    d_tmp2_m = (in3 * sinpi8_sqrt2_m) >> 16;                       \
+    d1_m = d_tmp1_m + d_tmp2_m;                                    \
+    BUTTERFLY_4(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3);   \
+}
+
+static void idct4x4_addblk_msa(int16_t *input, uint8_t *pred,
+                               int32_t pred_stride,
+                               uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v4i32 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
+    v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
+                   25, 26, 27, 28, 29, 30, 31 };
+
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    VP8_IDCT_1D_W(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    VP8_IDCT_1D_W(hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SW(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+               res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
+static void idct4x4_addconst_msa(int16_t in_dc, uint8_t *pred,
+                                 int32_t pred_stride,
+                                 uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 vec;
+    v8i16 res0, res1, res2, res3;
+    v16i8 zero = { 0 };
+    v16i8 pred0, pred1, pred2, pred3, dest0, dest1, dest2, dest3;
+    v16i8 mask = { 0, 2, 4, 6, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 };
+
+    vec = __msa_fill_h(in_dc);
+    vec = __msa_srari_h(vec, 3);
+    LD_SB4(pred, pred_stride, pred0, pred1, pred2, pred3);
+    ILVR_B4_SH(zero, pred0, zero, pred1, zero, pred2, zero, pred3, res0, res1,
+               res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
+void vp8_short_inv_walsh4x4_msa(int16_t *input, int16_t *mb_dq_coeff)
+{
+    v8i16 input0, input1;
+    v4i32 in0, in1, in2, in3, a1, b1, c1, d1;
+    v4i32 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+
+    LD_SH2(input, 8, input0, input1);
+    UNPCK_SH_SW(input0, in0, in1);
+    UNPCK_SH_SW(input1, in2, in3);
+    BUTTERFLY_4(in0, in1, in2, in3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, hz0, hz1, hz3, hz2);
+    TRANSPOSE4x4_SW_SW(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    BUTTERFLY_4(hz0, hz1, hz2, hz3, a1, b1, c1, d1);
+    BUTTERFLY_4(a1, d1, c1, b1, vt0, vt1, vt3, vt2);
+    ADD4(vt0, 3, vt1, 3, vt2, 3, vt3, 3, vt0, vt1, vt2, vt3);
+    SRA_4V(vt0, vt1, vt2, vt3, 3);
+    mb_dq_coeff[0] = __msa_copy_s_h((v8i16)vt0, 0);
+    mb_dq_coeff[16] = __msa_copy_s_h((v8i16)vt1, 0);
+    mb_dq_coeff[32] = __msa_copy_s_h((v8i16)vt2, 0);
+    mb_dq_coeff[48] = __msa_copy_s_h((v8i16)vt3, 0);
+    mb_dq_coeff[64] = __msa_copy_s_h((v8i16)vt0, 2);
+    mb_dq_coeff[80] = __msa_copy_s_h((v8i16)vt1, 2);
+    mb_dq_coeff[96] = __msa_copy_s_h((v8i16)vt2, 2);
+    mb_dq_coeff[112] = __msa_copy_s_h((v8i16)vt3, 2);
+    mb_dq_coeff[128] = __msa_copy_s_h((v8i16)vt0, 4);
+    mb_dq_coeff[144] = __msa_copy_s_h((v8i16)vt1, 4);
+    mb_dq_coeff[160] = __msa_copy_s_h((v8i16)vt2, 4);
+    mb_dq_coeff[176] = __msa_copy_s_h((v8i16)vt3, 4);
+    mb_dq_coeff[192] = __msa_copy_s_h((v8i16)vt0, 6);
+    mb_dq_coeff[208] = __msa_copy_s_h((v8i16)vt1, 6);
+    mb_dq_coeff[224] = __msa_copy_s_h((v8i16)vt2, 6);
+    mb_dq_coeff[240] = __msa_copy_s_h((v8i16)vt3, 6);
+}
+
+static void dequant_idct4x4_addblk_msa(int16_t *input, int16_t *dequant_input,
+                                       uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input0, input1, dequant_in0, dequant_in1, mul0, mul1;
+    v8i16 in0, in1, in2, in3;
+    v8i16 hz0_h, hz1_h, hz2_h, hz3_h;
+    v16i8 dest0, dest1, dest2, dest3;
+    v4i32 hz0_w, hz1_w, hz2_w, hz3_w;
+    v4i32 vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+    v2i64 zero = { 0 };
+    v16i8 mask = { 0, 4, 8, 12, 20, 21, 22, 23, 24,
+                   25, 26, 27, 28, 29, 30, 31 };
+
+    LD_SH2(input, 8, input0, input1);
+    LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+    MUL2(input0, dequant_in0, input1, dequant_in1, mul0, mul1);
+    PCKEV_D2_SH(zero, mul0, zero, mul1, in0, in2);
+    PCKOD_D2_SH(zero, mul0, zero, mul1, in1, in3);
+    VP8_IDCT_1D_H(in0, in1, in2, in3, hz0_h, hz1_h, hz2_h, hz3_h);
+    PCKEV_D2_SH(hz1_h, hz0_h, hz3_h, hz2_h, mul0, mul1);
+    UNPCK_SH_SW(mul0, hz0_w, hz1_w);
+    UNPCK_SH_SW(mul1, hz2_w, hz3_w);
+    TRANSPOSE4x4_SW_SW(hz0_w, hz1_w, hz2_w, hz3_w, hz0_w, hz1_w, hz2_w, hz3_w);
+    VP8_IDCT_1D_W(hz0_w, hz1_w, hz2_w, hz3_w, vt0, vt1, vt2, vt3);
+    SRARI_W4_SW(vt0, vt1, vt2, vt3, 3);
+    TRANSPOSE4x4_SW_SW(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_SB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    ILVR_B4_SW(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+               res2, res3);
+    ILVR_H4_SW(zero, res0, zero, res1, zero, res2, zero, res3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    res0 = CLIP_SW_0_255(res0);
+    res1 = CLIP_SW_0_255(res1);
+    res2 = CLIP_SW_0_255(res2);
+    res3 = CLIP_SW_0_255(res3);
+    VSHF_B2_SB(res0, dest0, res1, dest1, mask, mask, dest0, dest1);
+    VSHF_B2_SB(res2, dest2, res3, dest3, mask, mask, dest2, dest3);
+    ST_SB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
+static void dequant_idct4x4_addblk_2x_msa(int16_t *input,
+                                          int16_t *dequant_input,
+                                          uint8_t *dest, int32_t dest_stride)
+{
+    v16u8 dest0, dest1, dest2, dest3;
+    v8i16 in0, in1, in2, in3;
+    v8i16 mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+    v8i16 hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3;
+    v8i16 res0, res1, res2, res3;
+    v4i32 hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+    v4i32 vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+    v16i8 zero = { 0 };
+
+    LD_SH4(input, 8, in0, in1, in2, in3);
+    LD_SH2(dequant_input, 8, dequant_in0, dequant_in1);
+    MUL4(in0, dequant_in0, in1, dequant_in1, in2, dequant_in0, in3, dequant_in1,
+         mul0, mul1, mul2, mul3);
+    PCKEV_D2_SH(mul2, mul0, mul3, mul1, in0, in2);
+    PCKOD_D2_SH(mul2, mul0, mul3, mul1, in1, in3);
+    VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+    TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+    UNPCK_SH_SW(hz0, hz0r, hz0l);
+    UNPCK_SH_SW(hz1, hz1r, hz1l);
+    UNPCK_SH_SW(hz2, hz2r, hz2l);
+    UNPCK_SH_SW(hz3, hz3r, hz3l);
+    VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
+    SRARI_W4_SW(vt0l, vt1l, vt2l, vt3l, 3);
+    VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+    SRARI_W4_SW(vt0r, vt1r, vt2r, vt3r, 3);
+    PCKEV_H4_SH(vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r, vt0, vt1, vt2,
+                vt3);
+    TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+    LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0, res1,
+               res2, res3);
+    ADD4(res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
+                res2, res3);
+    PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
+    PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
+    ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+
+    __asm__ __volatile__(
+        "sw   $zero,    0(%[input])  \n\t"
+        "sw   $zero,    4(%[input])  \n\t"
+        "sw   $zero,    8(%[input])  \n\t"
+        "sw   $zero,   12(%[input])  \n\t"
+        "sw   $zero,   16(%[input])  \n\t"
+        "sw   $zero,   20(%[input])  \n\t"
+        "sw   $zero,   24(%[input])  \n\t"
+        "sw   $zero,   28(%[input])  \n\t"
+        "sw   $zero,   32(%[input])  \n\t"
+        "sw   $zero,   36(%[input])  \n\t"
+        "sw   $zero,   40(%[input])  \n\t"
+        "sw   $zero,   44(%[input])  \n\t"
+        "sw   $zero,   48(%[input])  \n\t"
+        "sw   $zero,   52(%[input])  \n\t"
+        "sw   $zero,   56(%[input])  \n\t"
+        "sw   $zero,   60(%[input])  \n\t"::
+
+        [input] "r"(input)
+    );
+}
+
+static void dequant_idct_addconst_2x_msa(int16_t *input, int16_t *dequant_input,
+                                         uint8_t *dest, int32_t dest_stride)
+{
+    v8i16 input_dc0, input_dc1, vec;
+    v16u8 dest0, dest1, dest2, dest3;
+    v16i8 zero = { 0 };
+    v8i16 res0, res1, res2, res3;
+
+    input_dc0 = __msa_fill_h(input[0] * dequant_input[0]);
+    input_dc1 = __msa_fill_h(input[16] * dequant_input[0]);
+    SRARI_H2_SH(input_dc0, input_dc1, 3);
+    vec = (v8i16)__msa_pckev_d((v2i64)input_dc1, (v2i64)input_dc0);
+    input[0] = 0;
+    input[16] = 0;
+    LD_UB4(dest, dest_stride, dest0, dest1, dest2, dest3);
+    ILVR_B4_SH(zero, dest0, zero, dest1, zero, dest2, zero, dest3, res0,
+               res1, res2, res3);
+    ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
+    CLIP_SH4_0_255(res0, res1, res2, res3);
+    PCKEV_B4_SH(res0, res0, res1, res1, res2, res2, res3, res3, res0, res1,
+                res2, res3);
+    PCKOD_D2_UB(dest0, res0, dest1, res1, dest0, dest1);
+    PCKOD_D2_UB(dest2, res2, dest3, res3, dest2, dest3);
+    ST_UB4(dest0, dest1, dest2, dest3, dest, dest_stride);
+}
+
+void vp8_short_idct4x4llm_msa(int16_t *input, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride)
+{
+    idct4x4_addblk_msa(input, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+void vp8_dc_only_idct_add_msa(int16_t input_dc, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride)
+{
+    idct4x4_addconst_msa(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
+void vp8_dequantize_b_msa(BLOCKD *d, int16_t *DQC)
+{
+    v8i16 dqc0, dqc1, q0, q1, dq0, dq1;
+
+    LD_SH2(DQC, 8, dqc0, dqc1);
+    LD_SH2(d->qcoeff, 8, q0, q1);
+    MUL2(dqc0, q0, dqc1, q1, dq0, dq1);
+    ST_SH2(dq0, dq1, d->dqcoeff, 8);
+}
+
+void vp8_dequant_idct_add_msa(int16_t *input, int16_t *dq,
+                              uint8_t *dest, int32_t stride)
+{
+    dequant_idct4x4_addblk_msa(input, dq, dest, stride);
+
+    __asm__ __volatile__ (
+        "sw     $zero,    0(%[input])     \n\t"
+        "sw     $zero,    4(%[input])     \n\t"
+        "sw     $zero,    8(%[input])     \n\t"
+        "sw     $zero,   12(%[input])     \n\t"
+        "sw     $zero,   16(%[input])     \n\t"
+        "sw     $zero,   20(%[input])     \n\t"
+        "sw     $zero,   24(%[input])     \n\t"
+        "sw     $zero,   28(%[input])     \n\t"
+
+        :
+        : [input] "r" (input)
+    );
+}
+
+void vp8_dequant_idct_add_y_block_msa(int16_t *q, int16_t *dq,
+                                      uint8_t *dst, int32_t stride,
+                                      char *eobs)
+{
+    int16_t *eobs_h = (int16_t *)eobs;
+    uint8_t i;
+
+    for (i = 4; i--;)
+    {
+        if (eobs_h[0])
+        {
+            if (eobs_h[0] & 0xfefe)
+            {
+                dequant_idct4x4_addblk_2x_msa(q, dq, dst, stride);
+            }
+            else
+            {
+                dequant_idct_addconst_2x_msa(q, dq, dst, stride);
+            }
+        }
+
+        q += 32;
+
+        if (eobs_h[1])
+        {
+            if (eobs_h[1] & 0xfefe)
+            {
+                dequant_idct4x4_addblk_2x_msa(q, dq, dst + 8, stride);
+            }
+            else
+            {
+                dequant_idct_addconst_2x_msa(q, dq, dst + 8, stride);
+            }
+        }
+
+        q += 32;
+        dst += (4 * stride);
+        eobs_h += 2;
+    }
+}
+
+void vp8_dequant_idct_add_uv_block_msa(int16_t *q, int16_t *dq,
+                                       uint8_t *dstu, uint8_t *dstv,
+                                       int32_t stride, char *eobs)
+{
+    int16_t *eobs_h = (int16_t *)eobs;
+
+    if (eobs_h[0])
+    {
+        if (eobs_h[0] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstu, stride);
+        }
+    }
+
+    q += 32;
+    dstu += (stride * 4);
+
+    if (eobs_h[1])
+    {
+        if (eobs_h[1] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstu, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstu, stride);
+        }
+    }
+
+    q += 32;
+
+    if (eobs_h[2])
+    {
+        if (eobs_h[2] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstv, stride);
+        }
+    }
+
+    q += 32;
+    dstv += (stride * 4);
+
+    if (eobs_h[3])
+    {
+        if (eobs_h[3] & 0xfefe)
+        {
+            dequant_idct4x4_addblk_2x_msa(q, dq, dstv, stride);
+        }
+        else
+        {
+            dequant_idct_addconst_2x_msa(q, dq, dstv, stride);
+        }
+    }
+}
diff --git a/vp8/common/mips/msa/vp8_macros_msa.h b/vp8/common/mips/msa/vp8_macros_msa.h
new file mode 100644
index 0000000..345a62f
--- /dev/null
+++ b/vp8/common/mips/msa/vp8_macros_msa.h
@@ -0,0 +1,515 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+#define VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
+#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
+
+#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
+#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+/* Description : Load vectors with 16 byte elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Load 16 byte elements in 'out0' from (psrc)
+                 Load 16 byte elements in 'out1' from (psrc + stride)
+*/
+#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
+{                                               \
+    out0 = LD_B(RTYPE, (psrc));                 \
+    out1 = LD_B(RTYPE, (psrc) + stride);        \
+}
+#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
+
+#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)   \
+{                                                            \
+    LD_B2(RTYPE, (psrc), stride, out0, out1);                \
+    LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);  \
+}
+#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
+
+/* Description : Load vectors with 8 halfword elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load 8 halfword elements in 'out0' from (psrc)
+                 Load 8 halfword elements in 'out1' from (psrc + stride)
+*/
+#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
+{                                               \
+    out0 = LD_H(RTYPE, (psrc));                 \
+    out1 = LD_H(RTYPE, (psrc) + (stride));      \
+}
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
+
+#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
+{                                                           \
+    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
+    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
+}
+#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
+{                                             \
+    ST_B(RTYPE, in0, (pdst));                 \
+    ST_B(RTYPE, in1, (pdst) + stride);        \
+}
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
+{                                                         \
+    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
+    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
+{                                             \
+    ST_H(RTYPE, in0, (pdst));                 \
+    ST_H(RTYPE, in1, (pdst) + stride);        \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)   \
+{                                                                      \
+    out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);  \
+    out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);  \
+}
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in)                               \
+({                                                      \
+    v8i16 max_m = __msa_ldi_h(255);                     \
+    v8i16 out_m;                                        \
+                                                        \
+    out_m = __msa_maxi_s_h((v8i16)in, 0);               \
+    out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
+    out_m;                                              \
+})
+#define CLIP_SH2_0_255(in0, in1)  \
+{                                 \
+    in0 = CLIP_SH_0_255(in0);     \
+    in1 = CLIP_SH_0_255(in1);     \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
+{                                           \
+    CLIP_SH2_0_255(in0, in1);               \
+    CLIP_SH2_0_255(in2, in3);               \
+}
+
+/* Description : Clips all signed word elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed word
+*/
+#define CLIP_SW_0_255(in)                               \
+({                                                      \
+    v4i32 max_m = __msa_ldi_w(255);                     \
+    v4i32 out_m;                                        \
+                                                        \
+    out_m = __msa_maxi_s_w((v4i32)in, 0);               \
+    out_m = __msa_min_s_w((v4i32)max_m, (v4i32)out_m);  \
+    out_m;                                              \
+})
+
+/* Description : Interleave left half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);  \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);  \
+}
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);  \
+}
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);  \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3)                         \
+{                                                               \
+    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);  \
+}
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of double word elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)       \
+{                                                            \
+    out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
+    out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave both left and right half of input vectors
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements from 'in0' and 'in1' are
+                 interleaved and written to 'out0'; left halves to 'out1'
+*/
+#define ILVRL_H2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
+}
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
+#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
+
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1)            \
+{                                                        \
+    out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+    out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+}
+#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);  \
+    out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);  \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);  \
+    out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);  \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3)                         \
+{                                                                \
+    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
+    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double elements of 'in0' are copied to the left half of
+                 'out0' & even double elements of 'in1' are copied to the right
+                 half of 'out0'.
+*/
+#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);  \
+    out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);  \
+}
+#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
+
+/* Description : Pack odd double word elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Odd double word elements of 'in0' are copied to the left half
+                 of 'out0' & odd double word elements of 'in1' are copied to
+                 the right half of 'out0'.
+*/
+#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
+{                                                         \
+    out0 = (RTYPE)__msa_pckod_d((v2i64)in0, (v2i64)in1);  \
+    out1 = (RTYPE)__msa_pckod_d((v2i64)in2, (v2i64)in3);  \
+}
+#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
+#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
+
+/* Description : Arithmetic shift right all elements of vector
+                 (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift)  \
+{                                          \
+    in0 = in0 >> shift;                    \
+    in1 = in1 >> shift;                    \
+    in2 = in2 >> shift;                    \
+    in3 = in3 >> shift;                    \
+}
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'. The last discarded bit is added to the
+                 shifted value for rounding and the result is written in-place.
+                 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift)            \
+{                                                   \
+    in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
+    in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift)            \
+{                                                   \
+    in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
+    in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
+}
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
+{                                                   \
+    SRARI_W2(RTYPE, in0, in1, shift);               \
+    SRARI_W2(RTYPE, in2, in3, shift);               \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element from 'in0' is multiplied with elements from 'in1'
+                 and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 * in1;                         \
+    out1 = in2 * in3;                         \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    MUL2(in0, in1, in2, in3, out0, out1);             \
+    MUL2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Addition of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in0' is added to 'in1' and result is written
+                 to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1)  \
+{                                             \
+    out0 = in0 + in1;                         \
+    out1 = in2 + in3;                         \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3)                  \
+{                                                     \
+    ADD2(in0, in1, in2, in3, out0, out1);             \
+    ADD2(in4, in5, in6, in7, out2, out3);             \
+}
+
+/* Description : Sign extend halfword elements from input vector and return
+                 the result in pair of vectors
+   Arguments   : Input   - in            (halfword vector)
+                 Outputs - out0, out1   (sign extended word vectors)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved right with same vector 'in' to
+                 generate 4 signed word elements in 'out0'
+                 Then interleaved left with same vector 'in' to
+                 generate 4 signed word elements in 'out1'
+*/
+#define UNPCK_SH_SW(in, out0, out1)        \
+{                                          \
+    v8i16 tmp_m;                           \
+                                           \
+    tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
+    ILVRL_H2_SW(tmp_m, in, out0, out1);    \
+}
+
+/* Description : Butterfly of 4 input vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+   Details     : Butterfly operation
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                \
+    out0 = in0 + in3;                                            \
+    out1 = in1 + in2;                                            \
+                                                                 \
+    out2 = in1 - in2;                                            \
+    out3 = in0 - in3;                                            \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+                                                                        \
+    ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                     \
+    ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                     \
+    ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);             \
+    ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);             \
+}
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
+{                                                                       \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
+                                                                        \
+    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
+    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
+                                                                        \
+    out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);               \
+    out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);               \
+    out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);               \
+    out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);               \
+}
+#endif  /* VP8_COMMON_MIPS_MSA_VP8_MACROS_MSA_H_ */
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index fed2088..db38222 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -29,21 +29,21 @@
 # Dequant
 #
 add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc";
-specialize qw/vp8_dequantize_b mmx media neon/;
+specialize qw/vp8_dequantize_b mmx media neon msa/;
 $vp8_dequantize_b_media=vp8_dequantize_b_v6;
 
 add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride";
-specialize qw/vp8_dequant_idct_add mmx media neon dspr2/;
+specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/;
 $vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6;
 $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2/;
+specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/;
 $vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2/;
+specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/;
 $vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6;
 $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2;
 
@@ -108,7 +108,7 @@
 #
 #idct16
 add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride";
-specialize qw/vp8_short_idct4x4llm mmx media neon dspr2/;
+specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/;
 $vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual;
 $vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2;
 
@@ -120,13 +120,13 @@
 
 #iwalsh16
 add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output";
-specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2/;
+specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/;
 $vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6;
 $vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2;
 
 #idct1_scalar_add
 add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2/;
+specialize qw/vp8_dc_only_idct_add	mmx media neon dspr2 msa/;
 $vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6;
 $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2;
 
@@ -238,47 +238,6 @@
 $vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6;
 
 #
-# Sub-pixel Variance
-#
-add_proto qw/unsigned int vp8_sub_pixel_variance4x4/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance4x4 mmx sse2/;
-$vp8_sub_pixel_variance4x4_sse2=vp8_sub_pixel_variance4x4_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x8 mmx sse2 media/;
-$vp8_sub_pixel_variance8x8_sse2=vp8_sub_pixel_variance8x8_wmt;
-$vp8_sub_pixel_variance8x8_media=vp8_sub_pixel_variance8x8_armv6;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance8x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance8x16 mmx sse2/;
-$vp8_sub_pixel_variance8x16_sse2=vp8_sub_pixel_variance8x16_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x8/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x8 mmx sse2 ssse3/;
-$vp8_sub_pixel_variance16x8_sse2=vp8_sub_pixel_variance16x8_wmt;
-
-add_proto qw/unsigned int vp8_sub_pixel_variance16x16/, "const unsigned char  *src_ptr, int  source_stride, int  xoffset, int  yoffset, const unsigned char *ref_ptr, int Refstride, unsigned int *sse";
-specialize qw/vp8_sub_pixel_variance16x16 mmx sse2 ssse3 media neon_asm/;
-$vp8_sub_pixel_variance16x16_sse2=vp8_sub_pixel_variance16x16_wmt;
-$vp8_sub_pixel_variance16x16_media=vp8_sub_pixel_variance16x16_armv6;
-$vp8_sub_pixel_variance16x16_neon_asm=vp8_sub_pixel_variance16x16_neon;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_h mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_h_sse2=vp8_variance_halfpixvar16x16_h_wmt;
-$vp8_variance_halfpixvar16x16_h_media=vp8_variance_halfpixvar16x16_h_armv6;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_v mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_v_sse2=vp8_variance_halfpixvar16x16_v_wmt;
-$vp8_variance_halfpixvar16x16_v_media=vp8_variance_halfpixvar16x16_v_armv6;
-
-add_proto qw/unsigned int vp8_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned int *sse";
-specialize qw/vp8_variance_halfpixvar16x16_hv mmx sse2 media neon/;
-$vp8_variance_halfpixvar16x16_hv_sse2=vp8_variance_halfpixvar16x16_hv_wmt;
-$vp8_variance_halfpixvar16x16_hv_media=vp8_variance_halfpixvar16x16_hv_armv6;
-
-#
 # Encoder functions below this point.
 #
 if (vpx_config("CONFIG_VP8_ENCODER") eq "yes") {
@@ -343,15 +302,6 @@
 specialize qw/vp8_mbuverror mmx sse2/;
 $vp8_mbuverror_sse2=vp8_mbuverror_xmm;
 
-add_proto qw/void vp8_subtract_b/, "struct block *be, struct blockd *bd, int pitch";
-specialize qw/vp8_subtract_b mmx sse2 neon/;
-
-add_proto qw/void vp8_subtract_mby/, "short *diff, unsigned char *src, int src_stride, unsigned char *pred, int pred_stride";
-specialize qw/vp8_subtract_mby mmx sse2 neon/;
-
-add_proto qw/void vp8_subtract_mbuv/, "short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride, unsigned char *upred, unsigned char *vpred, int pred_stride";
-specialize qw/vp8_subtract_mbuv mmx sse2 neon/;
-
 #
 # Motion search
 #
diff --git a/vp8/common/variance.h b/vp8/common/variance.h
deleted file mode 100644
index c6c9f41..0000000
--- a/vp8/common/variance.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP8_COMMON_VARIANCE_H_
-#define VP8_COMMON_VARIANCE_H_
-
-#include "vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vpx_sad_fn_t)(
-    const uint8_t *src_ptr,
-    int source_stride,
-    const uint8_t *ref_ptr,
-    int ref_stride);
-
-typedef void (*vp8_copy32xn_fn_t)(
-    const unsigned char *src_ptr,
-    int source_stride,
-    unsigned char *ref_ptr,
-    int ref_stride,
-    int n);
-
-typedef void (*vpx_sad_multi_fn_t)(
-    const unsigned char *src_ptr,
-    int source_stride,
-    const unsigned char *ref_array,
-    int  ref_stride,
-    unsigned int *sad_array);
-
-typedef void (*vpx_sad_multi_d_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char * const ref_array[],
-     int  ref_stride,
-     unsigned int *sad_array
-    );
-
-typedef unsigned int (*vpx_variance_fn_t)
-    (
-     const unsigned char *src_ptr,
-     int source_stride,
-     const unsigned char *ref_ptr,
-     int  ref_stride,
-     unsigned int *sse
-    );
-
-typedef unsigned int (*vp8_subpixvariance_fn_t)
-    (
-      const unsigned char  *src_ptr,
-      int  source_stride,
-      int  xoffset,
-      int  yoffset,
-      const unsigned char *ref_ptr,
-      int Refstride,
-      unsigned int *sse
-    );
-
-typedef struct variance_vtable
-{
-    vpx_sad_fn_t            sdf;
-    vpx_variance_fn_t       vf;
-    vp8_subpixvariance_fn_t svf;
-    vpx_variance_fn_t       svf_halfpix_h;
-    vpx_variance_fn_t       svf_halfpix_v;
-    vpx_variance_fn_t       svf_halfpix_hv;
-    vpx_sad_multi_fn_t      sdx3f;
-    vpx_sad_multi_fn_t      sdx8f;
-    vpx_sad_multi_d_fn_t    sdx4df;
-#if ARCH_X86 || ARCH_X86_64
-    vp8_copy32xn_fn_t       copymem;
-#endif
-} vp8_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP8_COMMON_VARIANCE_H_
diff --git a/vp8/common/variance_c.c b/vp8/common/variance_c.c
deleted file mode 100644
index 02915a4..0000000
--- a/vp8/common/variance_c.c
+++ /dev/null
@@ -1,337 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "filter.h"
-#include "variance.h"
-
-/* This is a bad idea.
- * ctz = count trailing zeros */
-static int ctz(int a) {
-  int b = 0;
-  while (a != 1) {
-    a >>= 1;
-    b++;
-  }
-  return b;
-}
-
-static unsigned int variance(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    int  w,
-    int  h,
-    unsigned int *sse)
-{
-    int i, j;
-    int diff, sum;
-
-    sum = 0;
-    *sse = 0;
-
-    for (i = 0; i < h; i++)
-    {
-        for (j = 0; j < w; j++)
-        {
-            diff = src_ptr[j] - ref_ptr[j];
-            sum += diff;
-            *sse += diff * diff;
-        }
-
-        src_ptr += source_stride;
-        ref_ptr += recon_stride;
-    }
-
-    return (*sse - (((unsigned int)sum * sum) >> (int)((ctz(w) + ctz(h)))));
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_first_pass
- *
- *  INPUTS        : UINT8  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : INT32 *output_ptr        : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement first-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Produces INT32 output to retain precision for next pass.
- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_first_pass
-(
-    const unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int i, j;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply bilinear filter */
-            output_ptr[j] = (((int)src_ptr[0]          * vp8_filter[0]) +
-                             ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                             (VP8_FILTER_WEIGHT / 2)) >> VP8_FILTER_SHIFT;
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-/****************************************************************************
- *
- *  ROUTINE       : filter_block2d_bil_second_pass
- *
- *  INPUTS        : INT32  *src_ptr          : Pointer to source block.
- *                  UINT32 src_pixels_per_line : Stride of input block.
- *                  UINT32 pixel_step        : Offset between filter input samples (see notes).
- *                  UINT32 output_height     : Input block height.
- *                  UINT32 output_width      : Input block width.
- *                  INT32  *vp8_filter          : Array of 2 bi-linear filter taps.
- *
- *  OUTPUTS       : UINT16 *output_ptr       : Pointer to filtered block.
- *
- *  RETURNS       : void
- *
- *  FUNCTION      : Applies a 1-D 2-tap bi-linear filter to the source block in
- *                  either horizontal or vertical direction to produce the
- *                  filtered output block. Used to implement second-pass
- *                  of 2-D separable filter.
- *
- *  SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- *                  Two filter taps should sum to VP8_FILTER_WEIGHT.
- *                  pixel_step defines whether the filter is applied
- *                  horizontally (pixel_step=1) or vertically (pixel_step=stride).
- *                  It defines the offset required to move from one input
- *                  to the next.
- *
- ****************************************************************************/
-static void var_filter_block2d_bil_second_pass
-(
-    const unsigned short *src_ptr,
-    unsigned char  *output_ptr,
-    unsigned int  src_pixels_per_line,
-    unsigned int  pixel_step,
-    unsigned int  output_height,
-    unsigned int  output_width,
-    const short *vp8_filter
-)
-{
-    unsigned int  i, j;
-    int  Temp;
-
-    for (i = 0; i < output_height; i++)
-    {
-        for (j = 0; j < output_width; j++)
-        {
-            /* Apply filter */
-            Temp = ((int)src_ptr[0]          * vp8_filter[0]) +
-                   ((int)src_ptr[pixel_step] * vp8_filter[1]) +
-                   (VP8_FILTER_WEIGHT / 2);
-            output_ptr[j] = (unsigned int)(Temp >> VP8_FILTER_SHIFT);
-            src_ptr++;
-        }
-
-        /* Next row... */
-        src_ptr    += src_pixels_per_line - output_width;
-        output_ptr += output_width;
-    }
-}
-
-
-unsigned int vp8_sub_pixel_variance4x4_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-    unsigned short FData3[5*4]; /* Temp data bufffer used in filtering */
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    /* First filter 1d Horizontal */
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 5, 4, HFilter);
-
-    /* Now filter Verticaly */
-    var_filter_block2d_bil_second_pass(FData3, temp2, 4,  4,  4,  4, VFilter);
-
-    return variance(temp2, 4, dst_ptr, dst_pixels_per_line, 4, 4, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[9*8]; /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 8, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 8, 8, VFilter);
-
-    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 8, sse);
-}
-
-unsigned int vp8_sub_pixel_variance16x16_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[17*16];   /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 16, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 16, 16, VFilter);
-
-    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 16, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 0,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 0, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_c(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_c(src_ptr, source_stride, 4, 4,
-                                         ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_sub_pixel_variance16x8_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[16*9];    /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 9, 16, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 16, 16, 8, 16, VFilter);
-
-    return variance(temp2, 16, dst_ptr, dst_pixels_per_line, 16, 8, sse);
-}
-
-unsigned int vp8_sub_pixel_variance8x16_c
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    unsigned short FData3[9*16];    /* Temp data bufffer used in filtering */
-    unsigned char  temp2[20*16];
-    const short *HFilter, *VFilter;
-
-
-    HFilter = vp8_bilinear_filters[xoffset];
-    VFilter = vp8_bilinear_filters[yoffset];
-
-
-    var_filter_block2d_bil_first_pass(src_ptr, FData3, src_pixels_per_line, 1, 17, 8, HFilter);
-    var_filter_block2d_bil_second_pass(FData3, temp2, 8, 8, 16, 8, VFilter);
-
-    return variance(temp2, 8, dst_ptr, dst_pixels_per_line, 8, 16, sse);
-}
diff --git a/vp8/common/x86/variance_impl_sse2.asm b/vp8/common/x86/variance_impl_sse2.asm
deleted file mode 100644
index 26de5e8..0000000
--- a/vp8/common/x86/variance_impl_sse2.asm
+++ /dev/null
@@ -1,972 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-;void vp8_filter_block2d_bil_var_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared;;
-;
-;)
-global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp8_filter_block2d_bil_var_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    push rbx
-    ; end prolog
-
-        pxor            xmm6,           xmm6                 ;
-        pxor            xmm7,           xmm7                 ;
-
-        lea             rsi,            [GLOBAL(xmm_bi_rd)]  ; rounding
-        movdqa          xmm4,           XMMWORD PTR [rsi]
-
-        lea             rcx,            [GLOBAL(vp8_bilinear_filters_sse2)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              filter_block2d_bil_var_sse2_sp_only
-
-        shl             rax,            5                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              filter_block2d_bil_var_sse2_fp_only
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        movq            xmm3,           QWORD PTR [rsi+1]    ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]                ;
-        punpcklbw       xmm3,           xmm0
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift     ;
-        movdqa          xmm5,           xmm1
-
-        movsxd          rbx,            dword ptr arg(1) ;ref_pixels_per_line
-        lea             rsi,            [rsi + rbx]
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        movq            xmm3,           QWORD PTR [rsi+1]             ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4               ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movdqa          xmm3,           xmm5                 ;
-        movdqa          xmm5,           xmm1                 ;
-
-        pmullw          xmm3,           [rdx]               ;
-        pmullw          xmm1,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rbx]          ;ref_pixels_per_line
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_var_sse2_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip all if both xoffset=0 and yoffset=0
-        je              filter_block2d_bil_var_sse2_full_pixel
-
-        shl             rdx,            5
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movq            xmm1,           QWORD PTR [rsi]      ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        lea             rsi,            [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
-        movq            xmm3,           QWORD PTR [rsi]             ;
-        punpcklbw       xmm3,           xmm0                 ;
-        movdqa          xmm5,           xmm3
-
-        pmullw          xmm1,           [rdx]               ;
-        pmullw          xmm3,           [rdx+16]             ;
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4                 ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        movdqa          xmm1,           xmm5                 ;
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_sp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0                 ;
-
-filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]               ;
-        punpcklbw       xmm1,           xmm0                 ;
-
-        movq            xmm2,           QWORD PTR [rdi]               ;
-        punpcklbw       xmm2,           xmm0                 ;
-
-        psubw           xmm1,           xmm2                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_full_pixel_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                 ;
-        movsxd          rbx,            dword ptr arg(3)     ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
-        movq            xmm1,           QWORD PTR [rsi]       ;
-        movq            xmm3,           QWORD PTR [rsi+1]     ;
-
-        punpcklbw       xmm1,           xmm0                 ;
-        pmullw          xmm1,           [rax]               ;
-        punpcklbw       xmm3,           xmm0                 ;
-        pmullw          xmm3,           [rax+16]             ;
-
-        paddw           xmm1,           xmm3                 ;
-        paddw           xmm1,           xmm4  ;
-        psraw           xmm1,           xmm_filter_shift    ;
-
-        movq            xmm3,           QWORD PTR [rdi]     ;
-        punpcklbw       xmm3,           xmm0                 ;
-
-        psubw           xmm1,           xmm3                 ;
-        paddw           xmm6,           xmm1                 ;
-
-        pmaddwd         xmm1,           xmm1                 ;
-        paddd           xmm7,           xmm1                 ;
-        lea             rsi,            [rsi + rdx]
-        lea             rdi,            [rdi + rbx]          ;src_pixels_per_line
-
-        sub             rcx,            1                   ;
-        jnz             filter_block2d_bil_fp_only_loop       ;
-
-        jmp             filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(7) ; sum
-        mov             rdi,            arg(8) ; sumsquared
-
-        movd            [rsi],          mm2    ; xsum
-        movd            [rdi],          mm4    ; xxsum
-
-    ; begin epilog
-    pop rbx
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_half_horiz_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-%else
-        add             rsi, r8
-%endif
-
-vp8_half_horiz_vert_variance8x_h_1:
-
-        movq            xmm1,           QWORD PTR [rsi]     ;
-        movq            xmm2,           QWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance8x_h_1     ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_half_horiz_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3) horizontal line 1
-
-        lea             rsi,            [rsi + rax]
-
-vp8_half_horiz_vert_variance16x_h_1:
-        movdqu          xmm1,           XMMWORD PTR [rsi]     ;
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]   ;
-        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm3) horizontal line i+1
-
-        pavgb           xmm5,           xmm1                ;  xmm = vertical average of the above
-
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-
-        movq            xmm3,           QWORD PTR [rdi+8]
-        punpcklbw       xmm3,           xmm0
-        psubw           xmm4,           xmm3
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm1                ;  save xmm1 for use on the next row
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_vert_variance16x_h_1     ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_half_vert_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-vp8_half_vert_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+rax] ;  xmm3 = s1,s2,s3..s9
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_vert_variance8x_h_1          ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_half_vert_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
-sym(vp8_half_vert_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0)              ;ref_ptr
-
-        mov             rdi,            arg(2)              ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)    ;Height
-        movsxd          rax,            dword ptr arg(1)    ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        movdqu          xmm5,           XMMWORD PTR [rsi]
-        lea             rsi,            [rsi + rax          ]
-        pxor            xmm0,           xmm0
-
-vp8_half_vert_variance16x_h_1:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm4,           xmm5
-        punpcklbw       xmm5,           xmm0
-        punpckhbw       xmm4,           xmm0
-
-        movq            xmm2,           QWORD PTR [rdi]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm5,           xmm2
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-        psubw           xmm4,           xmm2
-
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm4
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm4,           xmm4
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm4
-
-        movdqa          xmm5,           xmm3
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1
-        jnz             vp8_half_vert_variance16x_h_1
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_half_horiz_variance8x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance8x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-%if ABI_IS_32BIT=0
-    movsxd          r8, dword ptr arg(1) ;ref_pixels_per_line
-    movsxd          r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            xmm0,           xmm0                ;
-vp8_half_horiz_variance8x_h_1:
-        movq            xmm5,           QWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s8
-        movq            xmm3,           QWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s9
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d8
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-
-%if ABI_IS_32BIT
-        add             esi,            dword ptr arg(1) ;ref_pixels_per_line    ;  next source
-        add             edi,            dword ptr arg(3) ;src_pixels_per_line    ;  next destination
-%else
-        add             rsi, r8
-        add             rdi, r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance8x_h_1        ;
-
-        movdq2q         mm6,            xmm6                ;
-        movdq2q         mm7,            xmm7                ;
-
-        psrldq          xmm6,           8
-        psrldq          xmm7,           8
-
-        movdq2q         mm2,            xmm6
-        movdq2q         mm3,            xmm7
-
-        paddw           mm6,            mm2
-        paddd           mm7,            mm3
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rsi,            arg(5) ; sum
-        mov             rdi,            arg(6) ; sumsquared
-
-        movd            [rsi],          mm2                 ;
-        movd            [rdi],          mm4                 ;
-
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_half_horiz_variance16x_h_sse2
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
-sym(vp8_half_horiz_variance16x_h_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6                ;  error accumulator
-        pxor            xmm7,           xmm7                ;  sse eaccumulator
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)    ;src_pixels_per_line
-
-        pxor            xmm0,           xmm0                ;
-
-vp8_half_horiz_variance16x_h_1:
-        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
-        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16
-
-        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm1,xmm3)
-        movdqa          xmm1,           xmm5
-        punpcklbw       xmm5,           xmm0                ;  xmm5 = words of above
-        punpckhbw       xmm1,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
-        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
-        movq            xmm2,           QWORD PTR [rdi+8]
-        punpcklbw       xmm2,           xmm0
-
-        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
-        psubw           xmm1,           xmm2
-        paddw           xmm6,           xmm5                ;  xmm6 += accumulated column differences
-        paddw           xmm6,           xmm1
-        pmaddwd         xmm5,           xmm5                ;  xmm5 *= xmm5
-        pmaddwd         xmm1,           xmm1
-        paddd           xmm7,           xmm5                ;  xmm7 += accumulated square column differences
-        paddd           xmm7,           xmm1
-
-        lea             rsi,            [rsi + rax]
-        lea             rdi,            [rdi + rdx]
-
-        sub             rcx,            1                   ;
-        jnz             vp8_half_horiz_variance16x_h_1        ;
-
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(5) ;[Sum]
-        mov         rdi,            arg(6) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-vp8_bilinear_filters_sse2:
-    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
-    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
-    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
-    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
-    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
-    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
-    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
-    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
diff --git a/vp8/common/x86/variance_impl_ssse3.asm b/vp8/common/x86/variance_impl_ssse3.asm
deleted file mode 100644
index 686b4a9..0000000
--- a/vp8/common/x86/variance_impl_ssse3.asm
+++ /dev/null
@@ -1,364 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift            7
-
-
-;void vp8_filter_block2d_bil_var_ssse3
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    int  xoffset,
-;    int  yoffset,
-;    int *sum,
-;    unsigned int *sumsquared;;
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for Pmaddubsw is signed bytes, we must calculate zero offset seperately.
-global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp8_filter_block2d_bil_var_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        pxor            xmm6,           xmm6
-        pxor            xmm7,           xmm7
-
-        lea             rcx,            [GLOBAL(vp8_bilinear_filters_ssse3)]
-        movsxd          rax,            dword ptr arg(5)     ; xoffset
-
-        cmp             rax,            0                    ; skip first_pass filter if xoffset=0
-        je              .filter_block2d_bil_var_ssse3_sp_only
-
-        shl             rax,            4                    ; point to filter coeff with xoffset
-        lea             rax,            [rax + rcx]          ; HFilter
-
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; skip second_pass filter if yoffset=0
-        je              .filter_block2d_bil_var_ssse3_fp_only
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-
-        movdqu          xmm0,           XMMWORD PTR [rsi]
-        movdqu          xmm1,           XMMWORD PTR [rsi+1]
-        movdqa          xmm2,           xmm0
-
-        punpcklbw       xmm0,           xmm1
-        punpckhbw       xmm2,           xmm1
-        pmaddubsw       xmm0,           [rax]
-        pmaddubsw       xmm2,           [rax]
-
-        paddw           xmm0,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm0,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        packuswb        xmm0,           xmm2
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        lea             rsi,            [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-        packuswb        xmm1,           xmm3
-
-        movdqa          xmm2,           xmm0
-        movdqa          xmm0,           xmm1
-        movdqa          xmm3,           xmm2
-
-        punpcklbw       xmm2,           xmm1
-        punpckhbw       xmm3,           xmm1
-        pmaddubsw       xmm2,           [rdx]
-        pmaddubsw       xmm3,           [rdx]
-
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm2,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm1,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm1,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm2,           xmm1
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm2
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm2,           xmm2
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm2
-        paddd           xmm7,           xmm3
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1)     ;ref_pixels_per_line
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rsi,            [rsi + r8]
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_var_ssse3_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
-        movsxd          rdx,            dword ptr arg(6)     ; yoffset
-
-        cmp             rdx,            0                    ; Both xoffset =0 and yoffset=0
-        je              .filter_block2d_bil_var_ssse3_full_pixel
-
-        shl             rdx,            4
-        lea             rdx,            [rdx + rcx]          ; VFilter
-
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqa          xmm0,           xmm1
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-        lea             rsi,            [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
-        movdqu          xmm3,           XMMWORD PTR [rsi]
-        movdqa          xmm2,           xmm1
-        movdqa          xmm0,           xmm3
-
-        punpcklbw       xmm1,           xmm3
-        punpckhbw       xmm2,           xmm3
-        pmaddubsw       xmm1,           [rdx]
-        pmaddubsw       xmm2,           [rdx]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm2,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm2,           xmm_filter_shift
-
-        movq            xmm3,           QWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm3,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        movdqa          xmm1,           xmm0
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_sp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rax,            dword ptr arg(1)     ;ref_pixels_per_line
-        movsxd          rdx,            dword ptr arg(3)     ;src_pixels_per_line
-        pxor            xmm0,           xmm0
-
-.filter_block2d_bil_full_pixel_loop:
-        movq            xmm1,           QWORD PTR [rsi]
-        punpcklbw       xmm1,           xmm0
-        movq            xmm2,           QWORD PTR [rsi+8]
-        punpcklbw       xmm2,           xmm0
-
-        movq            xmm3,           QWORD PTR [rdi]
-        punpcklbw       xmm3,           xmm0
-        movq            xmm4,           QWORD PTR [rdi+8]
-        punpcklbw       xmm4,           xmm0
-
-        psubw           xmm1,           xmm3
-        psubw           xmm2,           xmm4
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm2
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm2,           xmm2
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm2
-
-        lea             rsi,            [rsi + rax]          ;ref_pixels_per_line
-        lea             rdi,            [rdi + rdx]          ;src_pixels_per_line
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_full_pixel_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
-        mov             rsi,            arg(0)               ;ref_ptr
-        mov             rdi,            arg(2)               ;src_ptr
-        movsxd          rcx,            dword ptr arg(4)     ;Height
-        movsxd          rdx,            dword ptr arg(1)     ;ref_pixels_per_line
-
-        pxor            xmm0,           xmm0
-
-%if ABI_IS_32BIT=0
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
-        movdqu          xmm1,           XMMWORD PTR [rsi]
-        movdqu          xmm2,           XMMWORD PTR [rsi+1]
-        movdqa          xmm3,           xmm1
-
-        punpcklbw       xmm1,           xmm2
-        punpckhbw       xmm3,           xmm2
-        pmaddubsw       xmm1,           [rax]
-        pmaddubsw       xmm3,           [rax]
-
-        paddw           xmm1,           [GLOBAL(xmm_bi_rd)]
-        paddw           xmm3,           [GLOBAL(xmm_bi_rd)]
-        psraw           xmm1,           xmm_filter_shift
-        psraw           xmm3,           xmm_filter_shift
-
-        movq            xmm2,           XMMWORD PTR [rdi]
-        pxor            xmm4,           xmm4
-        punpcklbw       xmm2,           xmm4
-        movq            xmm5,           QWORD PTR [rdi+8]
-        punpcklbw       xmm5,           xmm4
-
-        psubw           xmm1,           xmm2
-        psubw           xmm3,           xmm5
-        paddw           xmm6,           xmm1
-        paddw           xmm6,           xmm3
-        pmaddwd         xmm1,           xmm1
-        pmaddwd         xmm3,           xmm3
-        paddd           xmm7,           xmm1
-        paddd           xmm7,           xmm3
-
-        lea             rsi,            [rsi + rdx]
-%if ABI_IS_32BIT
-        add             rdi,            dword ptr arg(3)     ;src_pixels_per_line
-%else
-        lea             rdi,            [rdi + r9]
-%endif
-
-        sub             rcx,            1
-        jnz             .filter_block2d_bil_fp_only_loop
-
-        jmp             .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
-        pxor        xmm0,           xmm0
-        pxor        xmm1,           xmm1
-        pxor        xmm5,           xmm5
-
-        punpcklwd   xmm0,           xmm6
-        punpckhwd   xmm1,           xmm6
-        psrad       xmm0,           16
-        psrad       xmm1,           16
-        paddd       xmm0,           xmm1
-        movdqa      xmm1,           xmm0
-
-        movdqa      xmm6,           xmm7
-        punpckldq   xmm6,           xmm5
-        punpckhdq   xmm7,           xmm5
-        paddd       xmm6,           xmm7
-
-        punpckldq   xmm0,           xmm5
-        punpckhdq   xmm1,           xmm5
-        paddd       xmm0,           xmm1
-
-        movdqa      xmm7,           xmm6
-        movdqa      xmm1,           xmm0
-
-        psrldq      xmm7,           8
-        psrldq      xmm1,           8
-
-        paddd       xmm6,           xmm7
-        paddd       xmm0,           xmm1
-
-        mov         rsi,            arg(7) ;[Sum]
-        mov         rdi,            arg(8) ;[SSE]
-
-        movd        [rsi],       xmm0
-        movd        [rdi],       xmm6
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
-    times 8 dw 64
-align 16
-vp8_bilinear_filters_ssse3:
-    times 8 db 128, 0
-    times 8 db 112, 16
-    times 8 db 96,  32
-    times 8 db 80,  48
-    times 8 db 64,  64
-    times 8 db 48,  80
-    times 8 db 32,  96
-    times 8 db 16,  112
diff --git a/vp8/common/x86/variance_ssse3.c b/vp8/common/x86/variance_ssse3.c
deleted file mode 100644
index 2a0df64..0000000
--- a/vp8/common/x86/variance_ssse3.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-
-extern void vp8_half_horiz_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_half_horiz_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_half_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_ssse3
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int  xoffset,
-    int  yoffset,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance16x16_ssse3
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    /* note we could avoid these if statements if the calling function
-     * just called the appropriate functions inside.
-     */
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_ssse3(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum0, &xxsum0);
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_ssse3
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-
-)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_ssse3(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum0, &xxsum0);
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
diff --git a/vp8/common/x86/vp8_variance_impl_mmx.asm b/vp8/common/x86/vp8_variance_impl_mmx.asm
deleted file mode 100644
index 97f2527..0000000
--- a/vp8/common/x86/vp8_variance_impl_mmx.asm
+++ /dev/null
@@ -1,353 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define mmx_filter_shift            7
-
-;void vp8_filter_block2d_bil4x4_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil4x4_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-
-        mov             rax,            arg(4) ;HFilter             ;
-        mov             rdx,            arg(5) ;VFilter             ;
-
-        mov             rsi,            arg(0) ;ref_ptr              ;
-        mov             rdi,            arg(2) ;src_ptr              ;
-
-        mov             rcx,            4                   ;
-        pxor            mm0,            mm0                 ;
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-%if ABI_IS_32BIT
-        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
-%else
-        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
-        add             rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
-        movd            mm1,            [rsi]               ;
-        movd            mm3,            [rsi+1]             ;
-
-        punpcklbw       mm1,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        movq            mm3,            mm5                 ;
-
-        movq            mm5,            mm1                 ;
-        pmullw          mm3,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        paddw           mm1,            mm3                 ;
-
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        movd            mm3,            [rdi]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        paddw           mm6,            mm1                 ;
-
-        pmaddwd         mm1,            mm1                 ;
-        paddd           mm7,            mm1                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(6) ;sum
-        mov             rsi,            arg(7) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
-;void vp8_filter_block2d_bil_var_mmx
-;(
-;    unsigned char *ref_ptr,
-;    int ref_pixels_per_line,
-;    unsigned char *src_ptr,
-;    int src_pixels_per_line,
-;    unsigned int Height,
-;    unsigned short *HFilter,
-;    unsigned short *VFilter,
-;    int *sum,
-;    unsigned int *sumsquared
-;)
-global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp8_filter_block2d_bil_var_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 9
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    sub         rsp, 16
-    ; end prolog
-
-        pxor            mm6,            mm6                 ;
-        pxor            mm7,            mm7                 ;
-        mov             rax,            arg(5) ;HFilter             ;
-
-        mov             rdx,            arg(6) ;VFilter             ;
-        mov             rsi,            arg(0) ;ref_ptr              ;
-
-        mov             rdi,            arg(2) ;src_ptr              ;
-        movsxd          rcx,            dword ptr arg(4) ;Height              ;
-
-        pxor            mm0,            mm0                 ;
-        movq            mm1,            [rsi]               ;
-
-        movq            mm3,            [rsi+1]             ;
-        movq            mm2,            mm1                 ;
-
-        movq            mm4,            mm3                 ;
-        punpcklbw       mm1,            mm0                 ;
-
-        punpckhbw       mm2,            mm0                 ;
-        pmullw          mm1,            [rax]               ;
-
-        pmullw          mm2,            [rax]               ;
-        punpcklbw       mm3,            mm0                 ;
-
-        punpckhbw       mm4,            mm0                 ;
-        pmullw          mm3,            [rax+8]             ;
-
-        pmullw          mm4,            [rax+8]             ;
-        paddw           mm1,            mm3                 ;
-
-        paddw           mm2,            mm4                 ;
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm2,            mmx_filter_shift    ;
-        movq            mm5,            mm1
-
-        packuswb        mm5,            mm2                 ;
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
-        add             rsi,            r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
-        movq            mm1,            [rsi]               ;
-        movq            mm3,            [rsi+1]             ;
-
-        movq            mm2,            mm1                 ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm1,            mm0                 ;
-        punpckhbw       mm2,            mm0                 ;
-
-        pmullw          mm1,            [rax]               ;
-        pmullw          mm2,            [rax]               ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        pmullw          mm3,            [rax+8]             ;
-        pmullw          mm4,            [rax+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm1,            mmx_filter_shift    ;
-
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            mm5                 ;
-        movq            mm4,            mm5                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        movq            mm5,            mm1                 ;
-        packuswb        mm5,            mm2                 ;
-
-        pmullw          mm3,            [rdx]               ;
-        pmullw          mm4,            [rdx]               ;
-
-        pmullw          mm1,            [rdx+8]             ;
-        pmullw          mm2,            [rdx+8]             ;
-
-        paddw           mm1,            mm3                 ;
-        paddw           mm2,            mm4                 ;
-
-        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
-        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
-
-        psraw           mm1,            mmx_filter_shift    ;
-        psraw           mm2,            mmx_filter_shift    ;
-
-        movq            mm3,            [rdi]               ;
-        movq            mm4,            mm3                 ;
-
-        punpcklbw       mm3,            mm0                 ;
-        punpckhbw       mm4,            mm0                 ;
-
-        psubw           mm1,            mm3                 ;
-        psubw           mm2,            mm4                 ;
-
-        paddw           mm6,            mm1                 ;
-        pmaddwd         mm1,            mm1                 ;
-
-        paddw           mm6,            mm2                 ;
-        pmaddwd         mm2,            mm2                 ;
-
-        paddd           mm7,            mm1                 ;
-        paddd           mm7,            mm2                 ;
-
-%if ABI_IS_32BIT
-        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
-        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
-%else
-        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
-        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
-        add             rsi,            r8
-        add             rdi,            r9
-%endif
-        sub             rcx,            1                   ;
-        jnz             .filter_block2d_bil_var_mmx_loop       ;
-
-
-        pxor            mm3,            mm3                 ;
-        pxor            mm2,            mm2                 ;
-
-        punpcklwd       mm2,            mm6                 ;
-        punpckhwd       mm3,            mm6                 ;
-
-        paddd           mm2,            mm3                 ;
-        movq            mm6,            mm2                 ;
-
-        psrlq           mm6,            32                  ;
-        paddd           mm2,            mm6                 ;
-
-        psrad           mm2,            16                  ;
-        movq            mm4,            mm7                 ;
-
-        psrlq           mm4,            32                  ;
-        paddd           mm4,            mm7                 ;
-
-        mov             rdi,            arg(7) ;sum
-        mov             rsi,            arg(8) ;sumsquared
-
-        movd            dword ptr [rdi],          mm2                 ;
-        movd            dword ptr [rsi],          mm4                 ;
-
-    ; begin epilog
-    add rsp, 16
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
-    times 4 dw 64
diff --git a/vp8/common/x86/vp8_variance_mmx.c b/vp8/common/x86/vp8_variance_mmx.c
deleted file mode 100644
index e594b1e..0000000
--- a/vp8/common/x86/vp8_variance_mmx.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx
-(
-    const unsigned char *src_ptr,
-    unsigned short *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    short *filter
-);
-extern void filter_block1d_v6_mmx
-(
-    const short *src_ptr,
-    unsigned char *output_ptr,
-    unsigned int pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    short *filter
-);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    const short *HFilter,
-    const short *VFilter,
-    int *sum,
-    unsigned int *sumsquared
-);
-extern void vp8_filter_block2d_bil_var_mmx
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    const short *HFilter,
-    const short *VFilter,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance4x4_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse)
-
-{
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil4x4_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp8_sub_pixel_variance16x8_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum0, &xxsum0
-    );
-
-
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr + 8, src_pixels_per_line,
-        dst_ptr + 8, dst_pixels_per_line, 8,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum1, &xxsum1
-    );
-
-    xsum0 += xsum1;
-    xxsum0 += xxsum1;
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_mmx
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
-                                           ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
-                                           ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *sse)
-{
-    return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
-                                           ref_ptr, recon_stride, sse);
-}
diff --git a/vp8/common/x86/vp8_variance_sse2.c b/vp8/common/x86/vp8_variance_sse2.c
deleted file mode 100644
index 1c15ed8..0000000
--- a/vp8/common/x86/vp8_variance_sse2.c
+++ /dev/null
@@ -1,403 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp8_rtcd.h"
-#include "vpx_config.h"
-#include "vp8/common/variance.h"
-#include "vpx_ports/mem.h"
-#include "vp8/common/x86/filter_x86.h"
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *filter);
-
-extern void vp8_filter_block2d_bil4x4_var_mmx
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    const short *HFilter,
-    const short *VFilter,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-void vp8_filter_block2d_bil_var_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int  xoffset,
-    int  yoffset,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance8x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_variance8x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_horiz_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_vert_variance8x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-void vp8_half_vert_variance16x_h_sse2
-(
-    const unsigned char *ref_ptr,
-    int ref_pixels_per_line,
-    const unsigned char *src_ptr,
-    int src_pixels_per_line,
-    unsigned int Height,
-    int *sum,
-    unsigned int *sumsquared
-);
-
-unsigned int vp8_sub_pixel_variance4x4_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-    vp8_filter_block2d_bil4x4_var_mmx(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line,
-        vp8_bilinear_filters_x86_4[xoffset], vp8_bilinear_filters_x86_4[yoffset],
-        &xsum, &xxsum
-    );
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp8_sub_pixel_variance8x8_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum, &xxsum);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum, &xxsum);
-    }
-
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp8_sub_pixel_variance16x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-
-    /* note we could avoid these if statements if the calling function
-     * just called the appropriate functions inside.
-     */
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum0, &xxsum0
-        );
-
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum1, &xxsum1
-        );
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp8_sub_pixel_variance16x8_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-
-)
-{
-    int xsum0, xsum1;
-    unsigned int xxsum0, xxsum1;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance16x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            &xsum0, &xxsum0);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum0, &xxsum0);
-
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr + 8, src_pixels_per_line,
-            dst_ptr + 8, dst_pixels_per_line, 8,
-            xoffset, yoffset,
-            &xsum1, &xxsum1);
-        xsum0 += xsum1;
-        xxsum0 += xxsum1;
-    }
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp8_sub_pixel_variance8x16_wmt
-(
-    const unsigned char  *src_ptr,
-    int  src_pixels_per_line,
-    int  xoffset,
-    int  yoffset,
-    const unsigned char *dst_ptr,
-    int dst_pixels_per_line,
-    unsigned int *sse
-)
-{
-    int xsum;
-    unsigned int xxsum;
-
-    if (xoffset == 4 && yoffset == 0)
-    {
-        vp8_half_horiz_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 0 && yoffset == 4)
-    {
-        vp8_half_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum, &xxsum);
-    }
-    else if (xoffset == 4 && yoffset == 4)
-    {
-        vp8_half_horiz_vert_variance8x_h_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            &xsum, &xxsum);
-    }
-    else
-    {
-        vp8_filter_block2d_bil_var_sse2(
-            src_ptr, src_pixels_per_line,
-            dst_ptr, dst_pixels_per_line, 16,
-            xoffset, yoffset,
-            &xsum, &xxsum);
-    }
-
-    *sse = xxsum;
-    return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_h_wmt(
-    const unsigned char *src_ptr,
-    int  src_pixels_per_line,
-    const unsigned char *dst_ptr,
-    int  dst_pixels_per_line,
-    unsigned int *sse)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    vp8_half_horiz_variance16x_h_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        &xsum0, &xxsum0);
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_v_wmt(
-    const unsigned char *src_ptr,
-    int  src_pixels_per_line,
-    const unsigned char *dst_ptr,
-    int  dst_pixels_per_line,
-    unsigned int *sse)
-{
-    int xsum0;
-    unsigned int xxsum0;
-    vp8_half_vert_variance16x_h_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        &xsum0, &xxsum0);
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-
-unsigned int vp8_variance_halfpixvar16x16_hv_wmt(
-    const unsigned char *src_ptr,
-    int  src_pixels_per_line,
-    const unsigned char *dst_ptr,
-    int  dst_pixels_per_line,
-    unsigned int *sse)
-{
-    int xsum0;
-    unsigned int xxsum0;
-
-    vp8_half_horiz_vert_variance16x_h_sse2(
-        src_ptr, src_pixels_per_line,
-        dst_ptr, dst_pixels_per_line, 16,
-        &xsum0, &xxsum0);
-
-    *sse = xxsum0;
-    return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c
deleted file mode 100644
index d3ab7b1..0000000
--- a/vp8/encoder/arm/neon/subtract_neon.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vp8/encoder/block.h"
-
-void vp8_subtract_b_neon(
-        BLOCK *be,
-        BLOCKD *bd,
-        int pitch) {
-    unsigned char *src_ptr, *predictor;
-    int src_stride;
-    int16_t *src_diff;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    uint16x8_t q10u16, q11u16, q12u16, q13u16;
-
-    src_ptr = *be->base_src + be->src;
-    src_stride = be->src_stride;
-    predictor = bd->predictor;
-
-    d0u8 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d2u8 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d4u8 = vld1_u8(src_ptr);
-    src_ptr += src_stride;
-    d6u8 = vld1_u8(src_ptr);
-
-    d1u8 = vld1_u8(predictor);
-    predictor += pitch;
-    d3u8 = vld1_u8(predictor);
-    predictor += pitch;
-    d5u8 = vld1_u8(predictor);
-    predictor += pitch;
-    d7u8 = vld1_u8(predictor);
-
-    q10u16 = vsubl_u8(d0u8, d1u8);
-    q11u16 = vsubl_u8(d2u8, d3u8);
-    q12u16 = vsubl_u8(d4u8, d5u8);
-    q13u16 = vsubl_u8(d6u8, d7u8);
-
-    src_diff = be->src_diff;
-    vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
-    src_diff += pitch;
-    vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
-    src_diff += pitch;
-    vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
-    src_diff += pitch;
-    vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
-    return;
-}
-
-void vp8_subtract_mby_neon(
-        int16_t *diff,
-        unsigned char *src,
-        int src_stride,
-        unsigned char *pred,
-        int pred_stride) {
-    int i;
-    uint8x16_t q0u8, q1u8, q2u8, q3u8;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
-    for (i = 0; i < 8; i++) {  // subtract_mby_loop
-        q0u8 = vld1q_u8(src);
-        src += src_stride;
-        q2u8 = vld1q_u8(src);
-        src += src_stride;
-        q1u8 = vld1q_u8(pred);
-        pred += pred_stride;
-        q3u8 = vld1q_u8(pred);
-        pred += pred_stride;
-
-        q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
-        q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
-        q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
-        q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
-
-        vst1q_u16((uint16_t *)diff, q8u16);
-        diff += 8;
-        vst1q_u16((uint16_t *)diff, q9u16);
-        diff += 8;
-        vst1q_u16((uint16_t *)diff, q10u16);
-        diff += 8;
-        vst1q_u16((uint16_t *)diff, q11u16);
-        diff += 8;
-    }
-    return;
-}
-
-void vp8_subtract_mbuv_neon(
-        int16_t *diff,
-        unsigned char *usrc,
-        unsigned char *vsrc,
-        int src_stride,
-        unsigned char *upred,
-        unsigned char *vpred,
-        int pred_stride) {
-    int i, j;
-    unsigned char *src_ptr, *pred_ptr;
-    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
-    uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
-    diff += 256;
-    for (i = 0; i < 2; i++) {
-        if (i == 0) {
-            src_ptr = usrc;
-            pred_ptr = upred;
-        } else if (i == 1) {
-            src_ptr = vsrc;
-            pred_ptr = vpred;
-        }
-
-        for (j = 0; j < 2; j++) {
-            d0u8 = vld1_u8(src_ptr);
-            src_ptr += src_stride;
-            d1u8 = vld1_u8(pred_ptr);
-            pred_ptr += pred_stride;
-            d2u8 = vld1_u8(src_ptr);
-            src_ptr += src_stride;
-            d3u8 = vld1_u8(pred_ptr);
-            pred_ptr += pred_stride;
-            d4u8 = vld1_u8(src_ptr);
-            src_ptr += src_stride;
-            d5u8 = vld1_u8(pred_ptr);
-            pred_ptr += pred_stride;
-            d6u8 = vld1_u8(src_ptr);
-            src_ptr += src_stride;
-            d7u8 = vld1_u8(pred_ptr);
-            pred_ptr += pred_stride;
-
-            q8u16  = vsubl_u8(d0u8, d1u8);
-            q9u16  = vsubl_u8(d2u8, d3u8);
-            q10u16 = vsubl_u8(d4u8, d5u8);
-            q11u16 = vsubl_u8(d6u8, d7u8);
-
-            vst1q_u16((uint16_t *)diff, q8u16);
-            diff += 8;
-            vst1q_u16((uint16_t *)diff, q9u16);
-            diff += 8;
-            vst1q_u16((uint16_t *)diff, q10u16);
-            diff += 8;
-            vst1q_u16((uint16_t *)diff, q11u16);
-            diff += 8;
-        }
-    }
-    return;
-}
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index dfd0a23..cf180c1 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_config.h"
 #include "vp8_rtcd.h"
@@ -19,78 +20,29 @@
 #include "vpx_mem/vpx_mem.h"
 #include "rdopt.h"
 
-void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *src_ptr = (*(be->base_src) + be->src);
-    short *diff_ptr = be->src_diff;
-    unsigned char *pred_ptr = bd->predictor;
-    int src_stride = be->src_stride;
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
+  unsigned char *src_ptr = (*(be->base_src) + be->src);
+  short *diff_ptr = be->src_diff;
+  unsigned char *pred_ptr = bd->predictor;
+  int src_stride = be->src_stride;
 
-    int r, c;
-
-    for (r = 0; r < 4; r++)
-    {
-        for (c = 0; c < 4; c++)
-        {
-            diff_ptr[c] = src_ptr[c] - pred_ptr[c];
-        }
-
-        diff_ptr += pitch;
-        pred_ptr += pitch;
-        src_ptr  += src_stride;
-    }
+  vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
+                     pred_ptr, pitch);
 }
 
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
                          int src_stride, unsigned char *upred,
-                         unsigned char *vpred, int pred_stride)
-{
-    short *udiff = diff + 256;
-    short *vdiff = diff + 320;
+                         unsigned char *vpred, int pred_stride) {
+  short *udiff = diff + 256;
+  short *vdiff = diff + 320;
 
-    int r, c;
-
-    for (r = 0; r < 8; r++)
-    {
-        for (c = 0; c < 8; c++)
-        {
-            udiff[c] = usrc[c] - upred[c];
-        }
-
-        udiff += 8;
-        upred += pred_stride;
-        usrc  += src_stride;
-    }
-
-    for (r = 0; r < 8; r++)
-    {
-        for (c = 0; c < 8; c++)
-        {
-            vdiff[c] = vsrc[c] - vpred[c];
-        }
-
-        vdiff += 8;
-        vpred += pred_stride;
-        vsrc  += src_stride;
-    }
+  vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
+  vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
 }
 
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
-                        unsigned char *pred, int pred_stride)
-{
-    int r, c;
-
-    for (r = 0; r < 16; r++)
-    {
-        for (c = 0; c < 16; c++)
-        {
-            diff[c] = src[c] - pred[c];
-        }
-
-        diff += 16;
-        pred += pred_stride;
-        src  += src_stride;
-    }
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride) {
+  vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
 }
 
 static void vp8_subtract_mb(MACROBLOCK *x)
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 0b3ec87..10b3d86 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -19,6 +19,13 @@
 #endif
 void vp8_encode_inter16x16(MACROBLOCK *x);
 
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+                       int src_stride, unsigned char *upred,
+                       unsigned char *vpred, int pred_stride);
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+                      unsigned char *pred, int pred_stride);
+
 void vp8_build_dcblock(MACROBLOCK *b);
 void vp8_transform_mb(MACROBLOCK *mb);
 void vp8_transform_mbuv(MACROBLOCK *x);
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 3deb4ab..4c2acc7 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -16,7 +16,7 @@
 #include "./vpx_scale_rtcd.h"
 #include "block.h"
 #include "onyx_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "encodeintra.h"
 #include "vp8/common/setupintrarecon.h"
 #include "vp8/common/systemdependent.h"
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index f284f7c..1694af8 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -13,7 +13,7 @@
 #define VP8_ENCODER_MCOMP_H_
 
 #include "block.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 40e29e1..c2a7ac4 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -2132,17 +2132,17 @@
 
     cpi->fn_ptr[BLOCK_16X16].sdf            = vpx_sad16x16;
     cpi->fn_ptr[BLOCK_16X16].vf             = vpx_variance16x16;
-    cpi->fn_ptr[BLOCK_16X16].svf            = vp8_sub_pixel_variance16x16;
-    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vp8_variance_halfpixvar16x16_h;
-    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vp8_variance_halfpixvar16x16_v;
-    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vp8_variance_halfpixvar16x16_hv;
+    cpi->fn_ptr[BLOCK_16X16].svf            = vpx_sub_pixel_variance16x16;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_h  = vpx_variance_halfpixvar16x16_h;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_v  = vpx_variance_halfpixvar16x16_v;
+    cpi->fn_ptr[BLOCK_16X16].svf_halfpix_hv = vpx_variance_halfpixvar16x16_hv;
     cpi->fn_ptr[BLOCK_16X16].sdx3f          = vpx_sad16x16x3;
     cpi->fn_ptr[BLOCK_16X16].sdx8f          = vpx_sad16x16x8;
     cpi->fn_ptr[BLOCK_16X16].sdx4df         = vpx_sad16x16x4d;
 
     cpi->fn_ptr[BLOCK_16X8].sdf            = vpx_sad16x8;
     cpi->fn_ptr[BLOCK_16X8].vf             = vpx_variance16x8;
-    cpi->fn_ptr[BLOCK_16X8].svf            = vp8_sub_pixel_variance16x8;
+    cpi->fn_ptr[BLOCK_16X8].svf            = vpx_sub_pixel_variance16x8;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_16X8].svf_halfpix_hv = NULL;
@@ -2152,7 +2152,7 @@
 
     cpi->fn_ptr[BLOCK_8X16].sdf            = vpx_sad8x16;
     cpi->fn_ptr[BLOCK_8X16].vf             = vpx_variance8x16;
-    cpi->fn_ptr[BLOCK_8X16].svf            = vp8_sub_pixel_variance8x16;
+    cpi->fn_ptr[BLOCK_8X16].svf            = vpx_sub_pixel_variance8x16;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_8X16].svf_halfpix_hv = NULL;
@@ -2162,7 +2162,7 @@
 
     cpi->fn_ptr[BLOCK_8X8].sdf            = vpx_sad8x8;
     cpi->fn_ptr[BLOCK_8X8].vf             = vpx_variance8x8;
-    cpi->fn_ptr[BLOCK_8X8].svf            = vp8_sub_pixel_variance8x8;
+    cpi->fn_ptr[BLOCK_8X8].svf            = vpx_sub_pixel_variance8x8;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_8X8].svf_halfpix_hv = NULL;
@@ -2172,7 +2172,7 @@
 
     cpi->fn_ptr[BLOCK_4X4].sdf            = vpx_sad4x4;
     cpi->fn_ptr[BLOCK_4X4].vf             = vpx_variance4x4;
-    cpi->fn_ptr[BLOCK_4X4].svf            = vp8_sub_pixel_variance4x4;
+    cpi->fn_ptr[BLOCK_4X4].svf            = vpx_sub_pixel_variance4x4;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_h  = NULL;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_v  = NULL;
     cpi->fn_ptr[BLOCK_4X4].svf_halfpix_hv = NULL;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index c48e2f4..6fe8f23 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -18,7 +18,7 @@
 #include "treewriter.h"
 #include "tokenize.h"
 #include "vp8/common/onyxc_int.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "encodemb.h"
 #include "quantize.h"
 #include "vp8/common/entropy.h"
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 053bf11..f3443db 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -22,7 +22,7 @@
 #include "encodemb.h"
 #include "vp8/common/reconinter.h"
 #include "vp8/common/reconintra4x4.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 17194f0..edd6c58 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -29,7 +29,7 @@
 #include "vp8/common/quant_common.h"
 #include "encodemb.h"
 #include "quantize.h"
-#include "vp8/common/variance.h"
+#include "vpx_dsp/variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
 #include "vpx_mem/vpx_mem.h"
@@ -500,9 +500,9 @@
 
     if ((mv_row | mv_col) & 7)
     {
-        vp8_sub_pixel_variance8x8(uptr, pre_stride,
+        vpx_sub_pixel_variance8x8(uptr, pre_stride,
             mv_col & 7, mv_row & 7, upred_ptr, uv_stride, &sse2);
-        vp8_sub_pixel_variance8x8(vptr, pre_stride,
+        vpx_sub_pixel_variance8x8(vptr, pre_stride,
             mv_col & 7, mv_row & 7, vpred_ptr, uv_stride, &sse1);
         sse2 += sse1;
     }
diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c
index 448217f..14282db 100644
--- a/vp8/encoder/x86/quantize_ssse3.c
+++ b/vp8/encoder/x86/quantize_ssse3.c
@@ -17,7 +17,7 @@
 #include <intrin.h>
 #pragma intrinsic(_BitScanReverse)
 static int bsr(int mask) {
-  int eob;
+  unsigned long eob;
   _BitScanReverse(&eob, mask);
   eob++;
   if (mask == 0)
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
deleted file mode 100644
index 794dd22..0000000
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ /dev/null
@@ -1,223 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp8_subtract_b_mmx_impl) PRIVATE
-sym(vp8_subtract_b_mmx_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi],      mm0
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],mm0
-
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*4],        mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    [rdi+rcx*2],        mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
-;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_mmx) PRIVATE
-sym(vp8_subtract_mby_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push rsi
-    push rdi
-    ; end prolog
-
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;src
-    movsxd      rdx,        dword ptr arg(2);src_stride
-    mov         rax,        arg(3)          ;pred
-    push        rbx
-    movsxd      rbx,        dword ptr arg(4);pred_stride
-
-    pxor        mm0,        mm0
-    mov         rcx,        16
-
-
-.submby_loop:
-    movq        mm1,        [rsi]
-    movq        mm3,        [rax]
-
-    movq        mm2,        mm1
-    movq        mm4,        mm3
-
-    punpcklbw   mm1,        mm0
-    punpcklbw   mm3,        mm0
-
-    punpckhbw   mm2,        mm0
-    punpckhbw   mm4,        mm0
-
-    psubw       mm1,        mm3
-    psubw       mm2,        mm4
-
-    movq        [rdi],      mm1
-    movq        [rdi+8],    mm2
-
-    movq        mm1,        [rsi+8]
-    movq        mm3,        [rax+8]
-
-    movq        mm2,        mm1
-    movq        mm4,        mm3
-
-    punpcklbw   mm1,        mm0
-    punpcklbw   mm3,        mm0
-
-    punpckhbw   mm2,        mm0
-    punpckhbw   mm4,        mm0
-
-    psubw       mm1,        mm3
-    psubw       mm2,        mm4
-
-    movq        [rdi+16],   mm1
-    movq        [rdi+24],   mm2
-    add         rdi,        32
-    lea         rax,        [rax+rbx]
-    lea         rsi,        [rsi+rdx]
-    dec         rcx
-    jnz         .submby_loop
-
-    pop rbx
-    pop rdi
-    pop rsi
-    ; begin epilog
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-
-global sym(vp8_subtract_mbuv_mmx) PRIVATE
-sym(vp8_subtract_mbuv_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    push rsi
-    push rdi
-    ; end prolog
-
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;usrc
-    movsxd      rdx,        dword ptr arg(3);src_stride;
-    mov         rax,        arg(4)          ;upred
-    add         rdi,        256*2           ;diff = diff + 256 (shorts)
-    mov         rcx,        8
-    push        rbx
-    movsxd      rbx,        dword ptr arg(6);pred_stride
-
-    pxor        mm7,        mm7
-
-.submbu_loop:
-    movq        mm0,        [rsi]
-    movq        mm1,        [rax]
-    movq        mm3,        mm0
-    movq        mm4,        mm1
-    punpcklbw   mm0,        mm7
-    punpcklbw   mm1,        mm7
-    punpckhbw   mm3,        mm7
-    punpckhbw   mm4,        mm7
-    psubw       mm0,        mm1
-    psubw       mm3,        mm4
-    movq        [rdi],      mm0
-    movq        [rdi+8],    mm3
-    add         rdi, 16
-    add         rsi, rdx
-    add         rax, rbx
-
-    dec         rcx
-    jnz         .submbu_loop
-
-    mov         rsi,        arg(2)          ;vsrc
-    mov         rax,        arg(5)          ;vpred
-    mov         rcx,        8
-
-.submbv_loop:
-    movq        mm0,        [rsi]
-    movq        mm1,        [rax]
-    movq        mm3,        mm0
-    movq        mm4,        mm1
-    punpcklbw   mm0,        mm7
-    punpcklbw   mm1,        mm7
-    punpckhbw   mm3,        mm7
-    punpckhbw   mm4,        mm7
-    psubw       mm0,        mm1
-    psubw       mm3,        mm4
-    movq        [rdi],      mm0
-    movq        [rdi+8],    mm3
-    add         rdi, 16
-    add         rsi, rdx
-    add         rax, rbx
-
-    dec         rcx
-    jnz         .submbv_loop
-
-    pop         rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
deleted file mode 100644
index a5d17f5..0000000
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,245 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-;                            short *diff, unsigned char *Predictor,
-;                            int pitch);
-global sym(vp8_subtract_b_sse2_impl) PRIVATE
-sym(vp8_subtract_b_sse2_impl):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-        mov     rdi,        arg(2) ;diff
-        mov     rax,        arg(3) ;Predictor
-        mov     rsi,        arg(0) ;z
-        movsxd  rdx,        dword ptr arg(1);src_stride;
-        movsxd  rcx,        dword ptr arg(4);pitch
-        pxor    mm7,        mm7
-
-        movd    mm0,        [rsi]
-        movd    mm1,        [rax]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi],      mm0
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-        movd    mm0,        [rsi+rdx*2]
-        movd    mm1,        [rax+rcx*2]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*4], mm0
-
-        lea     rsi,        [rsi+rdx*2]
-        lea     rcx,        [rcx+rcx*2]
-
-        movd    mm0,        [rsi+rdx]
-        movd    mm1,        [rax+rcx]
-        punpcklbw   mm0,    mm7
-        punpcklbw   mm1,    mm7
-        psubw   mm0,        mm1
-        movq    MMWORD PTR [rdi+rcx*2], mm0
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
-;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_sse2) PRIVATE
-sym(vp8_subtract_mby_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;src
-    movsxd      rdx,        dword ptr arg(2);src_stride
-    mov         rax,        arg(3)          ;pred
-    movdqa      xmm4,       [GLOBAL(t80)]
-    push        rbx
-    mov         rcx,        8               ; do two lines at one time
-    movsxd      rbx,        dword ptr arg(4);pred_stride
-
-.submby_loop:
-    movdqa      xmm0,       [rsi]           ; src
-    movdqa      xmm1,       [rax]           ; pred
-
-    movdqa      xmm2,       xmm0
-    psubb       xmm0,       xmm1
-
-    pxor        xmm1,       xmm4            ;convert to signed values
-    pxor        xmm2,       xmm4
-    pcmpgtb     xmm1,       xmm2            ; obtain sign information
-
-    movdqa      xmm2,       xmm0
-    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
-    punpckhbw   xmm2,       xmm1            ; put sign back to subtraction
-
-    movdqa      xmm3,       [rsi + rdx]
-    movdqa      xmm5,       [rax + rbx]
-
-    lea         rsi,        [rsi+rdx*2]
-    lea         rax,        [rax+rbx*2]
-
-    movdqa      [rdi],      xmm0
-    movdqa      [rdi +16],  xmm2
-
-    movdqa      xmm1,       xmm3
-    psubb       xmm3,       xmm5
-
-    pxor        xmm5,       xmm4            ;convert to signed values
-    pxor        xmm1,       xmm4
-    pcmpgtb     xmm5,       xmm1            ; obtain sign information
-
-    movdqa      xmm1,       xmm3
-    punpcklbw   xmm3,       xmm5            ; put sign back to subtraction
-    punpckhbw   xmm1,       xmm5            ; put sign back to subtraction
-
-    movdqa      [rdi +32],  xmm3
-    movdqa      [rdi +48],  xmm1
-
-    add         rdi,        64
-    dec         rcx
-    jnz         .submby_loop
-
-    pop rbx
-    pop rdi
-    pop rsi
-    ; begin epilog
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
-;                         int src_stride, unsigned char *upred,
-;                         unsigned char *vpred, int pred_stride)
-global sym(vp8_subtract_mbuv_sse2) PRIVATE
-sym(vp8_subtract_mbuv_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 7
-    GET_GOT     rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-    movdqa      xmm4,       [GLOBAL(t80)]
-    mov         rdi,        arg(0)          ;diff
-    mov         rsi,        arg(1)          ;usrc
-    movsxd      rdx,        dword ptr arg(3);src_stride;
-    mov         rax,        arg(4)          ;upred
-    add         rdi,        256*2           ;diff = diff + 256 (shorts)
-    mov         rcx,        4
-    push        rbx
-    movsxd      rbx,        dword ptr arg(6);pred_stride
-
-    ;u
-.submbu_loop:
-    movq        xmm0,       [rsi]           ; src
-    movq        xmm2,       [rsi+rdx]       ; src -- next line
-    movq        xmm1,       [rax]           ; pred
-    movq        xmm3,       [rax+rbx]       ; pred -- next line
-    lea         rsi,        [rsi + rdx*2]
-    lea         rax,        [rax + rbx*2]
-
-    punpcklqdq  xmm0,       xmm2
-    punpcklqdq  xmm1,       xmm3
-
-    movdqa      xmm2,       xmm0
-    psubb       xmm0,       xmm1            ; subtraction with sign missed
-
-    pxor        xmm1,       xmm4            ;convert to signed values
-    pxor        xmm2,       xmm4
-    pcmpgtb     xmm1,       xmm2            ; obtain sign information
-
-    movdqa      xmm2,       xmm0
-    movdqa      xmm3,       xmm1
-    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
-    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
-
-    movdqa      [rdi],      xmm0            ; store difference
-    movdqa      [rdi +16],  xmm2            ; store difference
-    add         rdi,        32
-    sub         rcx, 1
-    jnz         .submbu_loop
-
-    mov         rsi,        arg(2)          ;vsrc
-    mov         rax,        arg(5)          ;vpred
-    mov         rcx,        4
-
-    ;v
-.submbv_loop:
-    movq        xmm0,       [rsi]           ; src
-    movq        xmm2,       [rsi+rdx]       ; src -- next line
-    movq        xmm1,       [rax]           ; pred
-    movq        xmm3,       [rax+rbx]       ; pred -- next line
-    lea         rsi,        [rsi + rdx*2]
-    lea         rax,        [rax + rbx*2]
-
-    punpcklqdq  xmm0,       xmm2
-    punpcklqdq  xmm1,       xmm3
-
-    movdqa      xmm2,       xmm0
-    psubb       xmm0,       xmm1            ; subtraction with sign missed
-
-    pxor        xmm1,       xmm4            ;convert to signed values
-    pxor        xmm2,       xmm4
-    pcmpgtb     xmm1,       xmm2            ; obtain sign information
-
-    movdqa      xmm2,       xmm0
-    movdqa      xmm3,       xmm1
-    punpcklbw   xmm0,       xmm1            ; put sign back to subtraction
-    punpckhbw   xmm2,       xmm3            ; put sign back to subtraction
-
-    movdqa      [rdi],      xmm0            ; store difference
-    movdqa      [rdi +16],  xmm2            ; store difference
-    add         rdi,        32
-    sub         rcx, 1
-    jnz         .submbv_loop
-
-    pop         rbx
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-SECTION_RODATA
-align 16
-t80:
-    times 16 db 0x80
diff --git a/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/vp8/encoder/x86/vp8_enc_stubs_mmx.c
index cf3d8ca..7bf5155 100644
--- a/vp8/encoder/x86/vp8_enc_stubs_mmx.c
+++ b/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -65,14 +65,3 @@
     return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
 }
 
-void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *z = *(be->base_src) + be->src;
-    unsigned int  src_stride = be->src_stride;
-    short *diff = &be->src_diff[0];
-    unsigned char *predictor = &bd->predictor[0];
-    vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
diff --git a/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/vp8/encoder/x86/vp8_enc_stubs_sse2.c
index 3dfbee3..be9aaf3 100644
--- a/vp8/encoder/x86/vp8_enc_stubs_sse2.c
+++ b/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -30,14 +30,3 @@
     return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
 }
 
-void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
-                             short *diff, unsigned char *predictor,
-                             int pitch);
-void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
-{
-    unsigned char *z = *(be->base_src) + be->src;
-    unsigned int  src_stride = be->src_stride;
-    short *diff = &be->src_diff[0];
-    unsigned char *predictor = &bd->predictor[0];
-    vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 236d8a5..47ad97c 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -63,8 +63,6 @@
 VP8_COMMON_SRCS-yes += common/reconintra4x4.c
 VP8_COMMON_SRCS-yes += common/setupintrarecon.c
 VP8_COMMON_SRCS-yes += common/swapyv12buffer.c
-VP8_COMMON_SRCS-yes += common/variance_c.c
-VP8_COMMON_SRCS-yes += common/variance.h
 VP8_COMMON_SRCS-yes += common/vp8_entropymodedata.h
 
 
@@ -86,8 +84,6 @@
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/recon_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_mmx.c
-VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp8_variance_impl_mmx.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/copy_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idct_blk_sse2.c
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm
@@ -96,12 +92,8 @@
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp8_variance_sse2.c
-VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/variance_impl_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE3) += common/x86/copy_sse3.asm
 VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/subpixel_ssse3.asm
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_ssse3.c
-VP8_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/variance_impl_ssse3.asm
 
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
@@ -122,10 +114,13 @@
 VP8_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/dequantize_dspr2.c
 
 # common (c)
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/idct_msa.c
+VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp8_macros_msa.h
+
+# common (c)
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/filter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/dequantize_arm.c
-VP8_COMMON_SRCS-$(ARCH_ARM)  += common/arm/variance_arm.c
 
 # common (media)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/bilinearfilter_arm.c
@@ -145,9 +140,6 @@
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequant_idct_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/dequantize_v6$(ASM)
 VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/idct_blk_v6.c
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6$(ASM)
-VP8_COMMON_SRCS-$(HAVE_MEDIA)  += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM)
 
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/bilinearpredict_neon.c
@@ -166,6 +158,5 @@
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/reconintra_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/shortidct4x4llm_neon.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/sixtappredict_neon.c
-VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp8_subpixelvariance_neon.c
 
 $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 5e4ef05..99d40ec 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -82,7 +82,6 @@
 endif
 
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/dct_mmx.asm
-VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/subtract_mmx.asm
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
@@ -94,7 +93,6 @@
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
 endif
 
-VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 0b0f6a7..838b53d 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -25,5 +25,4 @@
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/denoising_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/fastquantizeb_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/shortfdct_neon.c
-VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/subtract_neon.c
 VP8_CX_SRCS-$(HAVE_NEON)  += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
diff --git a/vp9/common/mips/msa/vp9_convolve_avg_msa.c b/vp9/common/mips/msa/vp9_convolve_avg_msa.c
index eb87760..7c11e40 100644
--- a/vp9/common/mips/msa/vp9_convolve_avg_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_avg_msa.c
@@ -8,7 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride, int32_t height) {
diff --git a/vp9/common/mips/msa/vp9_convolve_copy_msa.c b/vp9/common/mips/msa/vp9_convolve_copy_msa.c
index 7a292c5..39a0b24 100644
--- a/vp9/common/mips/msa/vp9_convolve_copy_msa.c
+++ b/vp9/common/mips/msa/vp9_convolve_copy_msa.c
@@ -9,7 +9,7 @@
  */
 
 #include <string.h>
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride, int32_t height) {
diff --git a/vp9/common/mips/msa/vp9_convolve_msa.h b/vp9/common/mips/msa/vp9_convolve_msa.h
index 40fe94d..71c616b 100644
--- a/vp9/common/mips/msa/vp9_convolve_msa.h
+++ b/vp9/common/mips/msa/vp9_convolve_msa.h
@@ -12,7 +12,7 @@
 #define VP9_COMMON_MIPS_MSA_VP9_CONVOLVE_MSA_H_
 
 #include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 extern const uint8_t mc_filt_mask_arr[16 * 3];
 
diff --git a/vp9/common/mips/msa/vp9_idct_msa.h b/vp9/common/mips/msa/vp9_idct_msa.h
index 60e27fc..c86e65a 100644
--- a/vp9/common/mips/msa/vp9_idct_msa.h
+++ b/vp9/common/mips/msa/vp9_idct_msa.h
@@ -13,7 +13,7 @@
 
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 #define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {  \
   v8i16 k0_m = __msa_fill_h(cnst0);                                  \
diff --git a/vp9/common/mips/msa/vp9_intra_predict_msa.c b/vp9/common/mips/msa/vp9_intra_predict_msa.c
index 2fc6105..abf2704 100644
--- a/vp9/common/mips/msa/vp9_intra_predict_msa.c
+++ b/vp9/common/mips/msa/vp9_intra_predict_msa.c
@@ -9,7 +9,7 @@
  */
 
 #include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 #define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) {  \
   out0 = __msa_subs_u_h(out0, in0);                \
diff --git a/vp9/common/mips/msa/vp9_loopfilter_msa.h b/vp9/common/mips/msa/vp9_loopfilter_msa.h
index 0643e41..bfbe870 100644
--- a/vp9/common/mips/msa/vp9_loopfilter_msa.h
+++ b/vp9/common/mips/msa/vp9_loopfilter_msa.h
@@ -11,7 +11,7 @@
 #ifndef VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
 #define VP9_COMMON_MIPS_MSA_VP9_LOOPFILTER_MSA_H_
 
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 #define VP9_LPF_FILTER4_8W(p1_in, p0_in, q0_in, q1_in, mask_in, hev_in,  \
                            p1_out, p0_out, q0_out, q1_out) {             \
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
deleted file mode 100644
index e008eaf..0000000
--- a/vp9/common/mips/msa/vp9_macros_msa.h
+++ /dev/null
@@ -1,1885 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
-#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
-
-#include <msa.h>
-
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
-
-#define LD_B(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
-#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)
-
-#define LD_H(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
-#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
-
-#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
-#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
-
-#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
-#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
-
-#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
-
-#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
-#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
-
-#if (__mips_isa_rev >= 6)
-#define LH(psrc) ({                                 \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
-  uint16_t val_m;                                   \
-                                                    \
-  __asm__ __volatile__ (                            \
-      "lh  %[val_m],  %[psrc_m]  \n\t"              \
-                                                    \
-      : [val_m] "=r" (val_m)                        \
-      : [psrc_m] "m" (*psrc_m)                      \
-  );                                                \
-                                                    \
-  val_m;                                            \
-})
-
-#define LW(psrc) ({                                 \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
-  uint32_t val_m;                                   \
-                                                    \
-  __asm__ __volatile__ (                            \
-      "lw  %[val_m],  %[psrc_m]  \n\t"              \
-                                                    \
-      : [val_m] "=r" (val_m)                        \
-      : [psrc_m] "m" (*psrc_m)                      \
-  );                                                \
-                                                    \
-  val_m;                                            \
-})
-
-#if (__mips == 64)
-#define LD(psrc) ({                                 \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
-  uint64_t val_m = 0;                               \
-                                                    \
-  __asm__ __volatile__ (                            \
-      "ld  %[val_m],  %[psrc_m]  \n\t"              \
-                                                    \
-      : [val_m] "=r" (val_m)                        \
-      : [psrc_m] "m" (*psrc_m)                      \
-  );                                                \
-                                                    \
-  val_m;                                            \
-})
-#else  // !(__mips == 64)
-#define LD(psrc) ({                                        \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
-  uint32_t val0_m, val1_m;                                 \
-  uint64_t val_m = 0;                                      \
-                                                           \
-  val0_m = LW(psrc_m);                                     \
-  val1_m = LW(psrc_m + 4);                                 \
-                                                           \
-  val_m = (uint64_t)(val1_m);                              \
-  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
-  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
-                                                           \
-  val_m;                                                   \
-})
-#endif  // (__mips == 64)
-
-#define SH(val, pdst) {                 \
-  uint8_t *pdst_m = (uint8_t *)(pdst);  \
-  const uint16_t val_m = (val);         \
-                                        \
-  __asm__ __volatile__ (                \
-      "sh  %[val_m],  %[pdst_m]  \n\t"  \
-                                        \
-      : [pdst_m] "=m" (*pdst_m)         \
-      : [val_m] "r" (val_m)             \
-  );                                    \
-}
-
-#define SW(val, pdst) {                 \
-  uint8_t *pdst_m = (uint8_t *)(pdst);  \
-  const uint32_t val_m = (val);         \
-                                        \
-  __asm__ __volatile__ (                \
-      "sw  %[val_m],  %[pdst_m]  \n\t"  \
-                                        \
-      : [pdst_m] "=m" (*pdst_m)         \
-      : [val_m] "r" (val_m)             \
-  );                                    \
-}
-
-#define SD(val, pdst) {                 \
-  uint8_t *pdst_m = (uint8_t *)(pdst);  \
-  const uint64_t val_m = (val);         \
-                                        \
-  __asm__ __volatile__ (                \
-      "sd  %[val_m],  %[pdst_m]  \n\t"  \
-                                        \
-      : [pdst_m] "=m" (*pdst_m)         \
-      : [val_m] "r" (val_m)             \
-  );                                    \
-}
-#else  // !(__mips_isa_rev >= 6)
-#define LH(psrc) ({                                 \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
-  uint16_t val_m;                                   \
-                                                    \
-  __asm__ __volatile__ (                            \
-      "ulh  %[val_m],  %[psrc_m]  \n\t"             \
-                                                    \
-      : [val_m] "=r" (val_m)                        \
-      : [psrc_m] "m" (*psrc_m)                      \
-  );                                                \
-                                                    \
-  val_m;                                            \
-})
-
-#define LW(psrc) ({                                 \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
-  uint32_t val_m;                                   \
-                                                    \
-  __asm__ __volatile__ (                            \
-      "ulw  %[val_m],  %[psrc_m]  \n\t"             \
-                                                    \
-      : [val_m] "=r" (val_m)                        \
-      : [psrc_m] "m" (*psrc_m)                      \
-  );                                                \
-                                                    \
-  val_m;                                            \
-})
-
-#if (__mips == 64)
-#define LD(psrc) ({                                 \
-  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
-  uint64_t val_m = 0;                               \
-                                                    \
-  __asm__ __volatile__ (                            \
-      "uld  %[val_m],  %[psrc_m]  \n\t"             \
-                                                    \
-      : [val_m] "=r" (val_m)                        \
-      : [psrc_m] "m" (*psrc_m)                      \
-  );                                                \
-                                                    \
-  val_m;                                            \
-})
-#else  // !(__mips == 64)
-#define LD(psrc) ({                                        \
-  const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
-  uint32_t val0_m, val1_m;                                 \
-  uint64_t val_m = 0;                                      \
-                                                           \
-  val0_m = LW(psrc_m1);                                    \
-  val1_m = LW(psrc_m1 + 4);                                \
-                                                           \
-  val_m = (uint64_t)(val1_m);                              \
-  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
-  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
-                                                           \
-  val_m;                                                   \
-})
-#endif  // (__mips == 64)
-
-#define SH(val, pdst) {                  \
-  uint8_t *pdst_m = (uint8_t *)(pdst);   \
-  const uint16_t val_m = (val);          \
-                                         \
-  __asm__ __volatile__ (                 \
-      "ush  %[val_m],  %[pdst_m]  \n\t"  \
-                                         \
-      : [pdst_m] "=m" (*pdst_m)          \
-      : [val_m] "r" (val_m)              \
-  );                                     \
-}
-
-#define SW(val, pdst) {                  \
-  uint8_t *pdst_m = (uint8_t *)(pdst);   \
-  const uint32_t val_m = (val);          \
-                                         \
-  __asm__ __volatile__ (                 \
-      "usw  %[val_m],  %[pdst_m]  \n\t"  \
-                                         \
-      : [pdst_m] "=m" (*pdst_m)          \
-      : [val_m] "r" (val_m)              \
-  );                                     \
-}
-
-#define SD(val, pdst) {                                     \
-  uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
-  uint32_t val0_m, val1_m;                                  \
-                                                            \
-  val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
-  val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
-                                                            \
-  SW(val0_m, pdst_m1);                                      \
-  SW(val1_m, pdst_m1 + 4);                                  \
-}
-#endif  // (__mips_isa_rev >= 6)
-
-/* Description : Load 4 words with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1, out2, out3
-   Details     : Load word in 'out0' from (psrc)
-                 Load word in 'out1' from (psrc + stride)
-                 Load word in 'out2' from (psrc + 2 * stride)
-                 Load word in 'out3' from (psrc + 3 * stride)
-*/
-#define LW4(psrc, stride, out0, out1, out2, out3) {  \
-  out0 = LW((psrc));                                 \
-  out1 = LW((psrc) + stride);                        \
-  out2 = LW((psrc) + 2 * stride);                    \
-  out3 = LW((psrc) + 3 * stride);                    \
-}
-
-/* Description : Load double words with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-   Details     : Load double word in 'out0' from (psrc)
-                 Load double word in 'out1' from (psrc + stride)
-*/
-#define LD2(psrc, stride, out0, out1) {  \
-  out0 = LD((psrc));                     \
-  out1 = LD((psrc) + stride);            \
-}
-#define LD4(psrc, stride, out0, out1, out2, out3) {  \
-  LD2((psrc), stride, out0, out1);                   \
-  LD2((psrc) + 2 * stride, stride, out2, out3);      \
-}
-
-/* Description : Store 4 words with stride
-   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
-   Details     : Store word from 'in0' to (pdst)
-                 Store word from 'in1' to (pdst + stride)
-                 Store word from 'in2' to (pdst + 2 * stride)
-                 Store word from 'in3' to (pdst + 3 * stride)
-*/
-#define SW4(in0, in1, in2, in3, pdst, stride) {  \
-  SW(in0, (pdst))                                \
-  SW(in1, (pdst) + stride);                      \
-  SW(in2, (pdst) + 2 * stride);                  \
-  SW(in3, (pdst) + 3 * stride);                  \
-}
-
-/* Description : Store 4 double words with stride
-   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
-   Details     : Store double word from 'in0' to (pdst)
-                 Store double word from 'in1' to (pdst + stride)
-                 Store double word from 'in2' to (pdst + 2 * stride)
-                 Store double word from 'in3' to (pdst + 3 * stride)
-*/
-#define SD4(in0, in1, in2, in3, pdst, stride) {  \
-  SD(in0, (pdst))                                \
-  SD(in1, (pdst) + stride);                      \
-  SD(in2, (pdst) + 2 * stride);                  \
-  SD(in3, (pdst) + 3 * stride);                  \
-}
-
-/* Description : Load vectors with 16 byte elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Load 16 byte elements in 'out0' from (psrc)
-                 Load 16 byte elements in 'out1' from (psrc + stride)
-*/
-#define LD_B2(RTYPE, psrc, stride, out0, out1) {  \
-  out0 = LD_B(RTYPE, (psrc));                     \
-  out1 = LD_B(RTYPE, (psrc) + stride);            \
-}
-#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
-#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
-
-#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
-  LD_B2(RTYPE, (psrc), stride, out0, out1);                   \
-  LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);     \
-}
-#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
-#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
-
-#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
-  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
-  out4 = LD_B(RTYPE, (psrc) + 4 * stride);                          \
-}
-#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
-#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
-
-#define LD_B7(RTYPE, psrc, stride,                             \
-              out0, out1, out2, out3, out4, out5, out6) {      \
-  LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
-  LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
-}
-#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
-
-#define LD_B8(RTYPE, psrc, stride,                                    \
-              out0, out1, out2, out3, out4, out5, out6, out7) {       \
-  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
-  LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
-}
-#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
-#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
-
-/* Description : Load vectors with 8 halfword elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-   Details     : Load 8 halfword elements in 'out0' from (psrc)
-                 Load 8 halfword elements in 'out1' from (psrc + stride)
-*/
-#define LD_H2(RTYPE, psrc, stride, out0, out1) {  \
-  out0 = LD_H(RTYPE, (psrc));                     \
-  out1 = LD_H(RTYPE, (psrc) + (stride));          \
-}
-#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
-
-#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
-  LD_H2(RTYPE, (psrc), stride, out0, out1);                   \
-  LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);      \
-}
-#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
-
-#define LD_H8(RTYPE, psrc, stride,                                    \
-              out0, out1, out2, out3, out4, out5, out6, out7) {       \
-  LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
-  LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
-}
-#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
-
-#define LD_H16(RTYPE, psrc, stride,                                     \
-               out0, out1, out2, out3, out4, out5, out6, out7,          \
-               out8, out9, out10, out11, out12, out13, out14, out15) {  \
-  LD_H8(RTYPE, (psrc), stride,                                          \
-        out0, out1, out2, out3, out4, out5, out6, out7);                \
-  LD_H8(RTYPE, (psrc) + 8 * stride, stride,                             \
-        out8, out9, out10, out11, out12, out13, out14, out15);          \
-}
-#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
-
-/* Description : Load 4x4 block of signed halfword elements from 1D source
-                 data into 4 vectors (Each vector with 4 signed halfwords)
-   Arguments   : Input   - psrc
-                 Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
-  out0 = LD_SH(psrc);                                    \
-  out2 = LD_SH(psrc + 8);                                \
-  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
-  out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
-}
-
-/* Description : Load 2 vectors of signed word elements with stride
-   Arguments   : Inputs  - psrc, stride
-                 Outputs - out0, out1
-                 Return Type - signed word
-*/
-#define LD_SW2(psrc, stride, out0, out1) {  \
-  out0 = LD_SW((psrc));                     \
-  out1 = LD_SW((psrc) + stride);            \
-}
-
-/* Description : Store vectors of 16 byte elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 16 byte elements from 'in0' to (pdst)
-                 Store 16 byte elements from 'in1' to (pdst + stride)
-*/
-#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
-  ST_B(RTYPE, in0, (pdst));                     \
-  ST_B(RTYPE, in1, (pdst) + stride);            \
-}
-#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
-
-#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
-  ST_B2(RTYPE, in0, in1, (pdst), stride);                 \
-  ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
-}
-#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
-
-#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,      \
-              pdst, stride) {                                     \
-  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
-  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
-}
-#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
-
-/* Description : Store vectors of 8 halfword elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 8 halfword elements from 'in0' to (pdst)
-                 Store 8 halfword elements from 'in1' to (pdst + stride)
-*/
-#define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
-  ST_H(RTYPE, in0, (pdst));                     \
-  ST_H(RTYPE, in1, (pdst) + stride);            \
-}
-#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
-
-#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
-  ST_H2(RTYPE, in0, in1, (pdst), stride);                 \
-  ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
-}
-#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
-
-#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) {  \
-  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                           \
-  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);              \
-}
-#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
-
-/* Description : Store vectors of word elements with stride
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Store 4 word elements from 'in0' to (pdst)
-                 Store 4 word elements from 'in1' to (pdst + stride)
-*/
-#define ST_SW2(in0, in1, pdst, stride) {  \
-  ST_SW(in0, (pdst));                     \
-  ST_SW(in1, (pdst) + stride);            \
-}
-
-/* Description : Store 2x4 byte block to destination memory from input vector
-   Arguments   : Inputs - in, stidx, pdst, stride
-   Details     : Index 'stidx' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst)
-                 Index 'stidx+1' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst + stride)
-                 Index 'stidx+2' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst + 2 * stride)
-                 Index 'stidx+3' halfword element from 'in' vector is copied to
-                 the GP register and stored to (pdst + 3 * stride)
-*/
-#define ST2x4_UB(in, stidx, pdst, stride) {         \
-  uint16_t out0_m, out1_m, out2_m, out3_m;          \
-  uint8_t *pblk_2x4_m = (uint8_t *)(pdst);          \
-                                                    \
-  out0_m = __msa_copy_u_h((v8i16)in, (stidx));      \
-  out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1));  \
-  out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2));  \
-  out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3));  \
-                                                    \
-  SH(out0_m, pblk_2x4_m);                           \
-  SH(out1_m, pblk_2x4_m + stride);                  \
-  SH(out2_m, pblk_2x4_m + 2 * stride);              \
-  SH(out3_m, pblk_2x4_m + 3 * stride);              \
-}
-
-/* Description : Store 4x2 byte block to destination memory from input vector
-   Arguments   : Inputs - in, pdst, stride
-   Details     : Index 0 word element from 'in' vector is copied to the GP
-                 register and stored to (pdst)
-                 Index 1 word element from 'in' vector is copied to the GP
-                 register and stored to (pdst + stride)
-*/
-#define ST4x2_UB(in, pdst, stride) {        \
-  uint32_t out0_m, out1_m;                  \
-  uint8_t *pblk_4x2_m = (uint8_t *)(pdst);  \
-                                            \
-  out0_m = __msa_copy_u_w((v4i32)in, 0);    \
-  out1_m = __msa_copy_u_w((v4i32)in, 1);    \
-                                            \
-  SW(out0_m, pblk_4x2_m);                   \
-  SW(out1_m, pblk_4x2_m + stride);          \
-}
-
-/* Description : Store 4x4 byte block to destination memory from input vector
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : 'Idx0' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst)
-                 'Idx1' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst + stride)
-                 'Idx2' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst + 2 * stride)
-                 'Idx3' word element from input vector 'in0' is copied to the
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
-  uint32_t out0_m, out1_m, out2_m, out3_m;                          \
-  uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                          \
-                                                                    \
-  out0_m = __msa_copy_u_w((v4i32)in0, idx0);                        \
-  out1_m = __msa_copy_u_w((v4i32)in0, idx1);                        \
-  out2_m = __msa_copy_u_w((v4i32)in1, idx2);                        \
-  out3_m = __msa_copy_u_w((v4i32)in1, idx3);                        \
-                                                                    \
-  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
-}
-#define ST4x8_UB(in0, in1, pdst, stride) {                        \
-  uint8_t *pblk_4x8 = (uint8_t *)(pdst);                          \
-                                                                  \
-  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
-  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
-}
-
-/* Description : Store 8x1 byte block to destination memory from input vector
-   Arguments   : Inputs - in, pdst
-   Details     : Index 0 double word element from 'in' vector is copied to the
-                 GP register and stored to (pdst)
-*/
-#define ST8x1_UB(in, pdst) {              \
-  uint64_t out0_m;                        \
-                                          \
-  out0_m = __msa_copy_u_d((v2i64)in, 0);  \
-  SD(out0_m, pdst);                       \
-}
-
-/* Description : Store 8x2 byte block to destination memory from input vector
-   Arguments   : Inputs - in, pdst, stride
-   Details     : Index 0 double word element from 'in' vector is copied to the
-                 GP register and stored to (pdst)
-                 Index 1 double word element from 'in' vector is copied to the
-                 GP register and stored to (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride) {        \
-  uint64_t out0_m, out1_m;                  \
-  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
-                                            \
-  out0_m = __msa_copy_u_d((v2i64)in, 0);    \
-  out1_m = __msa_copy_u_d((v2i64)in, 1);    \
-                                            \
-  SD(out0_m, pblk_8x2_m);                   \
-  SD(out1_m, pblk_8x2_m + stride);          \
-}
-
-/* Description : Store 8x4 byte block to destination memory from input
-                 vectors
-   Arguments   : Inputs - in0, in1, pdst, stride
-   Details     : Index 0 double word element from 'in0' vector is copied to the
-                 GP register and stored to (pdst)
-                 Index 1 double word element from 'in0' vector is copied to the
-                 GP register and stored to (pdst + stride)
-                 Index 0 double word element from 'in1' vector is copied to the
-                 GP register and stored to (pdst + 2 * stride)
-                 Index 1 double word element from 'in1' vector is copied to the
-                 GP register and stored to (pdst + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride) {                  \
-  uint64_t out0_m, out1_m, out2_m, out3_m;                  \
-  uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
-                                                            \
-  out0_m = __msa_copy_u_d((v2i64)in0, 0);                   \
-  out1_m = __msa_copy_u_d((v2i64)in0, 1);                   \
-  out2_m = __msa_copy_u_d((v2i64)in1, 0);                   \
-  out3_m = __msa_copy_u_d((v2i64)in1, 1);                   \
-                                                            \
-  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
-}
-
-/* Description : average with rounding (in0 + in1 + 1) / 2.
-   Arguments   : Inputs  - in0, in1, in2, in3,
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned byte element from 'in0' vector is added with
-                 each unsigned byte element from 'in1' vector. Then average
-                 with rounding is calculated and written to 'out0'
-*/
-#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_aver_u_b((v16u8)in0, (v16u8)in1);    \
-  out1 = (RTYPE)__msa_aver_u_b((v16u8)in2, (v16u8)in3);    \
-}
-#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)
-
-#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3) {                       \
-  AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)                \
-  AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)                \
-}
-#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide with zero
-   Arguments   : Inputs  - in0, in1, slide_val
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'zero_m' vector are slide into 'in0' by
-                 value specified in the 'slide_val'
-*/
-#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {          \
-  v16i8 zero_m = { 0 };                                              \
-  out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val);  \
-  out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val);  \
-}
-#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
-
-#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
-                  out0, out1, out2, out3, slide_val) {  \
-  SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);    \
-  SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);    \
-}
-#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
-
-/* Description : Immediate number of elements to slide
-   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'in0_0' vector are slide into 'in1_0' by
-                 value specified in the 'slide_val'
-*/
-#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val) {  \
-  out0 = (RTYPE)__msa_sldi_b((v16i8)in0_0, (v16i8)in1_0, slide_val);         \
-  out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);         \
-}
-#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
-
-#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,      \
-                out0, out1, out2, slide_val) {                        \
-  SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)   \
-  out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);  \
-}
-#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
-#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
-
-/* Description : Shuffle byte vector elements as per mask vector
-   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
-                 'out0' as per control vector 'mask0'
-*/
-#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
-  out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);     \
-  out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);     \
-}
-#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
-#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
-#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
-
-#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,     \
-                out0, out1, out2, out3) {                        \
-  VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
-  VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
-}
-#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
-#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Unsigned byte elements from 'mult0' are multiplied with
-                 unsigned byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. unsigned halfword.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
-  out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);        \
-  out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);        \
-}
-#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
-
-#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
-                 cnst0, cnst1, cnst2, cnst3,                \
-                 out0, out1, out2, out3) {                  \
-  DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
-  DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
-}
-#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Dot product of byte vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed byte elements from 'mult0' are multiplied with
-                 signed byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed halfword.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
-  out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);        \
-  out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);        \
-}
-#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
-
-#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
-                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
-  DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
-  DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
-}
-#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'mult0' are multiplied with
-                 signed halfword elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed word.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
-  out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);        \
-  out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);        \
-}
-#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
-
-#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
-                 cnst0, cnst1, cnst2, cnst3,                \
-                 out0, out1, out2, out3) {                  \
-  DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
-  DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
-}
-#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
-
-/* Description : Dot product of word vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed word elements from 'mult0' are multiplied with
-                 signed word elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed double word.
-                 The multiplication result of adjacent odd-even elements
-                 are added together and written to the 'out0' vector
-*/
-#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
-  out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \
-  out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);        \
-}
-#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
-
-/* Description : Dot product & addition of byte vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed byte elements from 'mult0' are multiplied with
-                 signed byte elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed halfword.
-                 The multiplication result of adjacent odd-even elements
-                 are added to the 'out0' vector
-*/
-#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
-  out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
-  out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
-}
-#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
-
-#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
-                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
-  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
-  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
-}
-#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
-
-/* Description : Dot product & addition of halfword vector elements
-   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'mult0' are multiplied with
-                 signed halfword elements from 'cnst0' producing a result
-                 twice the size of input i.e. signed word.
-                 The multiplication result of adjacent odd-even elements
-                 are added to the 'out0' vector
-*/
-#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
-  out0 = (RTYPE)__msa_dpadd_s_w((v4i32)out0, (v8i16)mult0, (v8i16)cnst0);  \
-  out1 = (RTYPE)__msa_dpadd_s_w((v4i32)out1, (v8i16)mult1, (v8i16)cnst1);  \
-}
-#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)
-
-/* Description : Dot product & addition of double word vector elements
-   Arguments   : Inputs  - mult0, mult1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each signed word element from 'mult0' is multiplied with itself
-                 producing an intermediate result twice the size of input
-                 i.e. signed double word
-                 The multiplication result of adjacent odd-even elements
-                 are added to the 'out0' vector
-*/
-#define DPADD_SD2(RTYPE, mult0, mult1, out0, out1) {                       \
-  out0 = (RTYPE)__msa_dpadd_s_d((v2i64)out0, (v4i32)mult0, (v4i32)mult0);  \
-  out1 = (RTYPE)__msa_dpadd_s_d((v2i64)out1, (v4i32)mult1, (v4i32)mult1);  \
-}
-#define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
-
-/* Description : Minimum values between unsigned elements of
-                 either vector are copied to the output vector
-   Arguments   : Inputs  - in0, in1, min_vec
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Minimum of unsigned halfword element values from 'in0' and
-                 'min_vec' are written to output vector 'in0'
-*/
-#define MIN_UH2(RTYPE, in0, in1, min_vec) {         \
-  in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
-  in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
-}
-#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
-
-#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) {  \
-  MIN_UH2(RTYPE, in0, in1, min_vec);                   \
-  MIN_UH2(RTYPE, in2, in3, min_vec);                   \
-}
-#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Clips all signed halfword elements of input vector
-                 between 0 & 255
-   Arguments   : Input  - in
-                 Output - out_m
-                 Return Type - signed halfword
-*/
-#define CLIP_SH_0_255(in) ({                          \
-  v8i16 max_m = __msa_ldi_h(255);                     \
-  v8i16 out_m;                                        \
-                                                      \
-  out_m = __msa_maxi_s_h((v8i16)in, 0);               \
-  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
-  out_m;                                              \
-})
-#define CLIP_SH2_0_255(in0, in1) {  \
-  in0 = CLIP_SH_0_255(in0);         \
-  in1 = CLIP_SH_0_255(in1);         \
-}
-#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
-  CLIP_SH2_0_255(in0, in1);                   \
-  CLIP_SH2_0_255(in2, in3);                   \
-}
-
-/* Description : Horizontal addition of 4 signed word elements of input vector
-   Arguments   : Input  - in       (signed word vector)
-                 Output - sum_m    (i32 sum)
-                 Return Type - signed word (GP)
-   Details     : 4 signed word elements of 'in' vector are added together and
-                 the resulting integer sum is returned
-*/
-#define HADD_SW_S32(in) ({                        \
-  v2i64 res0_m, res1_m;                           \
-  int32_t sum_m;                                  \
-                                                  \
-  res0_m = __msa_hadd_s_d((v4i32)in, (v4i32)in);  \
-  res1_m = __msa_splati_d(res0_m, 1);             \
-  res0_m = res0_m + res1_m;                       \
-  sum_m = __msa_copy_s_w((v4i32)res0_m, 0);       \
-  sum_m;                                          \
-})
-
-/* Description : Horizontal addition of unsigned byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned odd byte element from 'in0' is added to
-                 even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is written to 'out0'
-*/
-#define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
-  out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
-  out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
-}
-#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
-
-#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
-  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
-  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
-}
-#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of unsigned byte vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each unsigned odd byte element from 'in0' is subtracted from
-                 even unsigned byte element from 'in0' (pairwise) and the
-                 halfword result is written to 'out0'
-*/
-#define HSUB_UB2(RTYPE, in0, in1, out0, out1) {          \
-  out0 = (RTYPE)__msa_hsub_u_h((v16u8)in0, (v16u8)in0);  \
-  out1 = (RTYPE)__msa_hsub_u_h((v16u8)in1, (v16u8)in1);  \
-}
-#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
-
-/* Description : Horizontal subtraction of signed halfword vector elements
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Each signed odd halfword element from 'in0' is subtracted from
-                 even signed halfword element from 'in0' (pairwise) and the
-                 word result is written to 'out0'
-*/
-#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {          \
-  out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \
-  out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \
-}
-#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
-
-/* Description : Set element n input vector to GPR value
-   Arguments   : Inputs - in0, in1, in2, in3
-                 Output - out
-                 Return Type - as per RTYPE
-   Details     : Set element 0 in vector 'out' to value specified in 'in0'
-*/
-#define INSERT_W2(RTYPE, in0, in1, out) {           \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
-}
-#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
-
-#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {  \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);   \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);   \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 2, in2);   \
-  out = (RTYPE)__msa_insert_w((v4i32)out, 3, in3);   \
-}
-#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
-#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
-
-#define INSERT_D2(RTYPE, in0, in1, out) {           \
-  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
-  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
-}
-#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
-#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even byte elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \
-  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);     \
-}
-#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
-#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave even halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \
-  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);     \
-}
-#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
-#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
-#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
-
-/* Description : Interleave even word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);     \
-  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);     \
-}
-#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
-
-/* Description : Interleave even double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even double word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'
-*/
-#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \
-  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);     \
-}
-#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
-
-/* Description : Interleave left half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'.
-*/
-#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \
-  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);     \
-}
-#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
-#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
-#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
-#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
-
-#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
-  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
-#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
-
-/* Description : Interleave left half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of halfword elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);     \
-  out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);     \
-}
-#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave left half of word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
-                 and written to 'out0'.
-*/
-#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);     \
-  out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);     \
-}
-#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
-#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
-
-/* Description : Interleave right half of byte elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
-                 and written to out0.
-*/
-#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
-  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);     \
-}
-#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
-#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
-#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
-#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
-
-#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
-  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
-#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
-#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
-#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
-
-#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
-                in8, in9, in10, in11, in12, in13, in14, in15,      \
-                out0, out1, out2, out3, out4, out5, out6, out7) {  \
-  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
-          out0, out1, out2, out3);                                 \
-  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
-          out4, out5, out6, out7);                                 \
-}
-#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
-
-/* Description : Interleave right half of halfword elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of halfword elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
-  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
-}
-#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
-
-#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
-  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
-
-#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);     \
-  out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);     \
-}
-#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
-#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
-
-#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
-  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave right half of double word elements from vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of double word elements of 'in0' and 'in1' are
-                 interleaved and written to 'out0'.
-*/
-#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
-  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
-  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
-}
-#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
-#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
-#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
-
-#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) {  \
-  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                         \
-  out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                 \
-}
-#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
-
-#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
-  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
-#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
-
-/* Description : Interleave both left and right half of input vectors
-   Arguments   : Inputs  - in0, in1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Right half of byte elements from 'in0' and 'in1' are
-                 interleaved and written to 'out0'
-*/
-#define ILVRL_B2(RTYPE, in0, in1, out0, out1) {        \
-  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);  \
-  out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
-}
-#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
-#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
-#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
-#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
-
-#define ILVRL_H2(RTYPE, in0, in1, out0, out1) {        \
-  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
-  out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
-}
-#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
-#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
-
-#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {        \
-  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
-  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
-}
-#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
-#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val + 1) bits
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val + 1) bit range.
-                 The results are written in place
-*/
-#define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
-  in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
-  in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
-}
-#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
-
-#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
-  SAT_UH2(RTYPE, in0, in1, sat_val);                   \
-  SAT_UH2(RTYPE, in2, in3, sat_val)                    \
-}
-#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
-
-/* Description : Saturate the halfword element values to the max
-                 unsigned value of (sat_val + 1) bits
-                 The element data width remains unchanged
-   Arguments   : Inputs  - in0, in1, sat_val
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned halfword element from 'in0' is saturated to the
-                 value generated with (sat_val + 1) bit range
-                 The results are written in place
-*/
-#define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
-  in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
-  in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
-}
-#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
-
-#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
-  SAT_SH2(RTYPE, in0, in1, sat_val);                   \
-  SAT_SH2(RTYPE, in2, in3, sat_val);                   \
-}
-#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Indexed halfword element values are replicated to all
-                 elements in output vector
-   Arguments   : Inputs  - in, idx0, idx1
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : 'idx0' element value from 'in' vector is replicated to all
-                  elements in 'out0' vector
-                  Valid index range for halfword operation is 0-7
-*/
-#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) {  \
-  out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);        \
-  out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);        \
-}
-#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
-
-#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
-                  out0, out1, out2, out3) {           \
-  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
-  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
-}
-#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
-#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even byte elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even byte elements of 'in0' are copied to the left half of
-                 'out0' & even byte elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \
-  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);     \
-}
-#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
-#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
-#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
-
-#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3) {                       \
-  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
-#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
-#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even halfword elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even halfword elements of 'in0' are copied to the left half of
-                 'out0' & even halfword elements of 'in1' are copied to the
-                 right half of 'out0'.
-*/
-#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);     \
-  out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);     \
-}
-#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
-#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
-
-#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3) {                       \
-  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
-
-/* Description : Pack even double word elements of vector pairs
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Even double elements of 'in0' are copied to the left half of
-                 'out0' & even double elements of 'in1' are copied to the right
-                 half of 'out0'.
-*/
-#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_pckev_d((v2i64)in0, (v2i64)in1);     \
-  out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);     \
-}
-#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
-#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
-
-#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3) {                       \
-  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
-
-/* Description : Each byte element is logically xor'ed with immediate 128
-   Arguments   : Inputs  - in0, in1
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each unsigned byte element from input vector 'in0' is
-                 logically xor'ed with 128 and the result is stored in-place.
-*/
-#define XORI_B2_128(RTYPE, in0, in1) {         \
-  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
-  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
-}
-#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
-#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
-
-#define XORI_B3_128(RTYPE, in0, in1, in2) {    \
-  XORI_B2_128(RTYPE, in0, in1);                \
-  in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);  \
-}
-#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
-
-#define XORI_B4_128(RTYPE, in0, in1, in2, in3) {  \
-  XORI_B2_128(RTYPE, in0, in1);                   \
-  XORI_B2_128(RTYPE, in2, in3);                   \
-}
-#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
-#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
-
-#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) {  \
-  XORI_B4_128(RTYPE, in0, in1, in2, in3);                        \
-  XORI_B3_128(RTYPE, in4, in5, in6);                             \
-}
-#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
-
-/* Description : Average of signed halfword elements -> (a + b) / 2
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3
-                 Return Type - as per RTYPE
-   Details     : Each signed halfword element from 'in0' is added to each
-                 signed halfword element of 'in1' with full precision resulting
-                 in one extra bit in the result. The result is then divided by
-                 2 and written to 'out0'
-*/
-#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                out0, out1, out2, out3) {                       \
-  out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);          \
-  out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);          \
-  out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);          \
-  out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);          \
-}
-#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Addition of signed halfword elements and signed saturation
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-                 Return Type - as per RTYPE
-   Details     : Signed halfword elements from 'in0' are added to signed
-                 halfword elements of 'in1'. The result is then signed saturated
-                 between halfword data type range
-*/
-#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
-  out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);    \
-  out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3);    \
-}
-#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
-
-#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
-                 out0, out1, out2, out3) {                       \
-  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
-  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
-}
-#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
-
-/* Description : Shift left all elements of vector (generic for all data types)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per input vector RTYPE
-   Details     : Each element of vector 'in0' is left shifted by 'shift' and
-                 the result is written in-place.
-*/
-#define SLLI_4V(in0, in1, in2, in3, shift) {  \
-  in0 = in0 << shift;                         \
-  in1 = in1 << shift;                         \
-  in2 = in2 << shift;                         \
-  in3 = in3 << shift;                         \
-}
-
-/* Description : Arithmetic shift right all elements of vector
-                 (generic for all data types)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - in place operation
-                 Return Type - as per input vector RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written in-place. 'shift' is a GP variable.
-*/
-#define SRA_4V(in0, in1, in2, in3, shift) {  \
-  in0 = in0 >> shift;                        \
-  in1 = in1 >> shift;                        \
-  in2 = in2 >> shift;                        \
-  in3 = in3 >> shift;                        \
-}
-
-/* Description : Shift right arithmetic rounded words
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetically by
-                 the number of bits in the corresponding element in the vector
-                 'shift'. The last discarded bit is added to shifted value for
-                 rounding and the result is written in-place.
-                 'shift' is a vector.
-*/
-#define SRAR_W2(RTYPE, in0, in1, shift) {               \
-  in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
-  in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
-}
-
-#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
-  SRAR_W2(RTYPE, in0, in1, shift)                    \
-  SRAR_W2(RTYPE, in2, in3, shift)                    \
-}
-#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
-
-/* Description : Shift right arithmetic rounded (immediate)
-   Arguments   : Inputs  - in0, in1, shift
-                 Outputs - in place operation
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is shifted right arithmetically by
-                 the value in 'shift'. The last discarded bit is added to the
-                 shifted value for rounding and the result is written in-place.
-                 'shift' is an immediate value.
-*/
-#define SRARI_H2(RTYPE, in0, in1, shift) {        \
-  in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
-  in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
-}
-#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
-#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
-
-#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
-  SRARI_H2(RTYPE, in0, in1, shift);                   \
-  SRARI_H2(RTYPE, in2, in3, shift);                   \
-}
-#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
-#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
-
-#define SRARI_W2(RTYPE, in0, in1, shift) {        \
-  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
-  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
-}
-#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
-
-#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
-  SRARI_W2(RTYPE, in0, in1, shift);                   \
-  SRARI_W2(RTYPE, in2, in3, shift);                   \
-}
-#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
-
-/* Description : Logical shift right all elements of vector (immediate)
-   Arguments   : Inputs  - in0, in1, in2, in3, shift
-                 Outputs - out0, out1, out2, out3
-                 Return Type - as per RTYPE
-   Details     : Each element of vector 'in0' is right shifted by 'shift' and
-                 the result is written in-place. 'shift' is an immediate value.
-*/
-#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
-  out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                             \
-  out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                             \
-  out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                             \
-  out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                             \
-}
-#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
-
-/* Description : Multiplication of pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element from 'in0' is multiplied with elements from 'in1'
-                 and the result is written to 'out0'
-*/
-#define MUL2(in0, in1, in2, in3, out0, out1) {  \
-  out0 = in0 * in1;                             \
-  out1 = in2 * in3;                             \
-}
-#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
-             out0, out1, out2, out3) {                \
-  MUL2(in0, in1, in2, in3, out0, out1);               \
-  MUL2(in4, in5, in6, in7, out2, out3);               \
-}
-
-/* Description : Addition of 2 pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element in 'in0' is added to 'in1' and result is written
-                 to 'out0'.
-*/
-#define ADD2(in0, in1, in2, in3, out0, out1) {  \
-  out0 = in0 + in1;                             \
-  out1 = in2 + in3;                             \
-}
-#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
-             out0, out1, out2, out3) {                \
-  ADD2(in0, in1, in2, in3, out0, out1);               \
-  ADD2(in4, in5, in6, in7, out2, out3);               \
-}
-
-/* Description : Subtraction of 2 pairs of vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1
-   Details     : Each element in 'in1' is subtracted from 'in0' and result is
-                 written to 'out0'.
-*/
-#define SUB2(in0, in1, in2, in3, out0, out1) {  \
-  out0 = in0 - in1;                             \
-  out1 = in2 - in3;                             \
-}
-#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
-             out0, out1, out2, out3) {                \
-  out0 = in0 - in1;                                   \
-  out1 = in2 - in3;                                   \
-  out2 = in4 - in5;                                   \
-  out3 = in6 - in7;                                   \
-}
-
-/* Description : Sign extend halfword elements from right half of the vector
-   Arguments   : Input  - in    (halfword vector)
-                 Output - out   (sign extended word vector)
-                 Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved with same vector 'in0' to generate
-                 4 word elements keeping sign intact
-*/
-#define UNPCK_R_SH_SW(in, out) {                 \
-  v8i16 sign_m;                                  \
-                                                 \
-  sign_m = __msa_clti_s_h((v8i16)in, 0);         \
-  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
-}
-
-/* Description : Zero extend unsigned byte elements to halfword elements
-   Arguments   : Input   - in          (unsigned byte vector)
-                 Outputs - out0, out1  (unsigned  halfword vectors)
-                 Return Type - signed halfword
-   Details     : Zero extended right half of vector is returned in 'out0'
-                 Zero extended left half of vector is returned in 'out1'
-*/
-#define UNPCK_UB_SH(in, out0, out1) {   \
-  v16i8 zero_m = { 0 };                 \
-                                        \
-  ILVRL_B2_SH(zero_m, in, out0, out1);  \
-}
-
-/* Description : Sign extend halfword elements from input vector and return
-                 the result in pair of vectors
-   Arguments   : Input   - in            (halfword vector)
-                 Outputs - out0, out1   (sign extended word vectors)
-                 Return Type - signed word
-   Details     : Sign bit of halfword elements from input vector 'in' is
-                 extracted and interleaved right with same vector 'in0' to
-                 generate 4 signed word elements in 'out0'
-                 Then interleaved left with same vector 'in0' to
-                 generate 4 signed word elements in 'out1'
-*/
-#define UNPCK_SH_SW(in, out0, out1) {    \
-  v8i16 tmp_m;                           \
-                                         \
-  tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
-  ILVRL_H2_SW(tmp_m, in, out0, out1);    \
-}
-
-/* Description : Butterfly of 4 input vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
-  out0 = in0 + in3;                                                \
-  out1 = in1 + in2;                                                \
-                                                                   \
-  out2 = in1 - in2;                                                \
-  out3 = in0 - in3;                                                \
-}
-
-/* Description : Butterfly of 8 input vectors
-   Arguments   : Inputs  - in0 ...  in7
-                 Outputs - out0 .. out7
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
-                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
-  out0 = in0 + in7;                                                    \
-  out1 = in1 + in6;                                                    \
-  out2 = in2 + in5;                                                    \
-  out3 = in3 + in4;                                                    \
-                                                                       \
-  out4 = in3 - in4;                                                    \
-  out5 = in2 - in5;                                                    \
-  out6 = in1 - in6;                                                    \
-  out7 = in0 - in7;                                                    \
-}
-
-/* Description : Butterfly of 16 input vectors
-   Arguments   : Inputs  - in0 ...  in15
-                 Outputs - out0 .. out15
-   Details     : Butterfly operation
-*/
-#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
-                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
-                     out0, out1, out2, out3, out4, out5, out6, out7,          \
-                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
-  out0 = in0 + in15;                                                          \
-  out1 = in1 + in14;                                                          \
-  out2 = in2 + in13;                                                          \
-  out3 = in3 + in12;                                                          \
-  out4 = in4 + in11;                                                          \
-  out5 = in5 + in10;                                                          \
-  out6 = in6 + in9;                                                           \
-  out7 = in7 + in8;                                                           \
-                                                                              \
-  out8 = in7 - in8;                                                           \
-  out9 = in6 - in9;                                                           \
-  out10 = in5 - in10;                                                         \
-  out11 = in4 - in11;                                                         \
-  out12 = in3 - in12;                                                         \
-  out13 = in2 - in13;                                                         \
-  out14 = in1 - in14;                                                         \
-  out15 = in0 - in15;                                                         \
-}
-
-/* Description : Transpose input 8x8 byte block
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
-                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
-  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
-  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
-                                                                           \
-  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                       \
-             tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
-  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                             \
-  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                             \
-  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                             \
-  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                             \
-  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
-  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
-}
-#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
-
-/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
-                           in8, in9, in10, in11, in12, in13, in14, in15
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - unsigned byte
-*/
-#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
-                            in8, in9, in10, in11, in12, in13, in14, in15,      \
-                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
-  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                        \
-  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                        \
-                                                                               \
-  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                                 \
-  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                               \
-  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                               \
-  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                               \
-                                                                               \
-  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                     \
-  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                     \
-  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                     \
-  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                     \
-  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                       \
-  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                     \
-  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                       \
-  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                     \
-                                                                               \
-  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                     \
-  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-                                                                               \
-  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                 \
-  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                     \
-  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-                                                                               \
-  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);                 \
-  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-                                                                               \
-  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
-  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
-  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
-  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
-  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
-}
-
-/* Description : Transpose 4x4 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
-  v8i16 s0_m, s1_m;                                                       \
-                                                                          \
-  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                             \
-  ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                    \
-  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                   \
-  out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \
-}
-
-/* Description : Transpose 4x8 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
-                           out0, out1, out2, out3, out4, out5, out6, out7) {  \
-  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                       \
-  v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                       \
-  v8i16 zero_m = { 0 };                                                       \
-                                                                              \
-  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
-             tmp0_n, tmp1_n, tmp2_n, tmp3_n);                                 \
-  ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                                \
-  ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                                \
-                                                                              \
-  out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
-  out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
-  out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
-  out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
-                                                                              \
-  out4 = zero_m;                                                              \
-  out5 = zero_m;                                                              \
-  out6 = zero_m;                                                              \
-  out7 = zero_m;                                                              \
-}
-
-/* Description : Transpose 8x4 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - signed halfword
-*/
-#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
-  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-                                                                          \
-  ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
-  ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                         \
-  ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);                 \
-  ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
-}
-
-/* Description : Transpose 8x8 block with half word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
-                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
-                 Return Type - as per RTYPE
-*/
-#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
-                       out0, out1, out2, out3, out4, out5, out6, out7) {  \
-  v8i16 s0_m, s1_m;                                                       \
-  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
-  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
-                                                                          \
-  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
-  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
-  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
-  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
-  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
-  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
-  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
-  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
-  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
-           tmp3_m, tmp7_m, out0, out2, out4, out6);                       \
-  out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
-  out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
-  out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
-  out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
-}
-#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
-
-/* Description : Transpose 4x4 block with word elements in vectors
-   Arguments   : Inputs  - in0, in1, in2, in3
-                 Outputs - out0, out1, out2, out3
-                 Return Type - signed word
-*/
-#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
-  v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
-                                                                          \
-  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                      \
-  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                      \
-                                                                          \
-  out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                   \
-  out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                   \
-  out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                   \
-  out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                   \
-}
-
-/* Description : Add block 4x4
-   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
-   Details     : Least significant 4 bytes from each input vector are added to
-                 the destination bytes, clipped between 0-255 and stored.
-*/
-#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
-  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
-  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
-  v16i8 dst0_m = { 0 };                                         \
-  v16i8 dst1_m = { 0 };                                         \
-  v16i8 zero_m = { 0 };                                         \
-                                                                \
-  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
-  LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
-  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
-  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
-  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
-  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
-  CLIP_SH2_0_255(res0_m, res1_m);                               \
-  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
-  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
-}
-
-/* Description : Pack even elements of input vectors & xor with 128
-   Arguments   : Inputs - in0, in1
-                 Output - out_m
-                 Return Type - unsigned byte
-   Details     : Signed byte even elements from 'in0' and 'in1' are packed
-                 together in one vector and the resulting vector is xor'ed with
-                 128 to shift the range from signed to unsigned byte
-*/
-#define PCKEV_XORI128_UB(in0, in1) ({                    \
-  v16u8 out_m;                                           \
-                                                         \
-  out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);  \
-  out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);        \
-  out_m;                                                 \
-})
-
-/* Description : Converts inputs to unsigned bytes, interleave, average & store
-                 as 8x4 unsigned byte block
-   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
-                          pdst, stride
-*/
-#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
-                                dst0, dst1, dst2, dst3, pdst, stride) {  \
-  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
-  uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
-                                                                         \
-  tmp0_m = PCKEV_XORI128_UB(in0, in1);                                   \
-  tmp1_m = PCKEV_XORI128_UB(in2, in3);                                   \
-  ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                    \
-  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);           \
-  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                              \
-}
-
-/* Description : Pack even byte elements and store byte vector in destination
-                 memory
-   Arguments   : Inputs - in0, in1, pdst
-*/
-#define PCKEV_ST_SB(in0, in1, pdst) {             \
-  v16i8 tmp_m;                                    \
-                                                  \
-  tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);  \
-  ST_SB(tmp_m, (pdst));                           \
-}
-
-/* Description : Horizontal 2 tap filter kernel code
-   Arguments   : Inputs - in0, in1, mask, coeff, shift
-*/
-#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({    \
-  v16i8 tmp0_m;                                                \
-  v8u16 tmp1_m;                                                \
-                                                               \
-  tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);  \
-  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);        \
-  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);         \
-  tmp1_m = __msa_sat_u_h(tmp1_m, shift);                       \
-                                                               \
-  tmp1_m;                                                      \
-})
-#endif  /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */
diff --git a/vp9/common/mips/msa/vp9_mfqe_msa.c b/vp9/common/mips/msa/vp9_mfqe_msa.c
index 64cb9a8..7257cd6 100644
--- a/vp9/common/mips/msa/vp9_mfqe_msa.c
+++ b/vp9/common/mips/msa/vp9_mfqe_msa.c
@@ -10,7 +10,7 @@
 
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 static void filter_by_weight8x8_msa(const uint8_t *src_ptr, int32_t src_stride,
                                     uint8_t *dst_ptr, int32_t dst_stride,
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index d293cf2..fd8fe3f 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -188,8 +188,11 @@
 #endif
 
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
+#if CONFIG_VP9_ENCODER
   DECLARE_ALIGNED(16, tran_low_t, dqcoeff[64 * 64]);
-
+#else
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
+#endif
   int lossless;
   int corrupted;
 
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index 2477e6e..76cdb05 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -132,10 +132,6 @@
          (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
 }
 
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
-  return mv_class_base(c) + offset;
-}
-
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
                              int incr, int usehp) {
   int s, z, c, o, d, e, f;
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 75e6861..637f451 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -106,8 +106,6 @@
 }
 
 MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset);
-int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset);
-
 
 typedef struct {
   unsigned int sign[2];
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 14654a5..4b2198f 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -96,15 +96,9 @@
 };
 
 
-static const InterpKernel* filter_kernels[4] = {
+const InterpKernel *vp9_filter_kernels[4] = {
   sub_pel_filters_8,
   sub_pel_filters_8lp,
   sub_pel_filters_8s,
   bilinear_filters
 };
-
-const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter) {
-  assert(filter != SWITCHABLE);
-  return filter_kernels[filter];
-}
-
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index 13d38af..40d6a0d 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -40,7 +40,7 @@
 
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
-const InterpKernel *vp9_get_interp_kernel(INTERP_FILTER filter);
+extern const InterpKernel *vp9_filter_kernels[4];
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index bacbd8a..8f1240a 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -13,7 +13,7 @@
 
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
@@ -348,7 +348,7 @@
           (const vp9_prob (*)[PARTITION_TYPES - 1])cm->fc->partition_prob;
 }
 
-static INLINE void init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
+static INLINE void vp9_init_macroblockd(VP9_COMMON *cm, MACROBLOCKD *xd) {
   int i;
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index be5a150..6d38fab 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -161,7 +161,7 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const MODE_INFO *mi = xd->mi[0];
   const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
   int ref;
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index af2307e..524e79f 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -802,88 +802,6 @@
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
-
-# variance
-add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance32x32 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x8 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
-add_proto qw/unsigned int vp9_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
-add_proto qw/unsigned int vp9_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
-#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
-
-add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-specialize qw/vp9_sub_pixel_avg_variance4x4 msa/, "$sse_x86inc", "$ssse3_x86inc";
-
 add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
 specialize qw/vp9_avg_8x8 sse2 neon msa/;
 
@@ -903,7 +821,7 @@
 specialize qw/vp9_satd sse2/;
 
 add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
-specialize qw/vp9_int_pro_row sse2/;
+specialize qw/vp9_int_pro_row sse2 neon/;
 
 add_proto qw/int16_t vp9_int_pro_col/, "uint8_t const *ref, const int width";
 specialize qw/vp9_int_pro_col sse2/;
@@ -922,9 +840,6 @@
 
 # ENCODEMB INVOKE
 
-add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vp9_subtract_block neon msa/, "$sse2_x86inc";
-
 #
 # Denoiser
 #
@@ -1088,249 +1003,11 @@
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_sub_pixel_variance4x4/;
-
-  add_proto qw/unsigned int vp9_highbd_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_sub_pixel_avg_variance4x4/;
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_10_sub_pixel_variance4x4/;
-
-  add_proto qw/unsigned int vp9_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_10_sub_pixel_avg_variance4x4/;
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x8/;
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vp9_highbd_12_sub_pixel_variance4x4/;
-
-  add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
-  specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
-
-
   # ENCODEMB INVOKE
 
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error sse2/;
 
-  add_proto qw/void vp9_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vp9_highbd_subtract_block/;
-
   add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vp9_highbd_quantize_fp/;
 
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index 94e6dfe..07af1bc 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -12,7 +12,7 @@
 #define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_
 #include "./vpx_config.h"
 #include "vp9/common/vp9_loopfilter.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 struct VP9Common;
 struct FRAME_COUNTS;
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 51e3e45..6c64540 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -18,7 +18,7 @@
 #include "vpx_ports/mem.h"
 #include "vpx_ports/mem_ops.h"
 #include "vpx_scale/vpx_scale.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_common.h"
@@ -188,7 +188,7 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   if (eob > 0) {
     TX_TYPE tx_type = DCT_DCT;
-    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    tran_low_t *const dqcoeff = pd->dqcoeff;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       if (xd->lossless) {
@@ -315,7 +315,10 @@
                           x, y, plane);
 
   if (!mi->mbmi.skip) {
-    const int eob = vp9_decode_block_tokens(xd, plane, block,
+    const scan_order *sc = (plane || xd->lossless) ?
+        &vp9_default_scan_orders[tx_size] :
+        &vp9_scan_orders[tx_size][intra_mode_to_tx_type_lookup[mode]];
+    const int eob = vp9_decode_block_tokens(xd, plane, sc,
                                             plane_bsize, x, y, tx_size,
                                             args->r, args->seg_id);
     inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride,
@@ -337,8 +340,9 @@
   MACROBLOCKD *const xd = args->xd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
   int x, y, eob;
+  const scan_order *sc = &vp9_default_scan_orders[tx_size];
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-  eob = vp9_decode_block_tokens(xd, plane, block, plane_bsize,
+  eob = vp9_decode_block_tokens(xd, plane, sc, plane_bsize,
                                 x, y, tx_size, args->r, args->seg_id);
   inverse_transform_block(xd, plane, block, tx_size,
                           &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
@@ -575,10 +579,10 @@
   // width/height is not a multiple of 8 pixels.
   if (is_scaled || scaled_mv.col || scaled_mv.row ||
       (frame_width & 0x7) || (frame_height & 0x7)) {
-    int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
+    int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
 
     // Get reference block bottom right horizontal coordinate.
-    int x1 = (x0_16 + (w - 1) * xs) >> SUBPEL_BITS;
+    int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
     int x_pad = 0, y_pad = 0;
 
     if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
@@ -650,7 +654,7 @@
   const int mi_x = mi_col * MI_SIZE;
   const int mi_y = mi_row * MI_SIZE;
   const MODE_INFO *mi = xd->mi[0];
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
   const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
   const int is_compound = has_second_ref(&mi->mbmi);
 
@@ -1324,7 +1328,7 @@
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      init_macroblockd(cm, &tile_data->xd);
+      vp9_init_macroblockd(cm, &tile_data->xd);
     }
   }
 
@@ -1544,7 +1548,7 @@
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
                           &tile_data->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      init_macroblockd(cm, &tile_data->xd);
+      vp9_init_macroblockd(cm, &tile_data->xd);
 
       worker->had_error = 0;
       if (i == num_workers - 1 || n == tile_cols - 1) {
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d587c8f..0d08f8c 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -242,6 +242,7 @@
   // Integer part
   if (class0) {
     d = vp9_read_tree(r, vp9_mv_class0_tree, mvcomp->class0);
+    mag = 0;
   } else {
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
@@ -249,6 +250,7 @@
     d = 0;
     for (i = 0; i < n; ++i)
       d |= vp9_read(r, mvcomp->bits[i]) << i;
+    mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
@@ -260,7 +262,7 @@
              : 1;
 
   // Result
-  mag = vp9_get_mv_mag(mv_class, (d << 3) | (fr << 1) | hp) + 1;
+  mag += ((d << 3) | (fr << 1) | hp) + 1;
   return sign ? -mag : mag;
 }
 
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index cbb3266..80114565 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -20,7 +20,7 @@
 #include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_scale/vpx_scale.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_loopfilter.h"
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 50c03c1..76a8746 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -15,7 +15,7 @@
 
 #include "vpx/vpx_codec.h"
 #include "vpx_scale/yv12config.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_thread_common.h"
 #include "vp9/common/vp9_onyxc_int.h"
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 3304e64..4e3ab82 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -17,7 +17,6 @@
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #include "vp9/common/vp9_idct.h"
 #endif
-#include "vp9/common/vp9_scan.h"
 
 #include "vp9/decoder/vp9_detokenize.h"
 
@@ -207,7 +206,7 @@
 }
 
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
-                            int plane, int block,
+                            int plane, const scan_order *sc,
                             BLOCK_SIZE plane_bsize, int x, int y,
                             TX_SIZE tx_size, vp9_reader *r,
                             int seg_id) {
@@ -215,10 +214,9 @@
   const int16_t *const dequant = pd->seg_dequant[seg_id];
   const int ctx = get_entropy_context(tx_size, pd->above_context + x,
                                                pd->left_context + y);
-  const scan_order *so = get_scan(xd, tx_size, pd->plane_type, block);
   const int eob = decode_coefs(xd, pd->plane_type,
-                               BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
-                               dequant, ctx, so->scan, so->neighbors, r);
+                               pd->dqcoeff, tx_size,
+                               dequant, ctx, sc->scan, sc->neighbors, r);
   vp9_set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
   return eob;
 }
diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index df17606..ed826a4 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h
@@ -14,13 +14,14 @@
 
 #include "vp9/decoder/vp9_decoder.h"
 #include "vp9/decoder/vp9_reader.h"
+#include "vp9/common/vp9_scan.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 int vp9_decode_block_tokens(MACROBLOCKD *xd,
-                            int plane, int block,
+                            int plane, const scan_order *sc,
                             BLOCK_SIZE plane_bsize, int x, int y,
                             TX_SIZE tx_size, vp9_reader *r,
                             int seg_id);
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h
index 8628801..f6cdccd 100644
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -12,7 +12,7 @@
 #define VP9_DECODER_VP9_DTHREAD_H_
 
 #include "./vpx_config.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 #include "vpx/internal/vpx_codec_internal.h"
 
 struct VP9Common;
diff --git a/vp9/decoder/vp9_reader.c b/vp9/decoder/vp9_reader.c
index 6bb4f9f..2c96f74 100644
--- a/vp9/decoder/vp9_reader.c
+++ b/vp9/decoder/vp9_reader.c
@@ -13,11 +13,6 @@
 
 #include "vp9/decoder/vp9_reader.h"
 
-// This is meant to be a large, positive constant that can still be efficiently
-// loaded as an immediate (on platforms like ARM, for example).
-// Even relatively modest values like 100 would work fine.
-#define LOTS_OF_BITS 0x40000000
-
 int vp9_reader_init(vp9_reader *r,
                     const uint8_t *buffer,
                     size_t size,
@@ -86,21 +81,3 @@
   }
   return r->buffer;
 }
-
-int vp9_reader_has_error(vp9_reader *r) {
-  // Check if we have reached the end of the buffer.
-  //
-  // Variable 'count' stores the number of bits in the 'value' buffer, minus
-  // 8. The top byte is part of the algorithm, and the remainder is buffered
-  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
-  // occupied, 8 for the algorithm and 8 in the buffer.
-  //
-  // When reading a byte from the user's buffer, count is filled with 8 and
-  // one byte is filled into the value buffer. When we reach the end of the
-  // data, count is additionally filled with LOTS_OF_BITS. So when
-  // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
-  //
-  // 1 if we have tried to decode bits after the end of stream was encountered.
-  // 0 No error.
-  return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
-}
diff --git a/vp9/decoder/vp9_reader.h b/vp9/decoder/vp9_reader.h
index a68a1d5..4959985 100644
--- a/vp9/decoder/vp9_reader.h
+++ b/vp9/decoder/vp9_reader.h
@@ -29,6 +29,11 @@
 
 #define BD_VALUE_SIZE ((int)sizeof(BD_VALUE) * CHAR_BIT)
 
+// This is meant to be a large, positive constant that can still be efficiently
+// loaded as an immediate (on platforms like ARM, for example).
+// Even relatively modest values like 100 would work fine.
+#define LOTS_OF_BITS 0x40000000
+
 typedef struct {
   // Be careful when reordering this struct, it may impact the cache negatively.
   BD_VALUE value;
@@ -49,10 +54,26 @@
 
 void vp9_reader_fill(vp9_reader *r);
 
-int vp9_reader_has_error(vp9_reader *r);
-
 const uint8_t *vp9_reader_find_end(vp9_reader *r);
 
+static INLINE int vp9_reader_has_error(vp9_reader *r) {
+  // Check if we have reached the end of the buffer.
+  //
+  // Variable 'count' stores the number of bits in the 'value' buffer, minus
+  // 8. The top byte is part of the algorithm, and the remainder is buffered
+  // to be shifted into it. So if count == 8, the top 16 bits of 'value' are
+  // occupied, 8 for the algorithm and 8 in the buffer.
+  //
+  // When reading a byte from the user's buffer, count is filled with 8 and
+  // one byte is filled into the value buffer. When we reach the end of the
+  // data, count is additionally filled with LOTS_OF_BITS. So when
+  // count == LOTS_OF_BITS - 1, the user's data has been exhausted.
+  //
+  // 1 if we have tried to decode bits after the end of stream was encountered.
+  // 0 No error.
+  return r->count > BD_VALUE_SIZE && r->count < LOTS_OF_BITS;
+}
+
 static INLINE int vp9_read(vp9_reader *r, int prob) {
   unsigned int bit = 0;
   BD_VALUE value;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index f505fcb..fecab57 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -47,3 +47,56 @@
 
   return (horizontal_add_u16x8(v_sum) + 32) >> 6;
 }
+
+void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+                          const int ref_stride, const int height) {
+  int i;
+  uint16x8_t vec_sum_lo = vdupq_n_u16(0);
+  uint16x8_t vec_sum_hi = vdupq_n_u16(0);
+  const int shift_factor = ((height >> 5) + 3) * -1;
+  const int16x8_t vec_shift = vdupq_n_s16(shift_factor);
+
+  for (i = 0; i < height; i += 8) {
+    const uint8x16_t vec_row1 = vld1q_u8(ref);
+    const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride);
+    const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2);
+    const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3);
+    const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4);
+    const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5);
+    const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6);
+    const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7);
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7));
+
+    vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8));
+    vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8));
+
+    ref += ref_stride * 8;
+  }
+
+  vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift);
+  vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift);
+
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo));
+  hbuf += 8;
+  vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi));
+}
diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vp9/encoder/mips/msa/vp9_avg_msa.c
index f2e8b27..611adb1 100644
--- a/vp9/encoder/mips/msa/vp9_avg_msa.c
+++ b/vp9/encoder/mips/msa/vp9_avg_msa.c
@@ -9,7 +9,7 @@
  */
 
 #include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) {
   uint32_t sum_out;
diff --git a/vp9/encoder/mips/msa/vp9_error_msa.c b/vp9/encoder/mips/msa/vp9_error_msa.c
index 9709092..1dc70bd 100644
--- a/vp9/encoder/mips/msa/vp9_error_msa.c
+++ b/vp9/encoder/mips/msa/vp9_error_msa.c
@@ -9,7 +9,7 @@
  */
 
 #include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 #define BLOCK_ERROR_BLOCKSIZE_MSA(BSize)                                   \
 static int64_t block_error_##BSize##size_msa(const int16_t *coeff_ptr,     \
diff --git a/vp9/encoder/mips/msa/vp9_fdct_msa.h b/vp9/encoder/mips/msa/vp9_fdct_msa.h
index ad66576..d111421 100644
--- a/vp9/encoder/mips/msa/vp9_fdct_msa.h
+++ b/vp9/encoder/mips/msa/vp9_fdct_msa.h
@@ -13,7 +13,7 @@
 
 #include "vpx_ports/mem.h"
 #include "vp9/common/vp9_idct.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 #define VP9_DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1) {  \
   v8i16 k0_m = __msa_fill_h(cnst0);                                  \
diff --git a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
index 4053bff..363aabb 100644
--- a/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
+++ b/vp9/encoder/mips/msa/vp9_temporal_filter_msa.c
@@ -9,7 +9,7 @@
  */
 
 #include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 static void temporal_filter_apply_8size_msa(uint8_t *frm1_ptr,
                                             uint32_t stride,
diff --git a/vp9/encoder/mips/msa/vp9_variance_msa.c b/vp9/encoder/mips/msa/vp9_variance_msa.c
deleted file mode 100644
index 33fb496..0000000
--- a/vp9/encoder/mips/msa/vp9_variance_msa.c
+++ /dev/null
@@ -1,2011 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_filter.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
-
-DECLARE_ALIGNED(256, static const int8_t, vp9_bilinear_filters_msa[15][2]) = {
-  { 120,   8 },
-  { 112,  16 },
-  { 104,  24 },
-  {  96,  32 },
-  {  88,  40 },
-  {  80,  48 },
-  {  72,  56 },
-  {  64,  64 },
-  {  56,  72 },
-  {  48,  80 },
-  {  40,  88 },
-  {  32,  96 },
-  {  24, 104 },
-  {  16, 112 },
-  {   8, 120 }
-};
-
-#define CALC_MSE_AVG_B(src, ref, var, sub) {                       \
-  v16u8 src_l0_m, src_l1_m;                                        \
-  v8i16 res_l0_m, res_l1_m;                                        \
-                                                                   \
-  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m);                       \
-  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m);             \
-  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var);  \
-                                                                   \
-  sub += res_l0_m + res_l1_m;                                      \
-}
-
-#define VARIANCE_WxH(sse, diff, shift) \
-  sse - (((uint32_t)diff * diff) >> shift)
-
-#define VARIANCE_LARGE_WxH(sse, diff, shift) \
-  sse - (((int64_t)diff * diff) >> shift)
-
-static uint32_t avg_sse_diff_4width_msa(const uint8_t *src_ptr,
-                                        int32_t src_stride,
-                                        const uint8_t *ref_ptr,
-                                        int32_t ref_stride,
-                                        const uint8_t *sec_pred,
-                                        int32_t height,
-                                        int32_t *diff) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 pred, src = { 0 };
-  v16u8 ref = { 0 };
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LW4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_8width_msa(const uint8_t *src_ptr,
-                                        int32_t src_stride,
-                                        const uint8_t *ref_ptr,
-                                        int32_t ref_stride,
-                                        const uint8_t *sec_pred,
-                                        int32_t height,
-                                        int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    PCKEV_D4_UB(src1, src0, src3, src2, ref1, ref0, ref3, ref2,
-                src0, src1, ref0, ref1);
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_16width_msa(const uint8_t *src_ptr,
-                                         int32_t src_stride,
-                                         const uint8_t *ref_ptr,
-                                         int32_t ref_stride,
-                                         const uint8_t *sec_pred,
-                                         int32_t height,
-                                         int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src, ref, pred;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    ref = LD_UB(ref_ptr);
-    ref_ptr += ref_stride;
-    src = __msa_aver_u_b(src, pred);
-    CALC_MSE_AVG_B(src, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32width_msa(const uint8_t *src_ptr,
-                                         int32_t src_stride,
-                                         const uint8_t *ref_ptr,
-                                         int32_t ref_stride,
-                                         const uint8_t *sec_pred,
-                                         int32_t height,
-                                         int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1, pred0, pred1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_32x64_msa(const uint8_t *src_ptr,
-                                       int32_t src_stride,
-                                       const uint8_t *ref_ptr,
-                                       int32_t ref_stride,
-                                       const uint8_t *sec_pred,
-                                       int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, ref0, ref1, pred0, pred1;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 16; ht_cnt--;) {
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB2(src_ptr, 16, src0, src1);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x32_msa(const uint8_t *src_ptr,
-                                       int32_t src_stride,
-                                       const uint8_t *ref_ptr,
-                                       int32_t ref_stride,
-                                       const uint8_t *sec_pred,
-                                       int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 16; ht_cnt--;) {
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
-                src0, src1, src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src2, ref2, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src3, ref3, var, avg1);
-
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
-                src0, src1, src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src2, ref2, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src3, ref3, var, avg1);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t avg_sse_diff_64x64_msa(const uint8_t *src_ptr,
-                                       int32_t src_stride,
-                                       const uint8_t *ref_ptr,
-                                       int32_t ref_stride,
-                                       const uint8_t *sec_pred,
-                                       int32_t *diff) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v8i16 avg0 = { 0 };
-  v8i16 avg1 = { 0 };
-  v8i16 avg2 = { 0 };
-  v8i16 avg3 = { 0 };
-  v4i32 vec, var = { 0 };
-
-  for (ht_cnt = 32; ht_cnt--;) {
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
-                src0, src1, src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-
-    LD_UB4(sec_pred, 16, pred0, pred1, pred2, pred3);
-    sec_pred += 64;
-    LD_UB4(src_ptr, 16, src0, src1, src2, src3);
-    src_ptr += src_stride;
-    LD_UB4(ref_ptr, 16, ref0, ref1, ref2, ref3);
-    ref_ptr += ref_stride;
-    AVER_UB4_UB(src0, pred0, src1, pred1, src2, pred2, src3, pred3,
-                src0, src1, src2, src3);
-    CALC_MSE_AVG_B(src0, ref0, var, avg0);
-    CALC_MSE_AVG_B(src1, ref1, var, avg1);
-    CALC_MSE_AVG_B(src2, ref2, var, avg2);
-    CALC_MSE_AVG_B(src3, ref3, var, avg3);
-  }
-
-  vec = __msa_hadd_s_w(avg0, avg0);
-  vec += __msa_hadd_s_w(avg1, avg1);
-  vec += __msa_hadd_s_w(avg2, avg2);
-  vec += __msa_hadd_s_w(avg3, avg3);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
-                                                int32_t src_stride,
-                                                const uint8_t *dst,
-                                                int32_t dst_stride,
-                                                const int8_t *filter,
-                                                int32_t height,
-                                                int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 filt0, ref = { 0 };
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 const255;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  const255 = (v8u16)__msa_ldi_h(255);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                vec0, vec1, vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
-                src0, src1, src2, src3);
-    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
-    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
-    CALC_MSE_AVG_B(src0, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
-                                                int32_t src_stride,
-                                                const uint8_t *dst,
-                                                int32_t dst_stride,
-                                                const int8_t *filter,
-                                                int32_t height,
-                                                int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 filt0, out, ref0, ref1, ref2, ref3;
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3, const255;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  const255 = (v8u16)__msa_ldi_h(255);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                vec0, vec1, vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
-                src0, src1, src2, src3);
-    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-    CALC_MSE_AVG_B(out, ref0, var, avg);
-    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
-    CALC_MSE_AVG_B(out, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v16u8 dst0, dst1, dst2, dst3, filt0;
-  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8u16 const255;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  const255 = (v8u16)__msa_ldi_h(255);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    dst += (4 * dst_stride);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                out0, out1, out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
-                out4, out5, out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
-    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
-                src0, src1, src2, src3);
-    CALC_MSE_AVG_B(src0, dst0, var, avg);
-    CALC_MSE_AVG_B(src1, dst1, var, avg);
-    CALC_MSE_AVG_B(src2, dst2, var, avg);
-    CALC_MSE_AVG_B(src3, dst3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
-                                                int32_t src_stride,
-                                                const uint8_t *dst,
-                                                int32_t dst_stride,
-                                                const int8_t *filter,
-                                                int32_t height,
-                                                int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4, out;
-  v16u8 src10_r, src32_r, src21_r, src43_r;
-  v16u8 ref = { 0 };
-  v16u8 src2110, src4332;
-  v16u8 filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-  v8u16 tmp0, tmp1;
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
-               src10_r, src21_r, src32_r, src43_r);
-    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
-                                                int32_t src_stride,
-                                                const uint8_t *dst,
-                                                int32_t dst_stride,
-                                                const int8_t *filter,
-                                                int32_t height,
-                                                int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
-               vec0, vec1, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                tmp0, tmp1, tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out0, out1, out2, out3;
-  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
-    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
-    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    src0 = src4;
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-    CALC_MSE_AVG_B(out2, ref2, var, avg);
-    CALC_MSE_AVG_B(out3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
-                                            filter, height, &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter_horiz,
-                                                 const int8_t *filter_vert,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out, ref = { 0 };
-  v16u8 filt_vt, filt_hz, vec0, vec1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
-  v8u16 tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
-                                                 int32_t src_stride,
-                                                 const uint8_t *dst,
-                                                 int32_t dst_stride,
-                                                 const int8_t *filter_horiz,
-                                                 const int8_t *filter_vert,
-                                                 int32_t height,
-                                                 int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out0, out1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v16u8 filt_vt, filt_hz, vec0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
-                                                  int32_t src_stride,
-                                                  const uint8_t *dst,
-                                                  int32_t dst_stride,
-                                                  const int8_t *filter_horiz,
-                                                  const int8_t *filter_vert,
-                                                  int32_t height,
-                                                  int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
-  v8u16 tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  LD_UB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-    CALC_MSE_AVG_B(src2, ref2, var, avg);
-    CALC_MSE_AVG_B(src3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
-                                                  int32_t src_stride,
-                                                  const uint8_t *dst,
-                                                  int32_t dst_stride,
-                                                  const int8_t *filter_horiz,
-                                                  const int8_t *filter_vert,
-                                                  int32_t height,
-                                                  int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
-                                             filter_horiz, filter_vert, height,
-                                             &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
-                                                  int32_t src_stride,
-                                                  const uint8_t *dst,
-                                                  int32_t dst_stride,
-                                                  const int8_t *filter_horiz,
-                                                  const int8_t *filter_vert,
-                                                  int32_t height,
-                                                  int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
-                                             filter_horiz, filter_vert, height,
-                                             &diff0[loop_cnt]);
-    src += 16;
-    dst += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_h_msa(const uint8_t *src,
-                                                    int32_t src_stride,
-                                                    const uint8_t *dst,
-                                                    int32_t dst_stride,
-                                                    const uint8_t *sec_pred,
-                                                    const int8_t *filter,
-                                                    int32_t height,
-                                                    int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 out, pred, filt0, ref = { 0 };
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 const255;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  const255 = (v8u16)__msa_ldi_h(255);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                vec0, vec1, vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
-                src0, src1, src2, src3);
-    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
-    out = (v16u8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_h_msa(const uint8_t *src,
-                                                    int32_t src_stride,
-                                                    const uint8_t *dst,
-                                                    int32_t dst_stride,
-                                                    const uint8_t *sec_pred,
-                                                    const int8_t *filter,
-                                                    int32_t height,
-                                                    int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 out, pred, filt0;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16i8 src0, src1, src2, src3;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 const255;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  const255 = (v8u16)__msa_ldi_h(255);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                vec0, vec1, vec2, vec3);
-    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);
-    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
-    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
-                src0, src1, src2, src3);
-    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
-
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref0, var, avg);
-    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_h_msa(const uint8_t *src,
-                                             int32_t src_stride,
-                                             const uint8_t *dst,
-                                             int32_t dst_stride,
-                                             const uint8_t *sec_pred,
-                                             const int8_t *filter,
-                                             int32_t height,
-                                             int32_t *diff,
-                                             int32_t width) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v16u8 dst0, dst1, dst2, dst3;
-  v16u8 tmp0, tmp1, tmp2, tmp3;
-  v16u8 pred0, pred1, pred2, pred3, filt0;
-  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
-  v8u16 const255;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  const255 = (v8u16)__msa_ldi_h(255);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_SB4(src, src_stride, src0, src2, src4, src6);
-    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
-    dst += (4 * dst_stride);
-    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * width);
-
-    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
-    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
-    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
-    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                out0, out1, out2, out3);
-    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
-                out4, out5, out6, out7);
-    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);
-    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
-    MIN_UH4_UH(out0, out1, out2, out3, const255);
-    MIN_UH4_UH(out4, out5, out6, out7, const255);
-    PCKEV_B4_UB(out1, out0, out3, out2, out5, out4, out7, out6,
-                tmp0, tmp1, tmp2, tmp3);
-    AVER_UB4_UB(tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3, pred3,
-                tmp0, tmp1, tmp2, tmp3);
-
-    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
-    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
-    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
-    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_h_msa(const uint8_t *src,
-                                                     int32_t src_stride,
-                                                     const uint8_t *dst,
-                                                     int32_t dst_stride,
-                                                     const uint8_t *sec_pred,
-                                                     const int8_t *filter,
-                                                     int32_t height,
-                                                     int32_t *diff) {
-  return subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
-                                      sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_h_msa(const uint8_t *src,
-                                                     int32_t src_stride,
-                                                     const uint8_t *dst,
-                                                     int32_t dst_stride,
-                                                     const uint8_t *sec_pred,
-                                                     const int8_t *filter,
-                                                     int32_t height,
-                                                     int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
-                                        sec_pred, filter, height,
-                                        &diff0[loop_cnt], 32);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_h_msa(const uint8_t *src,
-                                                     int32_t src_stride,
-                                                     const uint8_t *dst,
-                                                     int32_t dst_stride,
-                                                     const uint8_t *sec_pred,
-                                                     const int8_t *filter,
-                                                     int32_t height,
-                                                     int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_h_msa(src, src_stride, dst, dst_stride,
-                                        sec_pred, filter, height,
-                                        &diff0[loop_cnt], 64);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_v_msa(const uint8_t *src,
-                                                    int32_t src_stride,
-                                                    const uint8_t *dst,
-                                                    int32_t dst_stride,
-                                                    const uint8_t *sec_pred,
-                                                    const int8_t *filter,
-                                                    int32_t height,
-                                                    int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 src10_r, src32_r, src21_r, src43_r;
-  v16u8 out, pred, ref = { 0 };
-  v16u8 src2110, src4332, filt0;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-  v8u16 tmp0, tmp1;
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
-               src10_r, src21_r, src32_r, src43_r);
-    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
-    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_v_msa(const uint8_t *src,
-                                                    int32_t src_stride,
-                                                    const uint8_t *dst,
-                                                    int32_t dst_stride,
-                                                    const uint8_t *sec_pred,
-                                                    const int8_t *filter,
-                                                    int32_t height,
-                                                    int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, filt0;
-  v8u16 vec0, vec1, vec2, vec3;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
-               vec0, vec1, vec2, vec3);
-    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
-                tmp0, tmp1, tmp2, tmp3);
-    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);
-    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
-    AVER_UB2_UB(src0, pred0, src1, pred1, src0, src1);
-    CALC_MSE_AVG_B(src0, ref0, var, avg);
-    CALC_MSE_AVG_B(src1, ref1, var, avg);
-
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_v_msa(const uint8_t *src,
-                                             int32_t src_stride,
-                                             const uint8_t *dst,
-                                             int32_t dst_stride,
-                                             const uint8_t *sec_pred,
-                                             const int8_t *filter,
-                                             int32_t height,
-                                             int32_t *diff,
-                                             int32_t width) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 out0, out1, out2, out3, filt0;
-  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
-  v8u16 tmp0, tmp1, tmp2, tmp3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter);
-  filt0 = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * width);
-
-    ILVR_B2_UH(src1, src0, src2, src1, vec0, vec2);
-    ILVL_B2_UH(src1, src0, src2, src1, vec1, vec3);
-    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    ILVR_B2_UH(src3, src2, src4, src3, vec4, vec6);
-    ILVL_B2_UH(src3, src2, src4, src3, vec5, vec7);
-    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
-    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
-    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
-
-    src0 = src4;
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
-                out0, out1, out2, out3);
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-    CALC_MSE_AVG_B(out2, ref2, var, avg);
-    CALC_MSE_AVG_B(out3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_v_msa(const uint8_t *src,
-                                                     int32_t src_stride,
-                                                     const uint8_t *dst,
-                                                     int32_t dst_stride,
-                                                     const uint8_t *sec_pred,
-                                                     const int8_t *filter,
-                                                     int32_t height,
-                                                     int32_t *diff) {
-  return subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
-                                      sec_pred, filter, height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_v_msa(const uint8_t *src,
-                                                     int32_t src_stride,
-                                                     const uint8_t *dst,
-                                                     int32_t dst_stride,
-                                                     const uint8_t *sec_pred,
-                                                     const int8_t *filter,
-                                                     int32_t height,
-                                                     int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
-                                        sec_pred, filter, height,
-                                        &diff0[loop_cnt], 32);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_v_msa(const uint8_t *src,
-                                                     int32_t src_stride,
-                                                     const uint8_t *dst,
-                                                     int32_t dst_stride,
-                                                     const uint8_t *sec_pred,
-                                                     const int8_t *filter,
-                                                     int32_t height,
-                                                     int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_v_msa(src, src_stride, dst, dst_stride,
-                                        sec_pred, filter, height,
-                                        &diff0[loop_cnt], 64);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_4width_hv_msa(
-  const uint8_t *src, int32_t src_stride,
-  const uint8_t *dst, int32_t dst_stride,
-  const uint8_t *sec_pred,
-  const int8_t *filter_horiz, const int8_t *filter_vert,
-  int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  uint32_t ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 out, pred, ref = { 0 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    pred = LD_UB(sec_pred);
-    sec_pred += 16;
-    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
-    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
-    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-    out = __msa_aver_u_b(out, pred);
-    CALC_MSE_AVG_B(out, ref, var, avg);
-    src0 = src4;
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_8width_hv_msa(
-  const uint8_t *src, int32_t src_stride,
-  const uint8_t *dst, int32_t dst_stride,
-  const uint8_t *sec_pred,
-  const int8_t *filter_horiz, const int8_t *filter_vert,
-  int32_t height, int32_t *diff) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 src0, src1, src2, src3, src4;
-  v16u8 pred0, pred1, out0, out1;
-  v16u8 filt_hz, filt_vt, vec0;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  src0 = LD_UB(src);
-  src += src_stride;
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src1, src2, src3, src4);
-    src += (4 * src_stride);
-    LD_UB2(sec_pred, 16, pred0, pred1);
-    sec_pred += 32;
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
-    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
-    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-
-    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
-    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
-
-    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
-    SAT_UH2_UH(tmp2, tmp3, 7);
-    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
-    AVER_UB2_UB(out0, pred0, out1, pred1, out0, out1);
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t subpel_avg_ssediff_16w_hv_msa(const uint8_t *src,
-                                              int32_t src_stride,
-                                              const uint8_t *dst,
-                                              int32_t dst_stride,
-                                              const uint8_t *sec_pred,
-                                              const int8_t *filter_horiz,
-                                              const int8_t *filter_vert,
-                                              int32_t height,
-                                              int32_t *diff,
-                                              int32_t width) {
-  int16_t filtval;
-  uint32_t loop_cnt;
-  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
-  v16u8 ref0, ref1, ref2, ref3;
-  v16u8 pred0, pred1, pred2, pred3;
-  v16u8 out0, out1, out2, out3;
-  v16u8 filt_hz, filt_vt, vec0, vec1;
-  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
-  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
-  v8i16 avg = { 0 };
-  v4i32 vec, var = { 0 };
-
-  filtval = LH(filter_horiz);
-  filt_hz = (v16u8)__msa_fill_h(filtval);
-  filtval = LH(filter_vert);
-  filt_vt = (v16u8)__msa_fill_h(filtval);
-
-  LD_UB2(src, 8, src0, src1);
-  src += src_stride;
-
-  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-
-  for (loop_cnt = (height >> 2); loop_cnt--;) {
-    LD_UB4(src, src_stride, src0, src2, src4, src6);
-    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
-    src += (4 * src_stride);
-    LD_UB4(sec_pred, width, pred0, pred1, pred2, pred3);
-    sec_pred += (4 * width);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
-    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
-    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
-    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
-    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
-    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
-    SAT_UH2_UH(tmp0, tmp1, 7);
-    out3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
-
-    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
-    dst += (4 * dst_stride);
-
-    AVER_UB4_UB(out0, pred0, out1, pred1, out2, pred2, out3, pred3,
-                out0, out1, out2, out3);
-
-    CALC_MSE_AVG_B(out0, ref0, var, avg);
-    CALC_MSE_AVG_B(out1, ref1, var, avg);
-    CALC_MSE_AVG_B(out2, ref2, var, avg);
-    CALC_MSE_AVG_B(out3, ref3, var, avg);
-  }
-
-  vec = __msa_hadd_s_w(avg, avg);
-  *diff = HADD_SW_S32(vec);
-
-  return HADD_SW_S32(var);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_16width_hv_msa(
-  const uint8_t *src, int32_t src_stride,
-  const uint8_t *dst, int32_t dst_stride,
-  const uint8_t *sec_pred,
-  const int8_t *filter_horiz, const int8_t *filter_vert,
-  int32_t height, int32_t *diff) {
-  return subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
-                                       sec_pred, filter_horiz, filter_vert,
-                                       height, diff, 16);
-}
-
-static uint32_t sub_pixel_avg_sse_diff_32width_hv_msa(
-  const uint8_t *src, int32_t src_stride,
-  const uint8_t *dst, int32_t dst_stride,
-  const uint8_t *sec_pred,
-  const int8_t *filter_horiz, const int8_t *filter_vert,
-  int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[2];
-
-  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
-                                         sec_pred, filter_horiz, filter_vert,
-                                         height, &diff0[loop_cnt], 32);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1];
-
-  return sse;
-}
-
-static uint32_t sub_pixel_avg_sse_diff_64width_hv_msa(
-  const uint8_t *src, int32_t src_stride,
-  const uint8_t *dst, int32_t dst_stride,
-  const uint8_t *sec_pred,
-  const int8_t *filter_horiz, const int8_t *filter_vert,
-  int32_t height, int32_t *diff) {
-  uint32_t loop_cnt, sse = 0;
-  int32_t diff0[4];
-
-  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
-    sse += subpel_avg_ssediff_16w_hv_msa(src, src_stride, dst, dst_stride,
-                                         sec_pred, filter_horiz, filter_vert,
-                                         height, &diff0[loop_cnt], 64);
-    src += 16;
-    dst += 16;
-    sec_pred += 16;
-  }
-
-  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
-
-  return sse;
-}
-
-#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
-#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
-#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
-#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
-#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);
-
-#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
-#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
-#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
-#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
-
-#define VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
-uint32_t vp9_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
-                                                 int32_t src_stride,     \
-                                                 int32_t xoffset,        \
-                                                 int32_t yoffset,        \
-                                                 const uint8_t *ref,     \
-                                                 int32_t ref_stride,     \
-                                                 uint32_t *sse) {        \
-  int32_t diff;                                                          \
-  uint32_t var;                                                          \
-  const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1];        \
-  const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1];        \
-                                                                         \
-  if (yoffset) {                                                         \
-    if (xoffset) {                                                       \
-      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
-                                                   ref, ref_stride,      \
-                                                   h_filter, v_filter,   \
-                                                   ht, &diff);           \
-    } else {                                                             \
-      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
-                                                  ref, ref_stride,       \
-                                                  v_filter, ht, &diff);  \
-    }                                                                    \
-                                                                         \
-    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
-  } else {                                                               \
-    if (xoffset) {                                                       \
-      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
-                                                  ref, ref_stride,       \
-                                                  h_filter, ht, &diff);  \
-                                                                         \
-      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
-    } else {                                                             \
-      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
-                                          ref, ref_stride, sse);         \
-    }                                                                    \
-  }                                                                      \
-                                                                         \
-  return var;                                                            \
-}
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
-
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
-VP9_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
-
-#define VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(wd, ht)                          \
-uint32_t vp9_sub_pixel_avg_variance##wd##x##ht##_msa(                         \
-  const uint8_t *src_ptr, int32_t src_stride,                                 \
-  int32_t xoffset, int32_t yoffset,                                           \
-  const uint8_t *ref_ptr, int32_t ref_stride,                                 \
-  uint32_t *sse, const uint8_t *sec_pred) {                                   \
-  int32_t diff;                                                               \
-  const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1];             \
-  const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1];             \
-                                                                              \
-  if (yoffset) {                                                              \
-    if (xoffset) {                                                            \
-      *sse = sub_pixel_avg_sse_diff_##wd##width_hv_msa(src_ptr, src_stride,   \
-                                                       ref_ptr, ref_stride,   \
-                                                       sec_pred, h_filter,    \
-                                                       v_filter, ht, &diff);  \
-    } else {                                                                  \
-      *sse = sub_pixel_avg_sse_diff_##wd##width_v_msa(src_ptr, src_stride,    \
-                                                      ref_ptr, ref_stride,    \
-                                                      sec_pred, v_filter,     \
-                                                      ht, &diff);             \
-    }                                                                         \
-  } else {                                                                    \
-    if (xoffset) {                                                            \
-      *sse = sub_pixel_avg_sse_diff_##wd##width_h_msa(src_ptr, src_stride,    \
-                                                      ref_ptr, ref_stride,    \
-                                                      sec_pred, h_filter,     \
-                                                      ht, &diff);             \
-    } else {                                                                  \
-      *sse = avg_sse_diff_##wd##width_msa(src_ptr, src_stride,                \
-                                          ref_ptr, ref_stride,                \
-                                          sec_pred, ht, &diff);               \
-    }                                                                         \
-  }                                                                           \
-                                                                              \
-  return VARIANCE_##wd##Wx##ht##H(*sse, diff);                                \
-}
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 4);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(4, 8);
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 4);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 8);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(8, 16);
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 8);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 16);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(16, 32);
-
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 16);
-VP9_SUB_PIXEL_AVG_VARIANCE_WDXHT_MSA(32, 32);
-
-uint32_t vp9_sub_pixel_avg_variance32x64_msa(const uint8_t *src_ptr,
-                                             int32_t src_stride,
-                                             int32_t xoffset,
-                                             int32_t yoffset,
-                                             const uint8_t *ref_ptr,
-                                             int32_t ref_stride,
-                                             uint32_t *sse,
-                                             const uint8_t *sec_pred) {
-  int32_t diff;
-  const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1];
-  const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1];
-
-  if (yoffset) {
-    if (xoffset) {
-      *sse = sub_pixel_avg_sse_diff_32width_hv_msa(src_ptr, src_stride,
-                                                   ref_ptr, ref_stride,
-                                                   sec_pred, h_filter,
-                                                   v_filter, 64, &diff);
-    } else {
-      *sse = sub_pixel_avg_sse_diff_32width_v_msa(src_ptr, src_stride,
-                                                  ref_ptr, ref_stride,
-                                                  sec_pred, v_filter,
-                                                  64, &diff);
-    }
-  } else {
-    if (xoffset) {
-      *sse = sub_pixel_avg_sse_diff_32width_h_msa(src_ptr, src_stride,
-                                                  ref_ptr, ref_stride,
-                                                  sec_pred, h_filter,
-                                                  64, &diff);
-    } else {
-      *sse = avg_sse_diff_32x64_msa(src_ptr, src_stride, ref_ptr, ref_stride,
-                                    sec_pred, &diff);
-    }
-  }
-
-  return VARIANCE_32Wx64H(*sse, diff);
-}
-
-#define VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(ht)                          \
-uint32_t vp9_sub_pixel_avg_variance64x##ht##_msa(const uint8_t *src_ptr,     \
-                                                 int32_t src_stride,         \
-                                                 int32_t xoffset,            \
-                                                 int32_t yoffset,            \
-                                                 const uint8_t *ref_ptr,     \
-                                                 int32_t ref_stride,         \
-                                                 uint32_t *sse,              \
-                                                 const uint8_t *sec_pred) {  \
-  int32_t diff;                                                              \
-  const int8_t *h_filter = vp9_bilinear_filters_msa[xoffset - 1];            \
-  const int8_t *v_filter = vp9_bilinear_filters_msa[yoffset - 1];            \
-                                                                             \
-  if (yoffset) {                                                             \
-    if (xoffset) {                                                           \
-      *sse = sub_pixel_avg_sse_diff_64width_hv_msa(src_ptr, src_stride,      \
-                                                   ref_ptr, ref_stride,      \
-                                                   sec_pred, h_filter,       \
-                                                   v_filter, ht, &diff);     \
-    } else {                                                                 \
-      *sse = sub_pixel_avg_sse_diff_64width_v_msa(src_ptr, src_stride,       \
-                                                  ref_ptr, ref_stride,       \
-                                                  sec_pred, v_filter,        \
-                                                  ht, &diff);                \
-    }                                                                        \
-  } else {                                                                   \
-    if (xoffset) {                                                           \
-      *sse = sub_pixel_avg_sse_diff_64width_h_msa(src_ptr, src_stride,       \
-                                                  ref_ptr, ref_stride,       \
-                                                  sec_pred, h_filter,        \
-                                                  ht, &diff);                \
-    } else {                                                                 \
-      *sse = avg_sse_diff_64x##ht##_msa(src_ptr, src_stride,                 \
-                                        ref_ptr, ref_stride,                 \
-                                        sec_pred, &diff);                    \
-    }                                                                        \
-  }                                                                          \
-                                                                             \
-  return VARIANCE_64Wx##ht##H(*sse, diff);                                   \
-}
-
-VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(32);
-VP9_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_MSA(64);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index e94d43b..02d986e 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -2123,6 +2123,39 @@
   BLOCK_64X64
 };
 
+// Checks to see if a macroblock is at the edge of the active image.
+// In most cases this is the "real" edge unless there are formatting
+// bars embedded in the stream.
+static int active_edge_sb(VP9_COMP *cpi,
+                          int mi_row, int mi_col) {
+  int is_active_edge = 0;
+  int top_edge = 0;
+  int bottom_edge = cpi->common.mi_rows;
+  int left_edge = 0;
+  int right_edge = cpi->common.mi_cols;
+
+  // For two-pass encodes, account for any formatting bars detected.
+  if (cpi->oxcf.pass == 2) {
+    TWO_PASS *twopass = &cpi->twopass;
+
+    // The inactive region is specified in MBs not mi units.
+    // The image edge is in the following MB row.
+    top_edge += (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+
+    bottom_edge -= (int)(twopass->this_frame_stats.inactive_zone_rows * 2);
+    bottom_edge = MAX(top_edge, bottom_edge);
+  }
+
+  if (((top_edge >= mi_row) && (top_edge < (mi_row + MI_BLOCK_SIZE))) ||
+      ((bottom_edge >= mi_row) && (bottom_edge < (mi_row + MI_BLOCK_SIZE))) ||
+      ((left_edge >= mi_col) && (left_edge < (mi_col + MI_BLOCK_SIZE))) ||
+      ((right_edge >= mi_col) && (right_edge < (mi_col + MI_BLOCK_SIZE)))) {
+    is_active_edge = 1;
+  }
+
+  return is_active_edge;
+}
+
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
 // At the moment this is designed to work on a 64x64 SB but could be
@@ -2217,7 +2250,15 @@
   max_size = find_partition_size(max_size,
                                  row8x8_remaining, col8x8_remaining,
                                  &bh, &bw);
-  min_size = MIN(cpi->sf.rd_auto_partition_min_limit, MIN(min_size, max_size));
+  // Test for blocks at the edge of the active image.
+  // This may be the actual edge of the image or where there are formatting
+  // bars.
+  if (active_edge_sb(cpi, mi_row, mi_col)) {
+    min_size = BLOCK_4X4;
+  } else {
+    min_size = MIN(cpi->sf.rd_auto_partition_min_limit,
+                   MIN(min_size, max_size));
+  }
 
   // When use_square_partition_only is true, make sure at least one square
   // partition is allowed by selecting the next smaller square size as
@@ -3308,8 +3349,13 @@
   subsize = (bsize >= BLOCK_8X8) ? mi[0]->mbmi.sb_type : BLOCK_4X4;
   partition = partition_lookup[bsl][subsize];
 
-  if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
-      subsize >= BLOCK_16X16) {
+  if (bsize == BLOCK_32X32 && subsize == BLOCK_32X32) {
+    x->max_partition_size = BLOCK_32X32;
+    x->min_partition_size = BLOCK_16X16;
+    nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
+                         rd_cost, 0, INT64_MAX, pc_tree);
+  } else if (bsize == BLOCK_32X32 && partition != PARTITION_NONE &&
+             subsize >= BLOCK_16X16) {
     x->max_partition_size = BLOCK_32X32;
     x->min_partition_size = BLOCK_8X8;
     nonrd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, bsize,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 2829365..3130941 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -11,6 +11,7 @@
 
 #include "./vp9_rtcd.h"
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -31,45 +32,6 @@
   ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
 };
 
-void vp9_subtract_block_c(int rows, int cols,
-                          int16_t *diff, ptrdiff_t diff_stride,
-                          const uint8_t *src, ptrdiff_t src_stride,
-                          const uint8_t *pred, ptrdiff_t pred_stride) {
-  int r, c;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++)
-      diff[c] = src[c] - pred[c];
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src  += src_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp9_highbd_subtract_block_c(int rows, int cols,
-                                 int16_t *diff, ptrdiff_t diff_stride,
-                                 const uint8_t *src8, ptrdiff_t src_stride,
-                                 const uint8_t *pred8, ptrdiff_t pred_stride,
-                                 int bd) {
-  int r, c;
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
-  (void) bd;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++) {
-      diff[c] = src[c] - pred[c];
-    }
-
-    diff += diff_stride;
-    pred += pred_stride;
-    src  += src_stride;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -79,13 +41,13 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
+    vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf,
                               p->src.stride, pd->dst.buf, pd->dst.stride,
                               x->e_mbd.bd);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
+  vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride,
                      pd->dst.buf, pd->dst.stride);
 }
 
@@ -838,7 +800,7 @@
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(32, 32, src_diff, diff_stride,
+          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
           vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
@@ -859,7 +821,7 @@
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(16, 16, src_diff, diff_stride,
+          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
@@ -881,7 +843,7 @@
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, i, j, plane);
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(8, 8, src_diff, diff_stride,
+          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
           vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
@@ -904,7 +866,7 @@
                                 dst, dst_stride, i, j, plane);
 
         if (!x->skip_recode) {
-          vp9_highbd_subtract_block(4, 4, src_diff, diff_stride,
+          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
                                     src, src_stride, dst, dst_stride, xd->bd);
           if (tx_type != DCT_DCT)
             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
@@ -946,7 +908,7 @@
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(32, 32, src_diff, diff_stride,
+        vpx_subtract_block(32, 32, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
         vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
@@ -966,7 +928,7 @@
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(16, 16, src_diff, diff_stride,
+        vpx_subtract_block(16, 16, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
@@ -986,7 +948,7 @@
                               x->skip_encode ? src_stride : dst_stride,
                               dst, dst_stride, i, j, plane);
       if (!x->skip_recode) {
-        vp9_subtract_block(8, 8, src_diff, diff_stride,
+        vpx_subtract_block(8, 8, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
         vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
@@ -1007,7 +969,7 @@
                               dst, dst_stride, i, j, plane);
 
       if (!x->skip_recode) {
-        vp9_subtract_block(4, 4, src_diff, diff_stride,
+        vpx_subtract_block(4, 4, src_diff, diff_stride,
                            src, src_stride, dst, dst_stride);
         if (tx_type != DCT_DCT)
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index b9b1106..f27f57a 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -732,7 +732,7 @@
 
   vp9_set_mb_mi(cm, cm->width, cm->height);
   vp9_init_context_buffers(cm);
-  init_macroblockd(cm, xd);
+  vp9_init_macroblockd(cm, xd);
   cpi->td.mb.mbmi_ext_base = cpi->mbmi_ext_base;
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
@@ -1023,8 +1023,8 @@
                    vpx_highbd_sad32x16_bits8,
                    vpx_highbd_sad32x16_avg_bits8,
                    vpx_highbd_8_variance32x16,
-                   vp9_highbd_sub_pixel_variance32x16,
-                   vp9_highbd_sub_pixel_avg_variance32x16,
+                   vpx_highbd_8_sub_pixel_variance32x16,
+                   vpx_highbd_8_sub_pixel_avg_variance32x16,
                    NULL,
                    NULL,
                    vpx_highbd_sad32x16x4d_bits8)
@@ -1033,8 +1033,8 @@
                    vpx_highbd_sad16x32_bits8,
                    vpx_highbd_sad16x32_avg_bits8,
                    vpx_highbd_8_variance16x32,
-                   vp9_highbd_sub_pixel_variance16x32,
-                   vp9_highbd_sub_pixel_avg_variance16x32,
+                   vpx_highbd_8_sub_pixel_variance16x32,
+                   vpx_highbd_8_sub_pixel_avg_variance16x32,
                    NULL,
                    NULL,
                    vpx_highbd_sad16x32x4d_bits8)
@@ -1043,8 +1043,8 @@
                    vpx_highbd_sad64x32_bits8,
                    vpx_highbd_sad64x32_avg_bits8,
                    vpx_highbd_8_variance64x32,
-                   vp9_highbd_sub_pixel_variance64x32,
-                   vp9_highbd_sub_pixel_avg_variance64x32,
+                   vpx_highbd_8_sub_pixel_variance64x32,
+                   vpx_highbd_8_sub_pixel_avg_variance64x32,
                    NULL,
                    NULL,
                    vpx_highbd_sad64x32x4d_bits8)
@@ -1053,8 +1053,8 @@
                    vpx_highbd_sad32x64_bits8,
                    vpx_highbd_sad32x64_avg_bits8,
                    vpx_highbd_8_variance32x64,
-                   vp9_highbd_sub_pixel_variance32x64,
-                   vp9_highbd_sub_pixel_avg_variance32x64,
+                   vpx_highbd_8_sub_pixel_variance32x64,
+                   vpx_highbd_8_sub_pixel_avg_variance32x64,
                    NULL,
                    NULL,
                    vpx_highbd_sad32x64x4d_bits8)
@@ -1063,8 +1063,8 @@
                    vpx_highbd_sad32x32_bits8,
                    vpx_highbd_sad32x32_avg_bits8,
                    vpx_highbd_8_variance32x32,
-                   vp9_highbd_sub_pixel_variance32x32,
-                   vp9_highbd_sub_pixel_avg_variance32x32,
+                   vpx_highbd_8_sub_pixel_variance32x32,
+                   vpx_highbd_8_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits8,
                    vpx_highbd_sad32x32x8_bits8,
                    vpx_highbd_sad32x32x4d_bits8)
@@ -1073,8 +1073,8 @@
                    vpx_highbd_sad64x64_bits8,
                    vpx_highbd_sad64x64_avg_bits8,
                    vpx_highbd_8_variance64x64,
-                   vp9_highbd_sub_pixel_variance64x64,
-                   vp9_highbd_sub_pixel_avg_variance64x64,
+                   vpx_highbd_8_sub_pixel_variance64x64,
+                   vpx_highbd_8_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits8,
                    vpx_highbd_sad64x64x8_bits8,
                    vpx_highbd_sad64x64x4d_bits8)
@@ -1083,8 +1083,8 @@
                    vpx_highbd_sad16x16_bits8,
                    vpx_highbd_sad16x16_avg_bits8,
                    vpx_highbd_8_variance16x16,
-                   vp9_highbd_sub_pixel_variance16x16,
-                   vp9_highbd_sub_pixel_avg_variance16x16,
+                   vpx_highbd_8_sub_pixel_variance16x16,
+                   vpx_highbd_8_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits8,
                    vpx_highbd_sad16x16x8_bits8,
                    vpx_highbd_sad16x16x4d_bits8)
@@ -1093,8 +1093,8 @@
                    vpx_highbd_sad16x8_bits8,
                    vpx_highbd_sad16x8_avg_bits8,
                    vpx_highbd_8_variance16x8,
-                   vp9_highbd_sub_pixel_variance16x8,
-                   vp9_highbd_sub_pixel_avg_variance16x8,
+                   vpx_highbd_8_sub_pixel_variance16x8,
+                   vpx_highbd_8_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits8,
                    vpx_highbd_sad16x8x8_bits8,
                    vpx_highbd_sad16x8x4d_bits8)
@@ -1103,8 +1103,8 @@
                    vpx_highbd_sad8x16_bits8,
                    vpx_highbd_sad8x16_avg_bits8,
                    vpx_highbd_8_variance8x16,
-                   vp9_highbd_sub_pixel_variance8x16,
-                   vp9_highbd_sub_pixel_avg_variance8x16,
+                   vpx_highbd_8_sub_pixel_variance8x16,
+                   vpx_highbd_8_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits8,
                    vpx_highbd_sad8x16x8_bits8,
                    vpx_highbd_sad8x16x4d_bits8)
@@ -1113,8 +1113,8 @@
                    vpx_highbd_sad8x8_bits8,
                    vpx_highbd_sad8x8_avg_bits8,
                    vpx_highbd_8_variance8x8,
-                   vp9_highbd_sub_pixel_variance8x8,
-                   vp9_highbd_sub_pixel_avg_variance8x8,
+                   vpx_highbd_8_sub_pixel_variance8x8,
+                   vpx_highbd_8_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits8,
                    vpx_highbd_sad8x8x8_bits8,
                    vpx_highbd_sad8x8x4d_bits8)
@@ -1123,8 +1123,8 @@
                    vpx_highbd_sad8x4_bits8,
                    vpx_highbd_sad8x4_avg_bits8,
                    vpx_highbd_8_variance8x4,
-                   vp9_highbd_sub_pixel_variance8x4,
-                   vp9_highbd_sub_pixel_avg_variance8x4,
+                   vpx_highbd_8_sub_pixel_variance8x4,
+                   vpx_highbd_8_sub_pixel_avg_variance8x4,
                    NULL,
                    vpx_highbd_sad8x4x8_bits8,
                    vpx_highbd_sad8x4x4d_bits8)
@@ -1133,8 +1133,8 @@
                    vpx_highbd_sad4x8_bits8,
                    vpx_highbd_sad4x8_avg_bits8,
                    vpx_highbd_8_variance4x8,
-                   vp9_highbd_sub_pixel_variance4x8,
-                   vp9_highbd_sub_pixel_avg_variance4x8,
+                   vpx_highbd_8_sub_pixel_variance4x8,
+                   vpx_highbd_8_sub_pixel_avg_variance4x8,
                    NULL,
                    vpx_highbd_sad4x8x8_bits8,
                    vpx_highbd_sad4x8x4d_bits8)
@@ -1143,8 +1143,8 @@
                    vpx_highbd_sad4x4_bits8,
                    vpx_highbd_sad4x4_avg_bits8,
                    vpx_highbd_8_variance4x4,
-                   vp9_highbd_sub_pixel_variance4x4,
-                   vp9_highbd_sub_pixel_avg_variance4x4,
+                   vpx_highbd_8_sub_pixel_variance4x4,
+                   vpx_highbd_8_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits8,
                    vpx_highbd_sad4x4x8_bits8,
                    vpx_highbd_sad4x4x4d_bits8)
@@ -1155,8 +1155,8 @@
                    vpx_highbd_sad32x16_bits10,
                    vpx_highbd_sad32x16_avg_bits10,
                    vpx_highbd_10_variance32x16,
-                   vp9_highbd_10_sub_pixel_variance32x16,
-                   vp9_highbd_10_sub_pixel_avg_variance32x16,
+                   vpx_highbd_10_sub_pixel_variance32x16,
+                   vpx_highbd_10_sub_pixel_avg_variance32x16,
                    NULL,
                    NULL,
                    vpx_highbd_sad32x16x4d_bits10)
@@ -1165,8 +1165,8 @@
                    vpx_highbd_sad16x32_bits10,
                    vpx_highbd_sad16x32_avg_bits10,
                    vpx_highbd_10_variance16x32,
-                   vp9_highbd_10_sub_pixel_variance16x32,
-                   vp9_highbd_10_sub_pixel_avg_variance16x32,
+                   vpx_highbd_10_sub_pixel_variance16x32,
+                   vpx_highbd_10_sub_pixel_avg_variance16x32,
                    NULL,
                    NULL,
                    vpx_highbd_sad16x32x4d_bits10)
@@ -1175,8 +1175,8 @@
                    vpx_highbd_sad64x32_bits10,
                    vpx_highbd_sad64x32_avg_bits10,
                    vpx_highbd_10_variance64x32,
-                   vp9_highbd_10_sub_pixel_variance64x32,
-                   vp9_highbd_10_sub_pixel_avg_variance64x32,
+                   vpx_highbd_10_sub_pixel_variance64x32,
+                   vpx_highbd_10_sub_pixel_avg_variance64x32,
                    NULL,
                    NULL,
                    vpx_highbd_sad64x32x4d_bits10)
@@ -1185,8 +1185,8 @@
                    vpx_highbd_sad32x64_bits10,
                    vpx_highbd_sad32x64_avg_bits10,
                    vpx_highbd_10_variance32x64,
-                   vp9_highbd_10_sub_pixel_variance32x64,
-                   vp9_highbd_10_sub_pixel_avg_variance32x64,
+                   vpx_highbd_10_sub_pixel_variance32x64,
+                   vpx_highbd_10_sub_pixel_avg_variance32x64,
                    NULL,
                    NULL,
                    vpx_highbd_sad32x64x4d_bits10)
@@ -1195,8 +1195,8 @@
                    vpx_highbd_sad32x32_bits10,
                    vpx_highbd_sad32x32_avg_bits10,
                    vpx_highbd_10_variance32x32,
-                   vp9_highbd_10_sub_pixel_variance32x32,
-                   vp9_highbd_10_sub_pixel_avg_variance32x32,
+                   vpx_highbd_10_sub_pixel_variance32x32,
+                   vpx_highbd_10_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits10,
                    vpx_highbd_sad32x32x8_bits10,
                    vpx_highbd_sad32x32x4d_bits10)
@@ -1205,8 +1205,8 @@
                    vpx_highbd_sad64x64_bits10,
                    vpx_highbd_sad64x64_avg_bits10,
                    vpx_highbd_10_variance64x64,
-                   vp9_highbd_10_sub_pixel_variance64x64,
-                   vp9_highbd_10_sub_pixel_avg_variance64x64,
+                   vpx_highbd_10_sub_pixel_variance64x64,
+                   vpx_highbd_10_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits10,
                    vpx_highbd_sad64x64x8_bits10,
                    vpx_highbd_sad64x64x4d_bits10)
@@ -1215,8 +1215,8 @@
                    vpx_highbd_sad16x16_bits10,
                    vpx_highbd_sad16x16_avg_bits10,
                    vpx_highbd_10_variance16x16,
-                   vp9_highbd_10_sub_pixel_variance16x16,
-                   vp9_highbd_10_sub_pixel_avg_variance16x16,
+                   vpx_highbd_10_sub_pixel_variance16x16,
+                   vpx_highbd_10_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits10,
                    vpx_highbd_sad16x16x8_bits10,
                    vpx_highbd_sad16x16x4d_bits10)
@@ -1225,8 +1225,8 @@
                    vpx_highbd_sad16x8_bits10,
                    vpx_highbd_sad16x8_avg_bits10,
                    vpx_highbd_10_variance16x8,
-                   vp9_highbd_10_sub_pixel_variance16x8,
-                   vp9_highbd_10_sub_pixel_avg_variance16x8,
+                   vpx_highbd_10_sub_pixel_variance16x8,
+                   vpx_highbd_10_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits10,
                    vpx_highbd_sad16x8x8_bits10,
                    vpx_highbd_sad16x8x4d_bits10)
@@ -1235,8 +1235,8 @@
                    vpx_highbd_sad8x16_bits10,
                    vpx_highbd_sad8x16_avg_bits10,
                    vpx_highbd_10_variance8x16,
-                   vp9_highbd_10_sub_pixel_variance8x16,
-                   vp9_highbd_10_sub_pixel_avg_variance8x16,
+                   vpx_highbd_10_sub_pixel_variance8x16,
+                   vpx_highbd_10_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits10,
                    vpx_highbd_sad8x16x8_bits10,
                    vpx_highbd_sad8x16x4d_bits10)
@@ -1245,8 +1245,8 @@
                    vpx_highbd_sad8x8_bits10,
                    vpx_highbd_sad8x8_avg_bits10,
                    vpx_highbd_10_variance8x8,
-                   vp9_highbd_10_sub_pixel_variance8x8,
-                   vp9_highbd_10_sub_pixel_avg_variance8x8,
+                   vpx_highbd_10_sub_pixel_variance8x8,
+                   vpx_highbd_10_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits10,
                    vpx_highbd_sad8x8x8_bits10,
                    vpx_highbd_sad8x8x4d_bits10)
@@ -1255,8 +1255,8 @@
                    vpx_highbd_sad8x4_bits10,
                    vpx_highbd_sad8x4_avg_bits10,
                    vpx_highbd_10_variance8x4,
-                   vp9_highbd_10_sub_pixel_variance8x4,
-                   vp9_highbd_10_sub_pixel_avg_variance8x4,
+                   vpx_highbd_10_sub_pixel_variance8x4,
+                   vpx_highbd_10_sub_pixel_avg_variance8x4,
                    NULL,
                    vpx_highbd_sad8x4x8_bits10,
                    vpx_highbd_sad8x4x4d_bits10)
@@ -1265,8 +1265,8 @@
                    vpx_highbd_sad4x8_bits10,
                    vpx_highbd_sad4x8_avg_bits10,
                    vpx_highbd_10_variance4x8,
-                   vp9_highbd_10_sub_pixel_variance4x8,
-                   vp9_highbd_10_sub_pixel_avg_variance4x8,
+                   vpx_highbd_10_sub_pixel_variance4x8,
+                   vpx_highbd_10_sub_pixel_avg_variance4x8,
                    NULL,
                    vpx_highbd_sad4x8x8_bits10,
                    vpx_highbd_sad4x8x4d_bits10)
@@ -1275,8 +1275,8 @@
                    vpx_highbd_sad4x4_bits10,
                    vpx_highbd_sad4x4_avg_bits10,
                    vpx_highbd_10_variance4x4,
-                   vp9_highbd_10_sub_pixel_variance4x4,
-                   vp9_highbd_10_sub_pixel_avg_variance4x4,
+                   vpx_highbd_10_sub_pixel_variance4x4,
+                   vpx_highbd_10_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits10,
                    vpx_highbd_sad4x4x8_bits10,
                    vpx_highbd_sad4x4x4d_bits10)
@@ -1287,8 +1287,8 @@
                    vpx_highbd_sad32x16_bits12,
                    vpx_highbd_sad32x16_avg_bits12,
                    vpx_highbd_12_variance32x16,
-                   vp9_highbd_12_sub_pixel_variance32x16,
-                   vp9_highbd_12_sub_pixel_avg_variance32x16,
+                   vpx_highbd_12_sub_pixel_variance32x16,
+                   vpx_highbd_12_sub_pixel_avg_variance32x16,
                    NULL,
                    NULL,
                    vpx_highbd_sad32x16x4d_bits12)
@@ -1297,8 +1297,8 @@
                    vpx_highbd_sad16x32_bits12,
                    vpx_highbd_sad16x32_avg_bits12,
                    vpx_highbd_12_variance16x32,
-                   vp9_highbd_12_sub_pixel_variance16x32,
-                   vp9_highbd_12_sub_pixel_avg_variance16x32,
+                   vpx_highbd_12_sub_pixel_variance16x32,
+                   vpx_highbd_12_sub_pixel_avg_variance16x32,
                    NULL,
                    NULL,
                    vpx_highbd_sad16x32x4d_bits12)
@@ -1307,8 +1307,8 @@
                    vpx_highbd_sad64x32_bits12,
                    vpx_highbd_sad64x32_avg_bits12,
                    vpx_highbd_12_variance64x32,
-                   vp9_highbd_12_sub_pixel_variance64x32,
-                   vp9_highbd_12_sub_pixel_avg_variance64x32,
+                   vpx_highbd_12_sub_pixel_variance64x32,
+                   vpx_highbd_12_sub_pixel_avg_variance64x32,
                    NULL,
                    NULL,
                    vpx_highbd_sad64x32x4d_bits12)
@@ -1317,8 +1317,8 @@
                    vpx_highbd_sad32x64_bits12,
                    vpx_highbd_sad32x64_avg_bits12,
                    vpx_highbd_12_variance32x64,
-                   vp9_highbd_12_sub_pixel_variance32x64,
-                   vp9_highbd_12_sub_pixel_avg_variance32x64,
+                   vpx_highbd_12_sub_pixel_variance32x64,
+                   vpx_highbd_12_sub_pixel_avg_variance32x64,
                    NULL,
                    NULL,
                    vpx_highbd_sad32x64x4d_bits12)
@@ -1327,8 +1327,8 @@
                    vpx_highbd_sad32x32_bits12,
                    vpx_highbd_sad32x32_avg_bits12,
                    vpx_highbd_12_variance32x32,
-                   vp9_highbd_12_sub_pixel_variance32x32,
-                   vp9_highbd_12_sub_pixel_avg_variance32x32,
+                   vpx_highbd_12_sub_pixel_variance32x32,
+                   vpx_highbd_12_sub_pixel_avg_variance32x32,
                    vpx_highbd_sad32x32x3_bits12,
                    vpx_highbd_sad32x32x8_bits12,
                    vpx_highbd_sad32x32x4d_bits12)
@@ -1337,8 +1337,8 @@
                    vpx_highbd_sad64x64_bits12,
                    vpx_highbd_sad64x64_avg_bits12,
                    vpx_highbd_12_variance64x64,
-                   vp9_highbd_12_sub_pixel_variance64x64,
-                   vp9_highbd_12_sub_pixel_avg_variance64x64,
+                   vpx_highbd_12_sub_pixel_variance64x64,
+                   vpx_highbd_12_sub_pixel_avg_variance64x64,
                    vpx_highbd_sad64x64x3_bits12,
                    vpx_highbd_sad64x64x8_bits12,
                    vpx_highbd_sad64x64x4d_bits12)
@@ -1347,8 +1347,8 @@
                    vpx_highbd_sad16x16_bits12,
                    vpx_highbd_sad16x16_avg_bits12,
                    vpx_highbd_12_variance16x16,
-                   vp9_highbd_12_sub_pixel_variance16x16,
-                   vp9_highbd_12_sub_pixel_avg_variance16x16,
+                   vpx_highbd_12_sub_pixel_variance16x16,
+                   vpx_highbd_12_sub_pixel_avg_variance16x16,
                    vpx_highbd_sad16x16x3_bits12,
                    vpx_highbd_sad16x16x8_bits12,
                    vpx_highbd_sad16x16x4d_bits12)
@@ -1357,8 +1357,8 @@
                    vpx_highbd_sad16x8_bits12,
                    vpx_highbd_sad16x8_avg_bits12,
                    vpx_highbd_12_variance16x8,
-                   vp9_highbd_12_sub_pixel_variance16x8,
-                   vp9_highbd_12_sub_pixel_avg_variance16x8,
+                   vpx_highbd_12_sub_pixel_variance16x8,
+                   vpx_highbd_12_sub_pixel_avg_variance16x8,
                    vpx_highbd_sad16x8x3_bits12,
                    vpx_highbd_sad16x8x8_bits12,
                    vpx_highbd_sad16x8x4d_bits12)
@@ -1367,8 +1367,8 @@
                    vpx_highbd_sad8x16_bits12,
                    vpx_highbd_sad8x16_avg_bits12,
                    vpx_highbd_12_variance8x16,
-                   vp9_highbd_12_sub_pixel_variance8x16,
-                   vp9_highbd_12_sub_pixel_avg_variance8x16,
+                   vpx_highbd_12_sub_pixel_variance8x16,
+                   vpx_highbd_12_sub_pixel_avg_variance8x16,
                    vpx_highbd_sad8x16x3_bits12,
                    vpx_highbd_sad8x16x8_bits12,
                    vpx_highbd_sad8x16x4d_bits12)
@@ -1377,8 +1377,8 @@
                    vpx_highbd_sad8x8_bits12,
                    vpx_highbd_sad8x8_avg_bits12,
                    vpx_highbd_12_variance8x8,
-                   vp9_highbd_12_sub_pixel_variance8x8,
-                   vp9_highbd_12_sub_pixel_avg_variance8x8,
+                   vpx_highbd_12_sub_pixel_variance8x8,
+                   vpx_highbd_12_sub_pixel_avg_variance8x8,
                    vpx_highbd_sad8x8x3_bits12,
                    vpx_highbd_sad8x8x8_bits12,
                    vpx_highbd_sad8x8x4d_bits12)
@@ -1387,8 +1387,8 @@
                    vpx_highbd_sad8x4_bits12,
                    vpx_highbd_sad8x4_avg_bits12,
                    vpx_highbd_12_variance8x4,
-                   vp9_highbd_12_sub_pixel_variance8x4,
-                   vp9_highbd_12_sub_pixel_avg_variance8x4,
+                   vpx_highbd_12_sub_pixel_variance8x4,
+                   vpx_highbd_12_sub_pixel_avg_variance8x4,
                    NULL,
                    vpx_highbd_sad8x4x8_bits12,
                    vpx_highbd_sad8x4x4d_bits12)
@@ -1397,8 +1397,8 @@
                    vpx_highbd_sad4x8_bits12,
                    vpx_highbd_sad4x8_avg_bits12,
                    vpx_highbd_12_variance4x8,
-                   vp9_highbd_12_sub_pixel_variance4x8,
-                   vp9_highbd_12_sub_pixel_avg_variance4x8,
+                   vpx_highbd_12_sub_pixel_variance4x8,
+                   vpx_highbd_12_sub_pixel_avg_variance4x8,
                    NULL,
                    vpx_highbd_sad4x8x8_bits12,
                    vpx_highbd_sad4x8x4d_bits12)
@@ -1407,8 +1407,8 @@
                    vpx_highbd_sad4x4_bits12,
                    vpx_highbd_sad4x4_avg_bits12,
                    vpx_highbd_12_variance4x4,
-                   vp9_highbd_12_sub_pixel_variance4x4,
-                   vp9_highbd_12_sub_pixel_avg_variance4x4,
+                   vpx_highbd_12_sub_pixel_variance4x4,
+                   vpx_highbd_12_sub_pixel_avg_variance4x4,
                    vpx_highbd_sad4x4x3_bits12,
                    vpx_highbd_sad4x4x8_bits12,
                    vpx_highbd_sad4x4x4d_bits12)
@@ -1467,7 +1467,7 @@
   cpi->td.mb.e_mbd.bd = (int)cm->bit_depth;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+  rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
 
   cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
@@ -1832,62 +1832,62 @@
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
   BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
-      vpx_variance32x16, vp9_sub_pixel_variance32x16,
-      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
+      vpx_variance32x16, vpx_sub_pixel_variance32x16,
+      vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
 
   BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
-      vpx_variance16x32, vp9_sub_pixel_variance16x32,
-      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
+      vpx_variance16x32, vpx_sub_pixel_variance16x32,
+      vpx_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)
 
   BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
-      vpx_variance64x32, vp9_sub_pixel_variance64x32,
-      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
+      vpx_variance64x32, vpx_sub_pixel_variance64x32,
+      vpx_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)
 
   BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
-      vpx_variance32x64, vp9_sub_pixel_variance32x64,
-      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
+      vpx_variance32x64, vpx_sub_pixel_variance32x64,
+      vpx_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)
 
   BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
-      vpx_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+      vpx_variance32x32, vpx_sub_pixel_variance32x32,
+      vpx_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
       vpx_sad32x32x4d)
 
   BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
-      vpx_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+      vpx_variance64x64, vpx_sub_pixel_variance64x64,
+      vpx_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
       vpx_sad64x64x4d)
 
   BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
-      vpx_variance16x16, vp9_sub_pixel_variance16x16,
-      vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+      vpx_variance16x16, vpx_sub_pixel_variance16x16,
+      vpx_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
       vpx_sad16x16x4d)
 
   BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
-      vpx_variance16x8, vp9_sub_pixel_variance16x8,
-      vp9_sub_pixel_avg_variance16x8,
+      vpx_variance16x8, vpx_sub_pixel_variance16x8,
+      vpx_sub_pixel_avg_variance16x8,
       vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)
 
   BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
-      vpx_variance8x16, vp9_sub_pixel_variance8x16,
-      vp9_sub_pixel_avg_variance8x16,
+      vpx_variance8x16, vpx_sub_pixel_variance8x16,
+      vpx_sub_pixel_avg_variance8x16,
       vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)
 
   BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
-      vpx_variance8x8, vp9_sub_pixel_variance8x8,
-      vp9_sub_pixel_avg_variance8x8,
+      vpx_variance8x8, vpx_sub_pixel_variance8x8,
+      vpx_sub_pixel_avg_variance8x8,
       vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)
 
   BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
-      vpx_variance8x4, vp9_sub_pixel_variance8x4,
-      vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
+      vpx_variance8x4, vpx_sub_pixel_variance8x4,
+      vpx_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)
 
   BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
-      vpx_variance4x8, vp9_sub_pixel_variance4x8,
-      vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
+      vpx_variance4x8, vpx_sub_pixel_variance4x8,
+      vpx_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)
 
   BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
-      vpx_variance4x4, vp9_sub_pixel_variance4x4,
-      vp9_sub_pixel_avg_variance4x4,
+      vpx_variance4x4, vpx_sub_pixel_variance4x4,
+      vpx_sub_pixel_avg_variance4x4,
       vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -2562,7 +2562,7 @@
   const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
   uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
   const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
-  const InterpKernel *const kernel = vp9_get_interp_kernel(EIGHTTAP);
+  const InterpKernel *const kernel = vp9_filter_kernels[EIGHTTAP];
   int x, y, i;
 
   for (y = 0; y < dst_h; y += 16) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 8b324a7..f095cad 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -16,7 +16,7 @@
 #include "./vpx_config.h"
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8cx.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_ppflags.h"
@@ -40,7 +40,7 @@
 #include "vp9/encoder/vp9_speed_features.h"
 #include "vp9/encoder/vp9_svc_layercontext.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp9/encoder/vp9_denoiser.h"
@@ -50,8 +50,6 @@
 extern "C" {
 #endif
 
-#define DEFAULT_GF_INTERVAL         10
-
 typedef struct {
   int nmvjointcost[MV_JOINTS];
   int nmvcosts[2][MV_VALS];
@@ -219,6 +217,9 @@
   int arnr_max_frames;
   int arnr_strength;
 
+  int min_gf_interval;
+  int max_gf_interval;
+
   int tile_columns;
   int tile_rows;
 
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 16640fe..a85b70b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -35,7 +35,7 @@
 #include "vp9/encoder/vp9_mcomp.h"
 #include "vp9/encoder/vp9_quantize.h"
 #include "vp9/encoder/vp9_rd.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
 
 #define OUTPUT_FPF          0
 #define ARF_STATS_OUTPUT    0
@@ -228,18 +228,43 @@
   section->duration   -= frame->duration;
 }
 
+// Calculate an active area of the image that discounts formatting
+// bars and partially discounts other 0 energy areas.
+#define MIN_ACTIVE_AREA 0.5
+#define MAX_ACTIVE_AREA 1.0
+static double calculate_active_area(const VP9_COMP *cpi,
+                                    const FIRSTPASS_STATS *this_frame)
+{
+  double active_pct;
+
+  active_pct = 1.0 -
+    ((this_frame->intra_skip_pct / 2) +
+     ((this_frame->inactive_zone_rows * 2) / (double)cpi->common.mb_rows));
+  return fclamp(active_pct, MIN_ACTIVE_AREA, MAX_ACTIVE_AREA);
+}
 
 // Calculate a modified Error used in distributing bits between easier and
 // harder frames.
-static double calculate_modified_err(const TWO_PASS *twopass,
+#define ACT_AREA_CORRECTION 0.5
+static double calculate_modified_err(const VP9_COMP *cpi,
+                                     const TWO_PASS *twopass,
                                      const VP9EncoderConfig *oxcf,
                                      const FIRSTPASS_STATS *this_frame) {
   const FIRSTPASS_STATS *const stats = &twopass->total_stats;
   const double av_weight = stats->weight / stats->count;
   const double av_err = (stats->coded_error * av_weight) / stats->count;
-  const double modified_error =
+  double modified_error =
     av_err * pow(this_frame->coded_error * this_frame->weight /
                  DOUBLE_DIVIDE_CHECK(av_err), oxcf->two_pass_vbrbias / 100.0);
+
+  // Correction for active area. Frames with a reduced active area
+  // (eg due to formatting bars) have a higher error per mb for the
+  // remaining active MBs. The correction here assumes that coding
+  // 0.5N blocks of complexity 2X is a little easier than coding N
+  // blocks of complexity X.
+  modified_error *=
+    pow(calculate_active_area(cpi, this_frame), ACT_AREA_CORRECTION);
+
   return fclamp(modified_error,
                 twopass->modified_error_min, twopass->modified_error_max);
 }
@@ -273,7 +298,7 @@
   }
 }
 
-static vp9_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
+static vpx_variance_fn_t get_block_variance_fn(BLOCK_SIZE bsize) {
   switch (bsize) {
     case BLOCK_8X8:
       return vpx_mse8x8;
@@ -290,13 +315,13 @@
                                          const struct buf_2d *src,
                                          const struct buf_2d *ref) {
   unsigned int sse;
-  const vp9_variance_fn_t fn = get_block_variance_fn(bsize);
+  const vpx_variance_fn_t fn = get_block_variance_fn(bsize);
   fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
   return sse;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static vp9_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
+static vpx_variance_fn_t highbd_get_block_variance_fn(BLOCK_SIZE bsize,
                                                       int bd) {
   switch (bd) {
     default:
@@ -343,7 +368,7 @@
                                                 const struct buf_2d *ref,
                                                 int bd) {
   unsigned int sse;
-  const vp9_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
+  const vpx_variance_fn_t fn = highbd_get_block_variance_fn(bsize, bd);
   fn(src->buf, src->stride, ref->buf, ref->stride, &sse);
   return sse;
 }
@@ -1286,7 +1311,7 @@
     twopass->modified_error_max = (avg_error *
                                       oxcf->two_pass_vbrmax_section) / 100;
     while (s < twopass->stats_in_end) {
-      modified_error_total += calculate_modified_err(twopass, oxcf, s);
+      modified_error_total += calculate_modified_err(cpi, twopass, oxcf, s);
       ++s;
     }
     twopass->modified_error_left = modified_error_total;
@@ -1455,10 +1480,11 @@
     vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME],
                             cpi->common.bit_depth);
   const double boost_q_correction = MIN((0.5 + (lq * 0.015)), 1.5);
-  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                      ? cpi->initial_mbs : cpi->common.MBs;
+  int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
+                ? cpi->initial_mbs : cpi->common.MBs;
 
-  // TODO(paulwilkins): correct for dead zone
+  // Correct for any inactive region in the image
+  num_mbs = (int)MAX(1, num_mbs * calculate_active_area(cpi, this_frame));
 
   // Underlying boost factor is based on inter error ratio.
   frame_boost = (BASELINE_ERR_PER_MB * num_mbs) /
@@ -1753,7 +1779,7 @@
       ++frame_index;
     }
 
-    modified_err = calculate_modified_err(twopass, oxcf, &frame_stats);
+    modified_err = calculate_modified_err(cpi, twopass, oxcf, &frame_stats);
 
     if (group_error > 0)
       err_fraction = modified_err / DOUBLE_DIVIDE_CHECK(group_error);
@@ -1851,7 +1877,8 @@
   int64_t gf_group_bits;
   double gf_group_error_left;
   int gf_arf_bits;
-  int is_key_frame = frame_is_intra_only(cm);
+  const int is_key_frame = frame_is_intra_only(cm);
+  const int kf_or_arf_active = is_key_frame || rc->source_alt_ref_active;
 
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
@@ -1863,7 +1890,7 @@
   vp9_zero(next_frame);
 
   // Load stats for the current frame.
-  mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
+  mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
   // Note the error of the frame at the start of the group. This will be
   // the GF frame error if we code a normal gf.
@@ -1904,7 +1931,10 @@
       // bits to spare and are better with a smaller interval and smaller boost.
       // At high Q when there are few bits to spare we are better with a longer
       // interval to spread the cost of the GF.
-      active_max_gf_interval = 12 + MIN(4, (int_lbq / 6));
+      active_max_gf_interval = rc->max_gf_interval - 4 + MIN(4, (int_lbq / 6));
+      if (active_max_gf_interval < active_min_gf_interval)
+        active_max_gf_interval = active_min_gf_interval;
+
       if (active_max_gf_interval > rc->max_gf_interval)
         active_max_gf_interval = rc->max_gf_interval;
       if (active_max_gf_interval < active_min_gf_interval)
@@ -1917,7 +1947,7 @@
     ++i;
 
     // Accumulate error score of frames in this gf group.
-    mod_frame_err = calculate_modified_err(twopass, oxcf, this_frame);
+    mod_frame_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
     gf_group_err += mod_frame_err;
 #if GROUP_ADAPTIVE_MAXQ
     gf_group_raw_error += this_frame->coded_error;
@@ -1966,10 +1996,11 @@
     // Break out conditions.
     if (
       // Break at active_max_gf_interval unless almost totally static.
-      (i >= active_max_gf_interval && (zero_motion_accumulator < 0.995)) ||
+      ((i >= active_max_gf_interval + kf_or_arf_active) &&
+       (zero_motion_accumulator < 0.995)) ||
       (
         // Don't break out with a very short interval.
-        (i > active_min_gf_interval) &&
+        (i >= active_min_gf_interval + kf_or_arf_active) &&
         (!flash_detected) &&
         ((mv_ratio_accumulator > mv_ratio_accumulator_thresh) ||
          (abs_mv_in_out_accumulator > 3.0) ||
@@ -2022,7 +2053,7 @@
     for (j = 0; j < new_gf_interval - rc->baseline_gf_interval; ++j) {
       if (EOF == input_stats(twopass, this_frame))
         break;
-      gf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+      gf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
 #if GROUP_ADAPTIVE_MAXQ
       gf_group_raw_error += this_frame->coded_error;
 #endif
@@ -2271,14 +2302,14 @@
   twopass->kf_group_bits = 0;        // Total bits available to kf group
   twopass->kf_group_error_left = 0;  // Group modified error score.
 
-  kf_mod_err = calculate_modified_err(twopass, oxcf, this_frame);
+  kf_mod_err = calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
   // Find the next keyframe.
   i = 0;
   while (twopass->stats_in < twopass->stats_in_end &&
          rc->frames_to_key < cpi->oxcf.key_freq) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
 
     // Load the next frame's stats.
     last_frame = *this_frame;
@@ -2340,7 +2371,7 @@
 
     // Rescan to get the correct error data for the forced kf group.
     for (i = 0; i < rc->frames_to_key; ++i) {
-      kf_group_err += calculate_modified_err(twopass, oxcf, &tmp_frame);
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, &tmp_frame);
       input_stats(twopass, &tmp_frame);
     }
     rc->next_key_frame_forced = 1;
@@ -2358,7 +2389,7 @@
     for (j = 0; j < new_frame_to_key - rc->frames_to_key; ++j) {
       if (EOF == input_stats(twopass, this_frame))
         break;
-      kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+      kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
     }
     rc->frames_to_key = new_frame_to_key;
   }
@@ -2366,7 +2397,7 @@
   // Special case for the last key frame of the file.
   if (twopass->stats_in >= twopass->stats_in_end) {
     // Accumulate kf group error.
-    kf_group_err += calculate_modified_err(twopass, oxcf, this_frame);
+    kf_group_err += calculate_modified_err(cpi, twopass, oxcf, this_frame);
   }
 
   // Calculate the number of bits that should be assigned to the kf group.
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index 99c1afa..817bd79 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -13,7 +13,7 @@
 #define VP9_ENCODER_VP9_MCOMP_H_
 
 #include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_variance.h"
+#include "vpx_dsp/variance.h"
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 0cba638..e99cbc7 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1813,7 +1813,7 @@
                                     &xd->block_refs[0]->sf,
                                     4 * num_4x4_blocks_wide,
                                     4 * num_4x4_blocks_high, 0,
-                                    vp9_get_interp_kernel(mbmi->interp_filter),
+                                    vp9_filter_kernels[mbmi->interp_filter],
                                     MV_PRECISION_Q3,
                                     mi_col * MI_SIZE + 4 * (i & 0x01),
                                     mi_row * MI_SIZE + 4 * (i >> 1), xd->bd);
@@ -1825,7 +1825,7 @@
                                      &xd->block_refs[0]->sf,
                                      4 * num_4x4_blocks_wide,
                                      4 * num_4x4_blocks_high, 0,
-                                     vp9_get_interp_kernel(mbmi->interp_filter),
+                                     vp9_filter_kernels[mbmi->interp_filter],
                                      MV_PRECISION_Q3,
                                      mi_col * MI_SIZE + 4 * (i & 0x01),
                                      mi_row * MI_SIZE + 4 * (i >> 1));
diff --git a/vp9/encoder/vp9_psnrhvs.c b/vp9/encoder/vp9_psnrhvs.c
index e10e028..2e7e345 100644
--- a/vp9/encoder/vp9_psnrhvs.c
+++ b/vp9/encoder/vp9_psnrhvs.c
@@ -25,7 +25,7 @@
 
 void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x, int xstride) {
   (void) xstride;
-  vp9_fdct8x8_c(x, y, ystride);
+  vp9_fdct8x8(x, y, ystride);
 }
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 85003f6..be09bca 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -276,6 +276,27 @@
   }
 }
 
+int vp9_rc_get_default_min_gf_interval(
+    int width, int height, double framerate) {
+  // Assume we do not need any constraint lower than 4K 20 fps
+  static const double factor_safe = 3840 * 2160 * 20.0;
+  const double factor = width * height * framerate;
+
+  if (factor <= factor_safe)
+    return MIN_GF_INTERVAL;
+  else
+    return (int)(MIN_GF_INTERVAL * factor / factor_safe + 0.5);
+  // Note this logic makes:
+  // 4K24: 5
+  // 4K30: 6
+  // 4K60: 12
+}
+
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval) {
+  int interval = MIN(MAX_GF_INTERVAL, (int)(framerate * 0.75));
+  return MAX(interval, min_gf_interval);
+}
+
 void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) {
   int i;
 
@@ -284,9 +305,9 @@
     rc->avg_frame_qindex[INTER_FRAME] = oxcf->worst_allowed_q;
   } else {
     rc->avg_frame_qindex[KEY_FRAME] = (oxcf->worst_allowed_q +
-                                           oxcf->best_allowed_q) / 2;
+                                       oxcf->best_allowed_q) / 2;
     rc->avg_frame_qindex[INTER_FRAME] = (oxcf->worst_allowed_q +
-                                           oxcf->best_allowed_q) / 2;
+                                         oxcf->best_allowed_q) / 2;
   }
 
   rc->last_q[KEY_FRAME] = oxcf->best_allowed_q;
@@ -304,7 +325,6 @@
   rc->total_target_bits = 0;
   rc->total_target_vs_actual = 0;
 
-  rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
   rc->frames_since_key = 8;  // Sensible default for first frame.
   rc->this_key_frame_forced = 0;
   rc->next_key_frame_forced = 0;
@@ -322,6 +342,16 @@
   for (i = 0; i < RATE_FACTOR_LEVELS; ++i) {
     rc->rate_correction_factors[i] = 1.0;
   }
+
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, oxcf->init_framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+        oxcf->init_framerate, rc->min_gf_interval);
+  rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
 }
 
 int vp9_rc_drop_frame(VP9_COMP *cpi) {
@@ -1382,7 +1412,7 @@
     cm->frame_type = INTER_FRAME;
   }
   if (rc->frames_till_gf_update_due == 0) {
-    rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+    rc->baseline_gf_interval = (rc->min_gf_interval + rc->max_gf_interval) / 2;
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
     if (rc->frames_till_gf_update_due > rc->frames_to_key) {
@@ -1576,7 +1606,8 @@
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
       vp9_cyclic_refresh_set_golden_update(cpi);
     else
-      rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
+      rc->baseline_gf_interval =
+          (rc->min_gf_interval + rc->max_gf_interval) / 2;
     rc->frames_till_gf_update_due = rc->baseline_gf_interval;
     // NOTE: frames_till_gf_update_due must be <= frames_to_key.
     if (rc->frames_till_gf_update_due > rc->frames_to_key)
@@ -1649,20 +1680,19 @@
   return target_index - qindex;
 }
 
-#define MIN_GF_INTERVAL     4
-#define MAX_GF_INTERVAL     16
 void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
                                   RATE_CONTROL *const rc) {
   const VP9EncoderConfig *const oxcf = &cpi->oxcf;
 
-  // Set a minimum interval.
-  rc->min_gf_interval =
-    MIN(MAX_GF_INTERVAL, MAX(MIN_GF_INTERVAL, (int)(cpi->framerate * 0.125)));
-
-  // Set Maximum gf/arf interval.
-  rc->max_gf_interval =
-    MIN(MAX_GF_INTERVAL, (int)(cpi->framerate * 0.75));
-  // Round up to next even number if odd.
+  // Set Maximum gf/arf interval
+  rc->max_gf_interval = oxcf->max_gf_interval;
+  rc->min_gf_interval = oxcf->min_gf_interval;
+  if (rc->min_gf_interval == 0)
+    rc->min_gf_interval = vp9_rc_get_default_min_gf_interval(
+        oxcf->width, oxcf->height, cpi->framerate);
+  if (rc->max_gf_interval == 0)
+    rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
+        cpi->framerate, rc->min_gf_interval);
   rc->max_gf_interval += (rc->max_gf_interval & 0x01);
 
   // Extended interval for genuinely static scenes
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index a10836c..f9b0488 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -24,6 +24,9 @@
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS    9
 
+#define MIN_GF_INTERVAL     4
+#define MAX_GF_INTERVAL     16
+
 typedef enum {
   INTER_NORMAL = 0,
   INTER_HIGH = 1,
@@ -155,6 +158,12 @@
 
 void vp9_rc_init_minq_luts(void);
 
+int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate);
+// Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to
+// be passed in to ensure that the max_gf_interval returned is at least as big
+// as that.
+int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate);
+
 // Generally at the high level, the following flow is expected
 // to be enforced for rate control:
 // First call per frame, one of:
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 930561a..bc7cb34 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -37,7 +37,6 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_tokenize.h"
-#include "vp9/encoder/vp9_variance.h"
 
 #define RD_THRESH_POW      1.25
 #define RD_MULT_EPB_RATIO  64
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e6b7f19..3a27e89 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -12,6 +12,7 @@
 #include <math.h>
 
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -38,7 +39,6 @@
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_rd.h"
 #include "vp9/encoder/vp9_rdopt.h"
-#include "vp9/encoder/vp9_variance.h"
 #include "vp9/encoder/vp9_aq_variance.h"
 
 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
@@ -832,7 +832,7 @@
                                   x->skip_encode ? src : dst,
                                   x->skip_encode ? src_stride : dst_stride,
                                   dst, dst_stride, idx, idy, 0);
-          vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
+          vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
                                     dst, dst_stride, xd->bd);
           if (xd->lossless) {
             const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -932,7 +932,7 @@
                                 x->skip_encode ? src : dst,
                                 x->skip_encode ? src_stride : dst_stride,
                                 dst, dst_stride, idx, idy, 0);
-        vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
+        vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless) {
           const scan_order *so = &vp9_default_scan_orders[TX_4X4];
@@ -1358,7 +1358,7 @@
   int thisrate = 0, ref;
   const scan_order *so = &vp9_default_scan_orders[TX_4X4];
   const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+  const InterpKernel *kernel = vp9_filter_kernels[mi->mbmi.interp_filter];
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const uint8_t *pre = &pd->pre[ref].buf[vp9_raster_block_offset(BLOCK_8X8, i,
@@ -1394,16 +1394,16 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_subtract_block(
+    vpx_highbd_subtract_block(
         height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
         8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
   } else {
-    vp9_subtract_block(
+    vpx_subtract_block(
         height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
         8, src, p->src.stride, dst, pd->dst.stride);
   }
 #else
-  vp9_subtract_block(height, width,
+  vpx_subtract_block(height, width,
                      vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
                      8, src, p->src.stride, dst, pd->dst.stride);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -1564,7 +1564,7 @@
                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
   int_mv ref_mv[2];
   int ite, ref;
-  const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+  const InterpKernel *kernel = vp9_filter_kernels[mbmi->interp_filter];
   struct scale_factors sf;
 
   // Do joint motion search in compound mode to get more accurate mv.
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 24b6203..3475d58 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -46,7 +46,7 @@
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
   const InterpKernel *const kernel =
-    vp9_get_interp_kernel(xd->mi[0]->mbmi.interp_filter);
+    vp9_filter_kernels[xd->mi[0]->mbmi.interp_filter];
 
   enum mv_precision mv_precision_uv;
   int uv_stride;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c
deleted file mode 100644
index c571b7c..0000000
--- a/vp9/encoder/vp9_variance.c
+++ /dev/null
@@ -1,380 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vp9_rtcd.h"
-#include "./vpx_dsp_rtcd.h"
-
-#include "vpx_ports/mem.h"
-#include "vpx/vpx_integer.h"
-
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_filter.h"
-
-#include "vp9/encoder/vp9_variance.h"
-
-static uint8_t bilinear_filters[8][2] = {
-  { 128,   0, },
-  { 112,  16, },
-  {  96,  32, },
-  {  80,  48, },
-  {  64,  64, },
-  {  48,  80, },
-  {  32,  96, },
-  {  16, 112, },
-};
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// first-pass of 2-D separable filter.
-//
-// Produces int32_t output to retain precision for next pass. Two filter taps
-// should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the filter is
-// applied horizontally (pixel_step=1) or vertically (pixel_step=stride). It
-// defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_first_pass(const uint8_t *src_ptr,
-                                              uint16_t *output_ptr,
-                                              unsigned int src_pixels_per_line,
-                                              int pixel_step,
-                                              unsigned int output_height,
-                                              unsigned int output_width,
-                                              const uint8_t *vp9_filter) {
-  unsigned int i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr    += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-// Applies a 1-D 2-tap bi-linear filter to the source block in either horizontal
-// or vertical direction to produce the filtered output block. Used to implement
-// second-pass of 2-D separable filter.
-//
-// Requires 32-bit input as produced by filter_block2d_bil_first_pass. Two
-// filter taps should sum to VP9_FILTER_WEIGHT. pixel_step defines whether the
-// filter is applied horizontally (pixel_step=1) or vertically (pixel_step=
-// stride). It defines the offset required to move from one input to the next.
-static void var_filter_block2d_bil_second_pass(const uint16_t *src_ptr,
-                                               uint8_t *output_ptr,
-                                               unsigned int src_pixels_per_line,
-                                               unsigned int pixel_step,
-                                               unsigned int output_height,
-                                               unsigned int output_width,
-                                               const uint8_t *vp9_filter) {
-  unsigned int  i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] = ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                          (int)src_ptr[pixel_step] * vp9_filter[1],
-                          FILTER_BITS);
-      src_ptr++;
-    }
-
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-#define SUBPIX_VAR(W, H) \
-unsigned int vp9_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint8_t temp2[H * W]; \
-\
-  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
-                                    bilinear_filters[xoffset]); \
-  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     bilinear_filters[yoffset]); \
-\
-  return vpx_variance##W##x##H##_c(temp2, W, dst, dst_stride, sse); \
-}
-
-#define SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint8_t temp2[H * W]; \
-  DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
-\
-  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
-                                    bilinear_filters[xoffset]); \
-  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     bilinear_filters[yoffset]); \
-\
-  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
-\
-  return vpx_variance##W##x##H##_c(temp3, W, dst, dst_stride, sse); \
-}
-
-SUBPIX_VAR(4, 4)
-SUBPIX_AVG_VAR(4, 4)
-
-SUBPIX_VAR(4, 8)
-SUBPIX_AVG_VAR(4, 8)
-
-SUBPIX_VAR(8, 4)
-SUBPIX_AVG_VAR(8, 4)
-
-SUBPIX_VAR(8, 8)
-SUBPIX_AVG_VAR(8, 8)
-
-SUBPIX_VAR(8, 16)
-SUBPIX_AVG_VAR(8, 16)
-
-SUBPIX_VAR(16, 8)
-SUBPIX_AVG_VAR(16, 8)
-
-SUBPIX_VAR(16, 16)
-SUBPIX_AVG_VAR(16, 16)
-
-SUBPIX_VAR(16, 32)
-SUBPIX_AVG_VAR(16, 32)
-
-SUBPIX_VAR(32, 16)
-SUBPIX_AVG_VAR(32, 16)
-
-SUBPIX_VAR(32, 32)
-SUBPIX_AVG_VAR(32, 32)
-
-SUBPIX_VAR(32, 64)
-SUBPIX_AVG_VAR(32, 64)
-
-SUBPIX_VAR(64, 32)
-SUBPIX_AVG_VAR(64, 32)
-
-SUBPIX_VAR(64, 64)
-SUBPIX_AVG_VAR(64, 64)
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_var_filter_block2d_bil_first_pass(
-    const uint8_t *src_ptr8,
-    uint16_t *output_ptr,
-    unsigned int src_pixels_per_line,
-    int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const uint8_t *vp9_filter) {
-  unsigned int i, j;
-  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] =
-          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                             (int)src_ptr[pixel_step] * vp9_filter[1],
-                             FILTER_BITS);
-
-      src_ptr++;
-    }
-
-    // Next row...
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-static void highbd_var_filter_block2d_bil_second_pass(
-    const uint16_t *src_ptr,
-    uint16_t *output_ptr,
-    unsigned int src_pixels_per_line,
-    unsigned int pixel_step,
-    unsigned int output_height,
-    unsigned int output_width,
-    const uint8_t *vp9_filter) {
-  unsigned int  i, j;
-
-  for (i = 0; i < output_height; i++) {
-    for (j = 0; j < output_width; j++) {
-      output_ptr[j] =
-          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
-                             (int)src_ptr[pixel_step] * vp9_filter[1],
-                             FILTER_BITS);
-      src_ptr++;
-    }
-
-    src_ptr += src_pixels_per_line - output_width;
-    output_ptr += output_width;
-  }
-}
-
-#define HIGHBD_SUBPIX_VAR(W, H) \
-unsigned int vp9_highbd_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-\
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
-\
-  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                          dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_10_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-\
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
-\
-  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
-                                             W, dst, dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_12_sub_pixel_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-\
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
-\
-  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
-                                             W, dst, dst_stride, sse); \
-}
-
-#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
-unsigned int vp9_highbd_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
-\
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
-\
-  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                          dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
-\
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
-\
-  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
-                                             W, dst, dst_stride, sse); \
-} \
-\
-unsigned int vp9_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
-  const uint8_t *src, int  src_stride, \
-  int xoffset, int  yoffset, \
-  const uint8_t *dst, int dst_stride, \
-  unsigned int *sse, \
-  const uint8_t *second_pred) { \
-  uint16_t fdata3[(H + 1) * W]; \
-  uint16_t temp2[H * W]; \
-  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
-\
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
-\
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
-\
-  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
-                                             W, dst, dst_stride, sse); \
-}
-
-HIGHBD_SUBPIX_VAR(4, 4)
-HIGHBD_SUBPIX_AVG_VAR(4, 4)
-
-HIGHBD_SUBPIX_VAR(4, 8)
-HIGHBD_SUBPIX_AVG_VAR(4, 8)
-
-HIGHBD_SUBPIX_VAR(8, 4)
-HIGHBD_SUBPIX_AVG_VAR(8, 4)
-
-HIGHBD_SUBPIX_VAR(8, 8)
-HIGHBD_SUBPIX_AVG_VAR(8, 8)
-
-HIGHBD_SUBPIX_VAR(8, 16)
-HIGHBD_SUBPIX_AVG_VAR(8, 16)
-
-HIGHBD_SUBPIX_VAR(16, 8)
-HIGHBD_SUBPIX_AVG_VAR(16, 8)
-
-HIGHBD_SUBPIX_VAR(16, 16)
-HIGHBD_SUBPIX_AVG_VAR(16, 16)
-
-HIGHBD_SUBPIX_VAR(16, 32)
-HIGHBD_SUBPIX_AVG_VAR(16, 32)
-
-HIGHBD_SUBPIX_VAR(32, 16)
-HIGHBD_SUBPIX_AVG_VAR(32, 16)
-
-HIGHBD_SUBPIX_VAR(32, 32)
-HIGHBD_SUBPIX_AVG_VAR(32, 32)
-
-HIGHBD_SUBPIX_VAR(32, 64)
-HIGHBD_SUBPIX_AVG_VAR(32, 64)
-
-HIGHBD_SUBPIX_VAR(64, 32)
-HIGHBD_SUBPIX_AVG_VAR(64, 32)
-
-HIGHBD_SUBPIX_VAR(64, 64)
-HIGHBD_SUBPIX_AVG_VAR(64, 64)
-#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
deleted file mode 100644
index 0a87395..0000000
--- a/vp9/encoder/vp9_variance.h
+++ /dev/null
@@ -1,81 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_VARIANCE_H_
-#define VP9_ENCODER_VP9_VARIANCE_H_
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int ref_stride);
-
-typedef unsigned int(*vp9_sad_avg_fn_t)(const uint8_t *src_ptr,
-                                        int source_stride,
-                                        const uint8_t *ref_ptr,
-                                        int ref_stride,
-                                        const uint8_t *second_pred);
-
-typedef void (*vp9_sad_multi_fn_t)(const uint8_t *src_ptr,
-                                   int source_stride,
-                                   const uint8_t *ref_ptr,
-                                   int  ref_stride,
-                                   unsigned int *sad_array);
-
-typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
-                                     int source_stride,
-                                     const uint8_t* const ref_ptr[],
-                                     int  ref_stride, unsigned int *sad_array);
-
-typedef unsigned int (*vp9_variance_fn_t)(const uint8_t *src_ptr,
-                                          int source_stride,
-                                          const uint8_t *ref_ptr,
-                                          int ref_stride,
-                                          unsigned int *sse);
-
-typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
-                                                int source_stride,
-                                                int xoffset,
-                                                int yoffset,
-                                                const uint8_t *ref_ptr,
-                                                int Refstride,
-                                                unsigned int *sse);
-
-typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
-                                                   int source_stride,
-                                                   int xoffset,
-                                                   int yoffset,
-                                                   const uint8_t *ref_ptr,
-                                                   int Refstride,
-                                                   unsigned int *sse,
-                                                   const uint8_t *second_pred);
-
-typedef struct vp9_variance_vtable {
-  vp9_sad_fn_t               sdf;
-  vp9_sad_avg_fn_t           sdaf;
-  vp9_variance_fn_t          vf;
-  vp9_subpixvariance_fn_t    svf;
-  vp9_subp_avg_variance_fn_t svaf;
-  vp9_sad_multi_fn_t         sdx3f;
-  vp9_sad_multi_fn_t         sdx8f;
-  vp9_sad_multi_d_fn_t       sdx4df;
-} vp9_variance_fn_ptr_t;
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/vp9/encoder/x86/vp9_highbd_variance_sse2.c b/vp9/encoder/x86/vp9_highbd_variance_sse2.c
deleted file mode 100644
index 29b7b27..0000000
--- a/vp9/encoder/x86/vp9_highbd_variance_sse2.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-#define DECL(w, opt) \
-int vp9_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
-                                               ptrdiff_t src_stride, \
-                                               int x_offset, int y_offset, \
-                                               const uint16_t *dst, \
-                                               ptrdiff_t dst_stride, \
-                                               int height, unsigned int *sse);
-#define DECLS(opt1, opt2) \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-// DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vp9_highbd_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
-                                                        int src_stride, \
-                                                        int x_offset, \
-                                                        int y_offset, \
-                                                        const uint8_t *dst8, \
-                                                        int dst_stride, \
-                                                        uint32_t *sse_ptr) { \
-  uint32_t sse; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
-                                                       x_offset, y_offset, \
-                                                       dst, dst_stride, h, \
-                                                       &sse); \
-  if (w > wf) { \
-    unsigned int sse2; \
-    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
-                                                          src_stride, \
-                                                          x_offset, y_offset, \
-                                                          dst + 16, \
-                                                          dst_stride, \
-                                                          h, &sse2); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
-                                                        x_offset, y_offset, \
-                                                        dst + 32, dst_stride, \
-                                                        h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
-          src + 48, src_stride, x_offset, y_offset, \
-          dst + 48, dst_stride, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
-    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
-    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
-  uint32_t sse; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-  int se = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
-                                                       x_offset, y_offset, \
-                                                       dst, dst_stride, \
-                                                       h, &sse); \
-  if (w > wf) { \
-    uint32_t sse2; \
-    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
-                                                          src_stride, \
-                                                          x_offset, y_offset, \
-                                                          dst + 16, \
-                                                          dst_stride, \
-                                                          h, &sse2); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
-                                                        x_offset, y_offset, \
-                                                        dst + 32, dst_stride, \
-                                                        h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
-                                                        x_offset, y_offset, \
-                                                        dst + 48, dst_stride, \
-                                                        h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  se = ROUND_POWER_OF_TWO(se, 2); \
-  sse = ROUND_POWER_OF_TWO(sse, 4); \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
-    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
-    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
-  int start_row; \
-  uint32_t sse; \
-  int se = 0; \
-  uint64_t long_sse = 0; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-  for (start_row = 0; start_row < h; start_row +=16) { \
-    uint32_t sse2; \
-    int height = h - start_row < 16 ? h - start_row : 16; \
-    int se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
-        src + (start_row * src_stride), src_stride, \
-        x_offset, y_offset, dst + (start_row * dst_stride), \
-        dst_stride, height, &sse2); \
-    se += se2; \
-    long_sse += sse2; \
-    if (w > wf) { \
-      se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
-          src + 16 + (start_row * src_stride), src_stride, \
-          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
-          dst_stride, height, &sse2); \
-      se += se2; \
-      long_sse += sse2; \
-      if (w > wf * 2) { \
-        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
-            src + 32 + (start_row * src_stride), src_stride, \
-            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
-        se += se2; \
-        long_sse += sse2; \
-        se2 = vp9_highbd_sub_pixel_variance##wf##xh_##opt( \
-            src + 48 + (start_row * src_stride), src_stride, \
-            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
-            dst_stride, height, &sse2); \
-        se += se2; \
-        long_sse += sse2; \
-      }\
-    } \
-  } \
-  se = ROUND_POWER_OF_TWO(se, 4); \
-  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-
-FNS(sse2, sse);
-
-#undef FNS
-#undef FN
-
-#define DECL(w, opt) \
-int vp9_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
-                                                   ptrdiff_t src_stride, \
-                                                   int x_offset, int y_offset, \
-                                                   const uint16_t *dst, \
-                                                   ptrdiff_t dst_stride, \
-                                                   const uint16_t *sec, \
-                                                   ptrdiff_t sec_stride, \
-                                                   int height, \
-                                                   unsigned int *sse);
-#define DECLS(opt1) \
-DECL(16, opt1) \
-DECL(8, opt1)
-
-DECLS(sse2);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-uint32_t vp9_highbd_sub_pixel_avg_variance##w##x##h##_##opt( \
-    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
-    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
-    const uint8_t *sec8) { \
-  uint32_t sse; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
-  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-               src, src_stride, x_offset, \
-               y_offset, dst, dst_stride, sec, w, h, &sse); \
-  if (w > wf) { \
-    uint32_t sse2; \
-    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                  src + 16, src_stride, x_offset, y_offset, \
-                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                src + 32, src_stride, x_offset, y_offset, \
-                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                src + 48, src_stride, x_offset, y_offset, \
-                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
-    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
-    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
-    const uint8_t *sec8) { \
-  uint32_t sse; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
-  int se = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                                            src, src_stride, x_offset, \
-                                            y_offset, dst, dst_stride, \
-                                            sec, w, h, &sse); \
-  if (w > wf) { \
-    uint32_t sse2; \
-    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                                            src + 16, src_stride, \
-                                            x_offset, y_offset, \
-                                            dst + 16, dst_stride, \
-                                            sec + 16, w, h, &sse2); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                                            src + 32, src_stride, \
-                                            x_offset, y_offset, \
-                                            dst + 32, dst_stride, \
-                                            sec + 32, w, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                                            src + 48, src_stride, \
-                                            x_offset, y_offset, \
-                                            dst + 48, dst_stride, \
-                                            sec + 48, w, h, &sse2); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  se = ROUND_POWER_OF_TWO(se, 2); \
-  sse = ROUND_POWER_OF_TWO(sse, 4); \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-} \
-\
-uint32_t vp9_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
-    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
-    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
-    const uint8_t *sec8) { \
-  int start_row; \
-  uint32_t sse; \
-  int se = 0; \
-  uint64_t long_sse = 0; \
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
-  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
-  for (start_row = 0; start_row < h; start_row +=16) { \
-    uint32_t sse2; \
-    int height = h - start_row < 16 ? h - start_row : 16; \
-    int se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                src + (start_row * src_stride), src_stride, x_offset, \
-                y_offset, dst + (start_row * dst_stride), dst_stride, \
-                sec + (start_row * w), w, height, &sse2); \
-    se += se2; \
-    long_sse += sse2; \
-    if (w > wf) { \
-      se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                src + 16 + (start_row * src_stride), src_stride, \
-                x_offset, y_offset, \
-                dst + 16 + (start_row * dst_stride), dst_stride, \
-                sec + 16 + (start_row * w), w, height, &sse2); \
-      se += se2; \
-      long_sse += sse2; \
-      if (w > wf * 2) { \
-        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                src + 32 + (start_row * src_stride), src_stride, \
-                x_offset, y_offset, \
-                dst + 32 + (start_row * dst_stride), dst_stride, \
-                sec + 32 + (start_row * w), w, height, &sse2); \
-        se += se2; \
-        long_sse += sse2; \
-        se2 = vp9_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
-                src + 48 + (start_row * src_stride), src_stride, \
-                x_offset, y_offset, \
-                dst + 48 + (start_row * dst_stride), dst_stride, \
-                sec + 48 + (start_row * w), w, height, &sse2); \
-        se += se2; \
-        long_sse += sse2; \
-      } \
-    } \
-  } \
-  se = ROUND_POWER_OF_TWO(se, 4); \
-  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-
-#define FNS(opt1) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
-FN(8, 16, 8, 4, 3, opt1, (int64_t)); \
-FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
-FN(8, 4, 8, 3, 2, opt1, (int64_t));
-
-FNS(sse2);
-
-#undef FNS
-#undef FN
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c b/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
deleted file mode 100644
index b1c7975..0000000
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <immintrin.h>  // AVX2
-
-#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-#include "vp9/encoder/vp9_variance.h"
-
-DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
-  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
-  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
-  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
-  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
-  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
-  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
-  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
-  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
-  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
-  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
-  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
-  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
-  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
-};
-
-#define FILTER_SRC(filter) \
-  /* filter the source */ \
-  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
-  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
-  \
-  /* add 8 to source */ \
-  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
-  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
-  \
-  /* divide source by 16 */ \
-  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
-  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
-
-#define MERGE_WITH_SRC(src_reg, reg) \
-  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
-  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
-
-#define LOAD_SRC_DST \
-  /* load source and destination */ \
-  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
-  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
-
-#define AVG_NEXT_SRC(src_reg, size_stride) \
-  src_next_reg = _mm256_loadu_si256((__m256i const *) \
-                                   (src + size_stride)); \
-  /* average between current and next stride source */ \
-  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
-
-#define MERGE_NEXT_SRC(src_reg, size_stride) \
-  src_next_reg = _mm256_loadu_si256((__m256i const *) \
-                                   (src + size_stride)); \
-  MERGE_WITH_SRC(src_reg, src_next_reg)
-
-#define CALC_SUM_SSE_INSIDE_LOOP \
-  /* expand each byte to 2 bytes */ \
-  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
-  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
-  /* source - dest */ \
-  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
-  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
-  /* caculate sum */ \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
-  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
-  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
-  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
-  /* calculate sse */ \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
-  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
-
-// final calculation to sum and sse
-#define CALC_SUM_AND_SSE \
-  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
-  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
-  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
-  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
-  \
-  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
-  \
-  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
-  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
-                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
-  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
-  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
-  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
-        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
-
-
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
-                                             int src_stride,
-                                             int x_offset,
-                                             int y_offset,
-                                             const uint8_t *dst,
-                                             int dst_stride,
-                                             int height,
-                                             unsigned int *sse) {
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = 0 and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    }
-  // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height ; i++) {
-        src_avg = src_reg;
-        src+= src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        // save current source average
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height ; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src+= src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    }
-  // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height ; i++) {
-        src+= src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src_pack = src_reg;
-        dst+= dst_stride;
-      }
-    // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256((__m256i const *)
-                (bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256((__m256i const *)
-                (bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height ; i++) {
-        src+= src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  return sum;
-}
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
-                                             int src_stride,
-                                             int x_offset,
-                                             int y_offset,
-                                             const uint8_t *dst,
-                                             int dst_stride,
-                                             const uint8_t *sec,
-                                             int sec_stride,
-                                             int height,
-                                             unsigned int *sse) {
-  __m256i sec_reg;
-  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
-  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
-  __m256i zero_reg;
-  int i, sum;
-  sum_reg = _mm256_set1_epi16(0);
-  sse_reg = _mm256_set1_epi16(0);
-  zero_reg = _mm256_set1_epi16(0);
-
-  // x_offset = 0 and y_offset = 0
-  if (x_offset == 0) {
-    if (y_offset == 0) {
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec+= sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    } else if (y_offset == 8) {
-      __m256i src_next_reg;
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, src_stride)
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec+= sec_stride;
-        // expend each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = 0 and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg;
-
-      y_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-                 (bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, src_stride)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec+= sec_stride;
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    }
-  // x_offset = 8  and y_offset = 0
-  } else if (x_offset == 8) {
-    if (y_offset == 0) {
-      __m256i src_next_reg;
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        sec+= sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = 8  and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i src_next_reg, src_avg;
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height ; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src+= src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        // average between previous average to current average
-        src_avg = _mm256_avg_epu8(src_avg, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        sec+= sec_stride;
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    // x_offset = 8  and y_offset = bilin interpolation
-    } else {
-      __m256i filter, pw8, src_next_reg, src_avg;
-      y_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      AVG_NEXT_SRC(src_reg, 1)
-      for (i = 0; i < height ; i++) {
-        // save current source average
-        src_avg = src_reg;
-        src+= src_stride;
-        LOAD_SRC_DST
-        AVG_NEXT_SRC(src_reg, 1)
-        MERGE_WITH_SRC(src_avg, src_reg)
-        FILTER_SRC(filter)
-        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
-        // expand each byte to 2 bytes
-        MERGE_WITH_SRC(src_avg, zero_reg)
-        sec+= sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    }
-  // x_offset = bilin interpolation and y_offset = 0
-  } else {
-    if (y_offset == 0) {
-      __m256i filter, pw8, src_next_reg;
-      x_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      for (i = 0; i < height ; i++) {
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
-        MERGE_WITH_SRC(src_reg, zero_reg)
-        sec+= sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        src+= src_stride;
-        dst+= dst_stride;
-      }
-    // x_offset = bilin interpolation and y_offset = 8
-    } else if (y_offset == 8) {
-      __m256i filter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      filter = _mm256_load_si256((__m256i const *)
-               (bilinear_filters_avx2 + x_offset));
-      pw8 = _mm256_set1_epi16(8);
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      MERGE_NEXT_SRC(src_reg, 1)
-      FILTER_SRC(filter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height ; i++) {
-        src+= src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(filter)
-        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // average between previous pack to the current
-        src_pack = _mm256_avg_epu8(src_pack, src_reg);
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        sec+= sec_stride;
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    // x_offset = bilin interpolation and y_offset = bilin interpolation
-    } else {
-      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
-      x_offset <<= 5;
-      xfilter = _mm256_load_si256((__m256i const *)
-                (bilinear_filters_avx2 + x_offset));
-      y_offset <<= 5;
-      yfilter = _mm256_load_si256((__m256i const *)
-                (bilinear_filters_avx2 + y_offset));
-      pw8 = _mm256_set1_epi16(8);
-      // load source and another source starting from the next
-      // following byte
-      src_reg = _mm256_loadu_si256((__m256i const *) (src));
-      MERGE_NEXT_SRC(src_reg, 1)
-
-      FILTER_SRC(xfilter)
-      // convert each 16 bit to 8 bit to each low and high lane source
-      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-      for (i = 0; i < height ; i++) {
-        src+= src_stride;
-        LOAD_SRC_DST
-        MERGE_NEXT_SRC(src_reg, 1)
-        FILTER_SRC(xfilter)
-        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        // merge previous pack to current pack source
-        MERGE_WITH_SRC(src_pack, src_reg)
-        // filter the source
-        FILTER_SRC(yfilter)
-        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
-        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
-        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
-        MERGE_WITH_SRC(src_pack, zero_reg)
-        src_pack = src_reg;
-        sec+= sec_stride;
-        CALC_SUM_SSE_INSIDE_LOOP
-        dst+= dst_stride;
-      }
-    }
-  }
-  CALC_SUM_AND_SSE
-  return sum;
-}
diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
deleted file mode 100644
index 8cd071d..0000000
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
-                                             int x_offset, int y_offset,
-                                             const uint8_t *dst, int dst_stride,
-                                             int height,
-                                             unsigned int *sse);
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
-                                                 int src_stride,
-                                                 int x_offset,
-                                                 int y_offset,
-                                                 const uint8_t *dst,
-                                                 int dst_stride,
-                                                 const uint8_t *sec,
-                                                 int sec_stride,
-                                                 int height,
-                                                 unsigned int *sseptr);
-
-unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
-                                              int src_stride,
-                                              int x_offset,
-                                              int y_offset,
-                                              const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  unsigned int sse1;
-  const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                                  y_offset, dst, dst_stride,
-                                                  64, &sse1);
-  unsigned int sse2;
-  const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
-                                                  x_offset, y_offset,
-                                                  dst + 32, dst_stride,
-                                                  64, &sse2);
-  const int se = se1 + se2;
-  *sse = sse1 + sse2;
-  return *sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
-                                              int src_stride,
-                                              int x_offset,
-                                              int y_offset,
-                                              const uint8_t *dst,
-                                              int dst_stride,
-                                              unsigned int *sse) {
-  const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                                 y_offset, dst, dst_stride,
-                                                 32, sse);
-  return *sse - (((int64_t)se * se) >> 10);
-}
-
-unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
-                                                  int src_stride,
-                                                  int x_offset,
-                                                  int y_offset,
-                                                  const uint8_t *dst,
-                                                  int dst_stride,
-                                                  unsigned int *sse,
-                                                  const uint8_t *sec) {
-  unsigned int sse1;
-  const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                                      y_offset, dst, dst_stride,
-                                                      sec, 64, 64, &sse1);
-  unsigned int sse2;
-  const int se2 =
-      vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
-                                          y_offset, dst + 32, dst_stride,
-                                          sec + 32, 64, 64, &sse2);
-  const int se = se1 + se2;
-
-  *sse = sse1 + sse2;
-
-  return *sse - (((int64_t)se * se) >> 12);
-}
-
-unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
-                                                  int src_stride,
-                                                  int x_offset,
-                                                  int y_offset,
-                                                  const uint8_t *dst,
-                                                  int dst_stride,
-                                                  unsigned int *sse,
-                                                  const uint8_t *sec) {
-  // processing 32 element in parallel
-  const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                                     y_offset, dst, dst_stride,
-                                                     sec, 32, 32, sse);
-  return *sse - (((int64_t)se * se) >> 10);
-}
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
deleted file mode 100644
index 961efe3..0000000
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <emmintrin.h>  // SSE2
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vp9/encoder/vp9_variance.h"
-#include "vpx_ports/mem.h"
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
-                                        ptrdiff_t src_stride, \
-                                        int x_offset, int y_offset, \
-                                        const uint8_t *dst, \
-                                        ptrdiff_t dst_stride, \
-                                        int height, unsigned int *sse, \
-                                        void *unused0, void *unused)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECLS
-#undef DECL
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
-                                                     int src_stride, \
-                                                     int x_offset, \
-                                                     int y_offset, \
-                                                     const uint8_t *dst, \
-                                                     int dst_stride, \
-                                                     unsigned int *sse_ptr) { \
-  unsigned int sse; \
-  int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
-                                                y_offset, dst, dst_stride, \
-                                                h, &sse, NULL, NULL); \
-  if (w > wf) { \
-    unsigned int sse2; \
-    int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
-                                                   x_offset, y_offset, \
-                                                   dst + 16, dst_stride, \
-                                                   h, &sse2, NULL, NULL); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
-                                                 x_offset, y_offset, \
-                                                 dst + 32, dst_stride, \
-                                                 h, &sse2, NULL, NULL); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
-                                                 x_offset, y_offset, \
-                                                 dst + 48, dst_stride, \
-                                                 h, &sse2, NULL, NULL); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  *sse_ptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
-FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
-FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
-FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
-FN(4,   4,  4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
-
-// The 2 unused parameters are place holders for PIC enabled build.
-#define DECL(w, opt) \
-int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
-                                            ptrdiff_t src_stride, \
-                                            int x_offset, int y_offset, \
-                                            const uint8_t *dst, \
-                                            ptrdiff_t dst_stride, \
-                                            const uint8_t *sec, \
-                                            ptrdiff_t sec_stride, \
-                                            int height, unsigned int *sse, \
-                                            void *unused0, void *unused)
-#define DECLS(opt1, opt2) \
-DECL(4, opt2); \
-DECL(8, opt1); \
-DECL(16, opt1)
-
-DECLS(sse2, sse);
-DECLS(ssse3, ssse3);
-#undef DECL
-#undef DECLS
-
-#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
-unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
-                                                         int src_stride, \
-                                                         int x_offset, \
-                                                         int y_offset, \
-                                                         const uint8_t *dst, \
-                                                         int dst_stride, \
-                                                         unsigned int *sseptr, \
-                                                         const uint8_t *sec) { \
-  unsigned int sse; \
-  int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
-                                                    y_offset, dst, dst_stride, \
-                                                    sec, w, h, &sse, NULL, \
-                                                    NULL); \
-  if (w > wf) { \
-    unsigned int sse2; \
-    int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
-                                                       x_offset, y_offset, \
-                                                       dst + 16, dst_stride, \
-                                                       sec + 16, w, h, &sse2, \
-                                                       NULL, NULL); \
-    se += se2; \
-    sse += sse2; \
-    if (w > wf * 2) { \
-      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
-                                                     x_offset, y_offset, \
-                                                     dst + 32, dst_stride, \
-                                                     sec + 32, w, h, &sse2, \
-                                                     NULL, NULL); \
-      se += se2; \
-      sse += sse2; \
-      se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
-                                                     x_offset, y_offset, \
-                                                     dst + 48, dst_stride, \
-                                                     sec + 48, w, h, &sse2, \
-                                                     NULL, NULL); \
-      se += se2; \
-      sse += sse2; \
-    } \
-  } \
-  *sseptr = sse; \
-  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
-}
-
-#define FNS(opt1, opt2) \
-FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
-FN(16,  8, 16, 4, 3, opt1, (unsigned int)); \
-FN(8,  16,  8, 3, 4, opt1, (unsigned int)); \
-FN(8,   8,  8, 3, 3, opt1, (unsigned int)); \
-FN(8,   4,  8, 3, 2, opt1, (unsigned int)); \
-FN(4,   8,  4, 2, 3, opt2, (unsigned int)); \
-FN(4,   4,  4, 2, 2, opt2, (unsigned int))
-
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
-
-#undef FNS
-#undef FN
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index c257d91..2e43c27 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -131,7 +131,6 @@
 VP9_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
 
 # common (msa)
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_horiz_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_convolve8_avg_vert_msa.c
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 9462be9..d2d9288 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -31,6 +31,8 @@
   unsigned int                tile_rows;
   unsigned int                arnr_max_frames;
   unsigned int                arnr_strength;
+  unsigned int                min_gf_interval;
+  unsigned int                max_gf_interval;
   vp8e_tuning                 tuning;
   unsigned int                cq_level;  // constrained quality level
   unsigned int                rc_max_intra_bitrate_pct;
@@ -55,6 +57,8 @@
   0,                          // tile_rows
   7,                          // arnr_max_frames
   5,                          // arnr_strength
+  0,                          // min_gf_interval; 0 -> default decision
+  0,                          // max_gf_interval; 0 -> default decision
   VP8_TUNE_PSNR,              // tuning
   10,                         // cq_level
   0,                          // rc_max_intra_bitrate_pct
@@ -167,6 +171,12 @@
   RANGE_CHECK_HI(cfg, rc_resize_up_thresh,   100);
   RANGE_CHECK_HI(cfg, rc_resize_down_thresh, 100);
   RANGE_CHECK(cfg,        g_pass,         VPX_RC_ONE_PASS, VPX_RC_LAST_PASS);
+  RANGE_CHECK(extra_cfg, min_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  RANGE_CHECK(extra_cfg, max_gf_interval, 0, (MAX_LAG_BUFFERS - 1));
+  if (extra_cfg->min_gf_interval > 0 && extra_cfg->max_gf_interval > 0) {
+    RANGE_CHECK(extra_cfg, max_gf_interval, extra_cfg->min_gf_interval,
+                (MAX_LAG_BUFFERS - 1));
+  }
 
   if (cfg->rc_resize_allowed == 1) {
     RANGE_CHECK(cfg, rc_scaled_width, 0, cfg->g_w);
@@ -454,6 +464,8 @@
   oxcf->color_space = extra_cfg->color_space;
   oxcf->arnr_max_frames = extra_cfg->arnr_max_frames;
   oxcf->arnr_strength   = extra_cfg->arnr_strength;
+  oxcf->min_gf_interval = extra_cfg->min_gf_interval;
+  oxcf->max_gf_interval = extra_cfg->max_gf_interval;
 
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
@@ -727,6 +739,20 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_min_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.min_gf_interval = CAST(VP9E_SET_MIN_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
+static vpx_codec_err_t ctrl_set_max_gf_interval(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.max_gf_interval = CAST(VP9E_SET_MAX_GF_INTERVAL, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_set_frame_periodic_boost(vpx_codec_alg_priv_t *ctx,
                                                      va_list args) {
   struct vp9_extracfg extra_cfg = ctx->extra_cfg;
@@ -1444,6 +1470,8 @@
   {VP9E_SET_TUNE_CONTENT,             ctrl_set_tune_content},
   {VP9E_SET_COLOR_SPACE,              ctrl_set_color_space},
   {VP9E_SET_NOISE_SENSITIVITY,        ctrl_set_noise_sensitivity},
+  {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
+  {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index ab5bcba..5b62c3ec 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -17,7 +17,7 @@
 #include "vpx/internal/vpx_codec_internal.h"
 #include "vpx/vp8dx.h"
 #include "vpx/vpx_decoder.h"
-#include "vpx_thread/vpx_thread.h"
+#include "vpx_util/vpx_thread.h"
 
 #include "vp9/common/vp9_alloccommon.h"
 #include "vp9/common/vp9_frame_buffers.h"
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index dca9a6c..94cc7ba 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -58,7 +58,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_svc_layercontext.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
-VP9_CX_SRCS-yes += encoder/vp9_variance.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.c
 VP9_CX_SRCS-yes += encoder/vp9_encoder.c
 VP9_CX_SRCS-yes += encoder/vp9_picklpf.c
@@ -84,7 +83,6 @@
 
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.c
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.c
-VP9_CX_SRCS-yes += encoder/vp9_variance.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_variance.c
 VP9_CX_SRCS-yes += encoder/vp9_aq_variance.h
 VP9_CX_SRCS-yes += encoder/vp9_aq_cyclicrefresh.c
@@ -103,7 +101,6 @@
 VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_avg_intrin_sse2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_subpel_variance_impl_intrin_avx2.c
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
@@ -114,13 +111,6 @@
 ifeq ($(CONFIG_USE_X86INC),yes)
 VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_variance_sse2.c
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_subpel_variance.asm
-endif
 endif
 
 ifeq ($(ARCH_X86_64),yes)
@@ -144,15 +134,12 @@
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct32x32_avx2_impl.h
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_dct_avx2.c
 VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
-VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 endif
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
 VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
 
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_avg_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_error_msa.c
@@ -161,8 +148,6 @@
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct16x16_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct32x32_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_fdct_msa.h
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_subtract_msa.c
 VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_temporal_filter_msa.c
-VP9_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/vp9_variance_msa.c
 
 VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 19bc4bd..013c67a 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -518,6 +518,22 @@
    */
   VP9E_SET_TEMPORAL_LAYERING_MODE,
 
+  /*!\brief Codec control function to set minimum interval between GF/ARF frames
+   *
+   * By default the value is set as 4.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MIN_GF_INTERVAL,
+
+  /*!\brief Codec control function to set maximum interval between GF/ARF frames
+   *
+   * By default the value is set as 16.
+   *
+   * Supported in codecs: VP9
+   */
+  VP9E_SET_MAX_GF_INTERVAL,
+
   /*!\brief Codec control function to get an Active map back from the encoder.
    *
    * Supported in codecs: VP9
@@ -716,6 +732,12 @@
 
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 
+VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
+
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL,  unsigned int)
+#define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
+
 VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx_dsp/arm/bilinear_filter_media.asm b/vpx_dsp/arm/bilinear_filter_media.asm
new file mode 100644
index 0000000..f3f9754
--- /dev/null
+++ b/vpx_dsp/arm/bilinear_filter_media.asm
@@ -0,0 +1,237 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vpx_filter_block2d_bil_first_pass_media|
+    EXPORT  |vpx_filter_block2d_bil_second_pass_media|
+
+    AREA    |.text|, CODE, READONLY  ; name this block of code
+
+;-------------------------------------
+; r0    unsigned char  *src_ptr,
+; r1    unsigned short *dst_ptr,
+; r2    unsigned int    src_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;-------------------------------------
+; The output is transposed and stored in the output array to make it easy for second pass filtering.
+|vpx_filter_block2d_bil_first_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    mov     r12, r3                         ; outer-loop counter
+
+    add     r7, r2, r4                      ; preload next row
+    pld     [r0, r7]
+
+    sub     r2, r2, r4                      ; src increment for height loop
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+
+    mov     r3, r3, lsl #1                  ; height*2
+    add     r3, r3, #2                      ; plus 2 to make output buffer 4-byte aligned since height is actually (height+1)
+
+    mov     r11, r1                         ; save dst_ptr for each row
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_1st_filter
+
+|bil_height_loop_1st_v6|
+    ldrb    r6, [r0]                        ; load source data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    mov     lr, r4, lsr #2                  ; 4-in-parallel loop counter
+
+|bil_width_loop_1st_v6|
+    ldrb    r9, [r0, #3]
+    ldrb    r10, [r0, #4]
+
+    pkhbt   r6, r6, r7, lsl #16             ; src[1] | src[0]
+    pkhbt   r7, r7, r8, lsl #16             ; src[2] | src[1]
+
+    smuad   r6, r6, r5                      ; apply the filter
+    pkhbt   r8, r8, r9, lsl #16             ; src[3] | src[2]
+    smuad   r7, r7, r5
+    pkhbt   r9, r9, r10, lsl #16            ; src[4] | src[3]
+
+    smuad   r8, r8, r5
+    smuad   r9, r9, r5
+
+    add     r0, r0, #4
+    subs    lr, lr, #1
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #16, r6, asr #7
+    usat    r7, #16, r7, asr #7
+
+    strh    r6, [r1], r3                    ; result is transposed and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strh    r7, [r1], r3
+    add     r9, r9, #0x40
+    usat    r8, #16, r8, asr #7
+    usat    r9, #16, r9, asr #7
+
+    strh    r8, [r1], r3                    ; result is transposed and stored
+
+    ldrneb  r6, [r0]                        ; load source data
+    strh    r9, [r1], r3
+
+    ldrneb  r7, [r0, #1]
+    ldrneb  r8, [r0, #2]
+
+    bne     bil_width_loop_1st_v6
+
+    add     r0, r0, r2                      ; move to next input row
+    subs    r12, r12, #1
+
+    add     r9, r2, r4, lsl #1              ; adding back block width
+    pld     [r0, r9]                        ; preload next row
+
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_1st_v6
+
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_1st_filter|
+|bil_height_loop_null_1st|
+    mov     lr, r4, lsr #2                  ; loop counter
+
+|bil_width_loop_null_1st|
+    ldrb    r6, [r0]                        ; load data
+    ldrb    r7, [r0, #1]
+    ldrb    r8, [r0, #2]
+    ldrb    r9, [r0, #3]
+
+    strh    r6, [r1], r3                    ; store it to immediate buffer
+    add     r0, r0, #4
+    strh    r7, [r1], r3
+    subs    lr, lr, #1
+    strh    r8, [r1], r3
+    strh    r9, [r1], r3
+
+    bne     bil_width_loop_null_1st
+
+    subs    r12, r12, #1
+    add     r0, r0, r2                      ; move to next input line
+    add     r11, r11, #2                    ; move over to next column
+    mov     r1, r11
+
+    bne     bil_height_loop_null_1st
+
+    ldmia   sp!, {r4 - r11, pc}
+
+    ENDP  ; |vpx_filter_block2d_bil_first_pass_media|
+
+
+;---------------------------------
+; r0    unsigned short *src_ptr,
+; r1    unsigned char  *dst_ptr,
+; r2    int             dst_pitch,
+; r3    unsigned int    height,
+; stack unsigned int    width,
+; stack const short    *vpx_filter
+;---------------------------------
+|vpx_filter_block2d_bil_second_pass_media| PROC
+    stmdb   sp!, {r4 - r11, lr}
+
+    ldr     r11, [sp, #40]                  ; vpx_filter address
+    ldr     r4, [sp, #36]                   ; width
+
+    ldr     r5, [r11]                       ; load up filter coefficients
+    mov     r12, r4                         ; outer-loop counter = width, since we work on transposed data matrix
+    mov     r11, r1
+
+    cmp     r5, #128                        ; if filter coef = 128, then skip the filter
+    beq     bil_null_2nd_filter
+
+|bil_height_loop_2nd|
+    ldr     r6, [r0]                        ; load the data
+    ldr     r8, [r0, #4]
+    ldrh    r10, [r0, #8]
+    mov     lr, r3, lsr #2                  ; loop counter
+
+|bil_width_loop_2nd|
+    pkhtb   r7, r6, r8                      ; src[1] | src[2]
+    pkhtb   r9, r8, r10                     ; src[3] | src[4]
+
+    smuad   r6, r6, r5                      ; apply filter
+    smuad   r8, r8, r5                      ; apply filter
+
+    subs    lr, lr, #1
+
+    smuadx  r7, r7, r5                      ; apply filter
+    smuadx  r9, r9, r5                      ; apply filter
+
+    add     r0, r0, #8
+
+    add     r6, r6, #0x40                   ; round_shift_and_clamp
+    add     r7, r7, #0x40
+    usat    r6, #8, r6, asr #7
+    usat    r7, #8, r7, asr #7
+    strb    r6, [r1], r2                    ; the result is transposed back and stored
+
+    add     r8, r8, #0x40                   ; round_shift_and_clamp
+    strb    r7, [r1], r2
+    add     r9, r9, #0x40
+    usat    r8, #8, r8, asr #7
+    usat    r9, #8, r9, asr #7
+    strb    r8, [r1], r2                    ; the result is transposed back and stored
+
+    ldrne   r6, [r0]                        ; load data
+    strb    r9, [r1], r2
+    ldrne   r8, [r0, #4]
+    ldrneh  r10, [r0, #8]
+
+    bne     bil_width_loop_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4                      ; update src for next row
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_2nd
+    ldmia   sp!, {r4 - r11, pc}
+
+|bil_null_2nd_filter|
+|bil_height_loop_null_2nd|
+    mov     lr, r3, lsr #2
+
+|bil_width_loop_null_2nd|
+    ldr     r6, [r0], #4                    ; load data
+    subs    lr, lr, #1
+    ldr     r8, [r0], #4
+
+    strb    r6, [r1], r2                    ; store data
+    mov     r7, r6, lsr #16
+    strb    r7, [r1], r2
+    mov     r9, r8, lsr #16
+    strb    r8, [r1], r2
+    strb    r9, [r1], r2
+
+    bne     bil_width_loop_null_2nd
+
+    subs    r12, r12, #1
+    add     r0, r0, #4
+    add     r11, r11, #1
+    mov     r1, r11
+
+    bne     bil_height_loop_null_2nd
+
+    ldmia   sp!, {r4 - r11, pc}
+    ENDP  ; |vpx_filter_block2d_bil_second_pass_media|
+
+    END
diff --git a/vpx_dsp/arm/subpel_variance_media.c b/vpx_dsp/arm/subpel_variance_media.c
new file mode 100644
index 0000000..e7d8c85
--- /dev/null
+++ b/vpx_dsp/arm/subpel_variance_media.c
@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+#if HAVE_MEDIA
+static const int16_t bilinear_filters_media[8][2] = {
+  { 128,   0 },
+  { 112,  16 },
+  {  96,  32 },
+  {  80,  48 },
+  {  64,  64 },
+  {  48,  80 },
+  {  32,  96 },
+  {  16, 112 }
+};
+
+extern void vpx_filter_block2d_bil_first_pass_media(const uint8_t *src_ptr,
+                                                    uint16_t *dst_ptr,
+                                                    uint32_t src_pitch,
+                                                    uint32_t height,
+                                                    uint32_t width,
+                                                    const int16_t *filter);
+
+extern void vpx_filter_block2d_bil_second_pass_media(const uint16_t *src_ptr,
+                                                     uint8_t *dst_ptr,
+                                                     int32_t src_pitch,
+                                                     uint32_t height,
+                                                     uint32_t width,
+                                                     const int16_t *filter);
+
+
+unsigned int vpx_sub_pixel_variance8x8_media(const uint8_t *src_ptr,
+                                             int src_pixels_per_line,
+                                             int xoffset, int yoffset,
+                                             const uint8_t *dst_ptr,
+                                             int dst_pixels_per_line,
+                                             unsigned int *sse) {
+  uint16_t first_pass[10*8];
+  uint8_t  second_pass[8*8];
+  const int16_t *HFilter, *VFilter;
+
+  HFilter = bilinear_filters_media[xoffset];
+  VFilter = bilinear_filters_media[yoffset];
+
+  vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                          src_pixels_per_line,
+                                          9, 8, HFilter);
+  vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+                                           8, 8, 8, VFilter);
+
+  return vpx_variance8x8_media(second_pass, 8, dst_ptr,
+                               dst_pixels_per_line, sse);
+}
+
+unsigned int vpx_sub_pixel_variance16x16_media(const uint8_t *src_ptr,
+                                               int src_pixels_per_line,
+                                               int xoffset,
+                                               int yoffset,
+                                               const uint8_t *dst_ptr,
+                                               int dst_pixels_per_line,
+                                               unsigned int *sse) {
+  uint16_t first_pass[36*16];
+  uint8_t  second_pass[20*16];
+  const int16_t *HFilter, *VFilter;
+  unsigned int var;
+
+  if (xoffset == 4 && yoffset == 0) {
+    var = vpx_variance_halfpixvar16x16_h_media(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  } else if (xoffset == 0 && yoffset == 4) {
+    var = vpx_variance_halfpixvar16x16_v_media(src_ptr, src_pixels_per_line,
+                                               dst_ptr, dst_pixels_per_line,
+                                               sse);
+  } else if (xoffset == 4 && yoffset == 4) {
+    var = vpx_variance_halfpixvar16x16_hv_media(src_ptr, src_pixels_per_line,
+                                                dst_ptr, dst_pixels_per_line,
+                                                sse);
+  } else {
+    HFilter = bilinear_filters_media[xoffset];
+    VFilter = bilinear_filters_media[yoffset];
+
+    vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass,
+                                            src_pixels_per_line,
+                                            17, 16, HFilter);
+    vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass,
+                                             16, 16, 16, VFilter);
+
+    var = vpx_variance16x16_media(second_pass, 16, dst_ptr,
+                                  dst_pixels_per_line, sse);
+  }
+  return var;
+}
+#endif  // HAVE_MEDIA
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vpx_dsp/arm/subpel_variance_neon.c
similarity index 89%
rename from vp9/encoder/arm/neon/vp9_variance_neon.c
rename to vpx_dsp/arm/subpel_variance_neon.c
index 0ac194e..40e2cc8 100644
--- a/vp9/encoder/arm/neon/vp9_variance_neon.c
+++ b/vpx_dsp/arm/subpel_variance_neon.c
@@ -9,16 +9,15 @@
  */
 
 #include <arm_neon.h>
-#include "./vp9_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
-#include "vp9/common/vp9_filter.h"
+#include "vpx_dsp/variance.h"
 
-static uint8_t bilinear_filters[8][2] = {
+static const uint8_t bilinear_filters[8][2] = {
   { 128,   0, },
   { 112,  16, },
   {  96,  32, },
@@ -35,9 +34,9 @@
                                       int pixel_step,
                                       unsigned int output_height,
                                       unsigned int output_width,
-                                      const uint8_t *vp9_filter) {
-  const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
+                                      const uint8_t *filter) {
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);
   unsigned int i;
   for (i = 0; i < output_height; ++i) {
     const uint8x8_t src_0 = vld1_u8(&src_ptr[0]);
@@ -58,9 +57,9 @@
                                        int pixel_step,
                                        unsigned int output_height,
                                        unsigned int output_width,
-                                       const uint8_t *vp9_filter) {
-  const uint8x8_t f0 = vmov_n_u8(vp9_filter[0]);
-  const uint8x8_t f1 = vmov_n_u8(vp9_filter[1]);
+                                       const uint8_t *filter) {
+  const uint8x8_t f0 = vmov_n_u8(filter[0]);
+  const uint8x8_t f1 = vmov_n_u8(filter[1]);
   unsigned int i, j;
   for (i = 0; i < output_height; ++i) {
     for (j = 0; j < output_width; j += 16) {
@@ -80,7 +79,7 @@
   }
 }
 
-unsigned int vp9_sub_pixel_variance8x8_neon(const uint8_t *src,
+unsigned int vpx_sub_pixel_variance8x8_neon(const uint8_t *src,
                                             int src_stride,
                                             int xoffset,
                                             int yoffset,
@@ -98,7 +97,7 @@
   return vpx_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
 }
 
-unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
+unsigned int vpx_sub_pixel_variance16x16_neon(const uint8_t *src,
                                               int src_stride,
                                               int xoffset,
                                               int yoffset,
@@ -116,7 +115,7 @@
   return vpx_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
 }
 
-unsigned int vp9_sub_pixel_variance32x32_neon(const uint8_t *src,
+unsigned int vpx_sub_pixel_variance32x32_neon(const uint8_t *src,
                                               int src_stride,
                                               int xoffset,
                                               int yoffset,
@@ -134,7 +133,7 @@
   return vpx_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
 }
 
-unsigned int vp9_sub_pixel_variance64x64_neon(const uint8_t *src,
+unsigned int vpx_sub_pixel_variance64x64_neon(const uint8_t *src,
                                               int src_stride,
                                               int xoffset,
                                               int yoffset,
diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vpx_dsp/arm/subtract_neon.c
similarity index 97%
rename from vp9/encoder/arm/neon/vp9_subtract_neon.c
rename to vpx_dsp/arm/subtract_neon.c
index b4bf567..7b14609 100644
--- a/vp9/encoder/arm/neon/vp9_subtract_neon.c
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -9,12 +9,11 @@
  */
 
 #include <arm_neon.h>
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
 
+#include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 
-void vp9_subtract_block_neon(int rows, int cols,
+void vpx_subtract_block_neon(int rows, int cols,
                              int16_t *diff, ptrdiff_t diff_stride,
                              const uint8_t *src, ptrdiff_t src_stride,
                              const uint8_t *pred, ptrdiff_t pred_stride) {
diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
similarity index 97%
rename from vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
rename to vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
index 3668dc5..dab845a 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_h_armv6.asm
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_variance_halfpixvar16x16_h_armv6|
+    EXPORT  |vpx_variance_halfpixvar16x16_h_media|
 
     ARM
     REQUIRE8
@@ -22,7 +22,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_h_armv6| PROC
+|vpx_variance_halfpixvar16x16_h_media| PROC
 
     stmfd   sp!, {r4-r12, lr}
 
diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
similarity index 97%
rename from vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
rename to vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
index b4e0959..01953b7 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6.asm
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_variance_halfpixvar16x16_hv_armv6|
+    EXPORT  |vpx_variance_halfpixvar16x16_hv_media|
 
     ARM
     REQUIRE8
@@ -22,7 +22,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_hv_armv6| PROC
+|vpx_variance_halfpixvar16x16_hv_media| PROC
 
     stmfd   sp!, {r4-r12, lr}
 
diff --git a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
similarity index 97%
rename from vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
rename to vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
index 10863e2..0d17acb 100644
--- a/vp8/common/arm/armv6/vp8_variance_halfpixvar16x16_v_armv6.asm
+++ b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm
@@ -9,7 +9,7 @@
 ;
 
 
-    EXPORT  |vp8_variance_halfpixvar16x16_v_armv6|
+    EXPORT  |vpx_variance_halfpixvar16x16_v_media|
 
     ARM
     REQUIRE8
@@ -22,7 +22,7 @@
 ; r2    unsigned char *ref_ptr
 ; r3    int  recon_stride
 ; stack unsigned int *sse
-|vp8_variance_halfpixvar16x16_v_armv6| PROC
+|vpx_variance_halfpixvar16x16_v_media| PROC
 
     stmfd   sp!, {r4-r12, lr}
 
diff --git a/vpx_dsp/mips/macros_msa.h b/vpx_dsp/mips/macros_msa.h
index 30e152b..df8141b 100644
--- a/vpx_dsp/mips/macros_msa.h
+++ b/vpx_dsp/mips/macros_msa.h
@@ -24,7 +24,34 @@
 #define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
 #define LD_SH(...) LD_H(v8i16, __VA_ARGS__)
 
+#define LD_W(RTYPE, psrc) *((const RTYPE *)(psrc))
+#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
+
+#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
+#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)
+
+#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)
+
+#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
+#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
+
 #if (__mips_isa_rev >= 6)
+#define LH(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint16_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "lh  %[val_m],  %[psrc_m]  \n\t"              \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
 #define LW(psrc) ({                                 \
   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
   uint32_t val_m;                                   \
@@ -38,7 +65,88 @@
                                                     \
   val_m;                                            \
 })
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ld  %[val_m],  %[psrc_m]  \n\t"              \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);         \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+                                                           \
+  val0_m = LW(psrc_m);                                     \
+  val1_m = LW(psrc_m + 4);                                 \
+                                                           \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint16_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sh  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SW(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint32_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sw  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
+
+#define SD(val, pdst) {                 \
+  uint8_t *pdst_m = (uint8_t *)(pdst);  \
+  const uint64_t val_m = (val);         \
+                                        \
+  __asm__ __volatile__ (                \
+      "sd  %[val_m],  %[pdst_m]  \n\t"  \
+                                        \
+      : [pdst_m] "=m" (*pdst_m)         \
+      : [val_m] "r" (val_m)             \
+  );                                    \
+}
 #else  // !(__mips_isa_rev >= 6)
+#define LH(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint16_t val_m;                                   \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "ulh  %[val_m],  %[psrc_m]  \n\t"             \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+
 #define LW(psrc) ({                                 \
   const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
   uint32_t val_m;                                   \
@@ -52,6 +160,72 @@
                                                     \
   val_m;                                            \
 })
+
+#if (__mips == 64)
+#define LD(psrc) ({                                 \
+  const uint8_t *psrc_m = (const uint8_t *)(psrc);  \
+  uint64_t val_m = 0;                               \
+                                                    \
+  __asm__ __volatile__ (                            \
+      "uld  %[val_m],  %[psrc_m]  \n\t"             \
+                                                    \
+      : [val_m] "=r" (val_m)                        \
+      : [psrc_m] "m" (*psrc_m)                      \
+  );                                                \
+                                                    \
+  val_m;                                            \
+})
+#else  // !(__mips == 64)
+#define LD(psrc) ({                                        \
+  const uint8_t *psrc_m1 = (const uint8_t *)(psrc);        \
+  uint32_t val0_m, val1_m;                                 \
+  uint64_t val_m = 0;                                      \
+                                                           \
+  val0_m = LW(psrc_m1);                                    \
+  val1_m = LW(psrc_m1 + 4);                                \
+                                                           \
+  val_m = (uint64_t)(val1_m);                              \
+  val_m = (uint64_t)((val_m << 32) & 0xFFFFFFFF00000000);  \
+  val_m = (uint64_t)(val_m | (uint64_t)val0_m);            \
+                                                           \
+  val_m;                                                   \
+})
+#endif  // (__mips == 64)
+
+#define SH(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint16_t val_m = (val);          \
+                                         \
+  __asm__ __volatile__ (                 \
+      "ush  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#define SW(val, pdst) {                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);   \
+  const uint32_t val_m = (val);          \
+                                         \
+  __asm__ __volatile__ (                 \
+      "usw  %[val_m],  %[pdst_m]  \n\t"  \
+                                         \
+      : [pdst_m] "=m" (*pdst_m)          \
+      : [val_m] "r" (val_m)              \
+  );                                     \
+}
+
+#define SD(val, pdst) {                                     \
+  uint8_t *pdst_m1 = (uint8_t *)(pdst);                     \
+  uint32_t val0_m, val1_m;                                  \
+                                                            \
+  val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF);          \
+  val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF);  \
+                                                            \
+  SW(val0_m, pdst_m1);                                      \
+  SW(val1_m, pdst_m1 + 4);                                  \
+}
 #endif  // (__mips_isa_rev >= 6)
 
 /* Description : Load 4 words with stride
@@ -69,6 +243,49 @@
   out3 = LW((psrc) + 3 * stride);                    \
 }
 
+/* Description : Load double words with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+   Details     : Load double word in 'out0' from (psrc)
+                 Load double word in 'out1' from (psrc + stride)
+*/
+#define LD2(psrc, stride, out0, out1) {  \
+  out0 = LD((psrc));                     \
+  out1 = LD((psrc) + stride);            \
+}
+#define LD4(psrc, stride, out0, out1, out2, out3) {  \
+  LD2((psrc), stride, out0, out1);                   \
+  LD2((psrc) + 2 * stride, stride, out2, out3);      \
+}
+
+/* Description : Store 4 words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store word from 'in0' to (pdst)
+                 Store word from 'in1' to (pdst + stride)
+                 Store word from 'in2' to (pdst + 2 * stride)
+                 Store word from 'in3' to (pdst + 3 * stride)
+*/
+#define SW4(in0, in1, in2, in3, pdst, stride) {  \
+  SW(in0, (pdst))                                \
+  SW(in1, (pdst) + stride);                      \
+  SW(in2, (pdst) + 2 * stride);                  \
+  SW(in3, (pdst) + 3 * stride);                  \
+}
+
+/* Description : Store 4 double words with stride
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Store double word from 'in0' to (pdst)
+                 Store double word from 'in1' to (pdst + stride)
+                 Store double word from 'in2' to (pdst + 2 * stride)
+                 Store double word from 'in3' to (pdst + 3 * stride)
+*/
+#define SD4(in0, in1, in2, in3, pdst, stride) {  \
+  SD(in0, (pdst))                                \
+  SD(in1, (pdst) + stride);                      \
+  SD(in2, (pdst) + 2 * stride);                  \
+  SD(in3, (pdst) + 3 * stride);                  \
+}
+
 /* Description : Load vectors with 16 byte elements with stride
    Arguments   : Inputs  - psrc, stride
                  Outputs - out0, out1
@@ -81,6 +298,7 @@
   out1 = LD_B(RTYPE, (psrc) + stride);            \
 }
 #define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
+#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)
 
 #define LD_B3(RTYPE, psrc, stride, out0, out1, out2) {  \
   LD_B2(RTYPE, (psrc), stride, out0, out1);             \
@@ -93,12 +311,29 @@
   LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3);     \
 }
 #define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
+#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)
 
 #define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) {  \
   LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
   out4 = LD_B(RTYPE, (psrc) + 4 * stride);                          \
 }
 #define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
+#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)
+
+#define LD_B7(RTYPE, psrc, stride,                             \
+              out0, out1, out2, out3, out4, out5, out6) {      \
+  LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
+  LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
+}
+#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)
+
+#define LD_B8(RTYPE, psrc, stride,                                    \
+              out0, out1, out2, out3, out4, out5, out6, out7) {       \
+  LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+  LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
+#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
 
 /* Description : Load vectors with 8 halfword elements with stride
    Arguments   : Inputs  - psrc, stride
@@ -110,6 +345,7 @@
   out0 = LD_H(RTYPE, (psrc));                     \
   out1 = LD_H(RTYPE, (psrc) + (stride));          \
 }
+#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)
 
 #define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3) {  \
   LD_H2(RTYPE, (psrc), stride, out0, out1);                   \
@@ -117,6 +353,229 @@
 }
 #define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)
 
+#define LD_H8(RTYPE, psrc, stride,                                    \
+              out0, out1, out2, out3, out4, out5, out6, out7) {       \
+  LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
+  LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
+}
+#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)
+
+#define LD_H16(RTYPE, psrc, stride,                                     \
+               out0, out1, out2, out3, out4, out5, out6, out7,          \
+               out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  LD_H8(RTYPE, (psrc), stride,                                          \
+        out0, out1, out2, out3, out4, out5, out6, out7);                \
+  LD_H8(RTYPE, (psrc) + 8 * stride, stride,                             \
+        out8, out9, out10, out11, out12, out13, out14, out15);          \
+}
+#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
+
+/* Description : Load 4x4 block of signed halfword elements from 1D source
+                 data into 4 vectors (Each vector with 4 signed halfwords)
+   Arguments   : Input   - psrc
+                 Outputs - out0, out1, out2, out3
+*/
+#define LD4x4_SH(psrc, out0, out1, out2, out3) {         \
+  out0 = LD_SH(psrc);                                    \
+  out2 = LD_SH(psrc + 8);                                \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);  \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)out2, (v2i64)out2);  \
+}
+
+/* Description : Load 2 vectors of signed word elements with stride
+   Arguments   : Inputs  - psrc, stride
+                 Outputs - out0, out1
+                 Return Type - signed word
+*/
+#define LD_SW2(psrc, stride, out0, out1) {  \
+  out0 = LD_SW((psrc));                     \
+  out1 = LD_SW((psrc) + stride);            \
+}
+
+/* Description : Store vectors of 16 byte elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 16 byte elements from 'in0' to (pdst)
+                 Store 16 byte elements from 'in1' to (pdst + stride)
+*/
+#define ST_B2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_B(RTYPE, in0, (pdst));                     \
+  ST_B(RTYPE, in1, (pdst) + stride);            \
+}
+#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
+
+#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
+  ST_B2(RTYPE, in0, in1, (pdst), stride);                 \
+  ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
+}
+#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
+
+#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,      \
+              pdst, stride) {                                     \
+  ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
+  ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
+}
+#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
+
+/* Description : Store vectors of 8 halfword elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 8 halfword elements from 'in0' to (pdst)
+                 Store 8 halfword elements from 'in1' to (pdst + stride)
+*/
+#define ST_H2(RTYPE, in0, in1, pdst, stride) {  \
+  ST_H(RTYPE, in0, (pdst));                     \
+  ST_H(RTYPE, in1, (pdst) + stride);            \
+}
+#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)
+
+#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride) {  \
+  ST_H2(RTYPE, in0, in1, (pdst), stride);                 \
+  ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);    \
+}
+#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)
+
+#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) {  \
+  ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                           \
+  ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);              \
+}
+#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
+
+/* Description : Store vectors of word elements with stride
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Store 4 word elements from 'in0' to (pdst)
+                 Store 4 word elements from 'in1' to (pdst + stride)
+*/
+#define ST_SW2(in0, in1, pdst, stride) {  \
+  ST_SW(in0, (pdst));                     \
+  ST_SW(in1, (pdst) + stride);            \
+}
+
+/* Description : Store 2x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in, stidx, pdst, stride
+   Details     : Index 'stidx' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst)
+                 Index 'stidx+1' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + stride)
+                 Index 'stidx+2' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 2 * stride)
+                 Index 'stidx+3' halfword element from 'in' vector is copied to
+                 the GP register and stored to (pdst + 3 * stride)
+*/
+#define ST2x4_UB(in, stidx, pdst, stride) {         \
+  uint16_t out0_m, out1_m, out2_m, out3_m;          \
+  uint8_t *pblk_2x4_m = (uint8_t *)(pdst);          \
+                                                    \
+  out0_m = __msa_copy_u_h((v8i16)in, (stidx));      \
+  out1_m = __msa_copy_u_h((v8i16)in, (stidx + 1));  \
+  out2_m = __msa_copy_u_h((v8i16)in, (stidx + 2));  \
+  out3_m = __msa_copy_u_h((v8i16)in, (stidx + 3));  \
+                                                    \
+  SH(out0_m, pblk_2x4_m);                           \
+  SH(out1_m, pblk_2x4_m + stride);                  \
+  SH(out2_m, pblk_2x4_m + 2 * stride);              \
+  SH(out3_m, pblk_2x4_m + 3 * stride);              \
+}
+
+/* Description : Store 4x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 word element from 'in' vector is copied to the GP
+                 register and stored to (pdst)
+                 Index 1 word element from 'in' vector is copied to the GP
+                 register and stored to (pdst + stride)
+*/
+#define ST4x2_UB(in, pdst, stride) {        \
+  uint32_t out0_m, out1_m;                  \
+  uint8_t *pblk_4x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_w((v4i32)in, 0);    \
+  out1_m = __msa_copy_u_w((v4i32)in, 1);    \
+                                            \
+  SW(out0_m, pblk_4x2_m);                   \
+  SW(out1_m, pblk_4x2_m + stride);          \
+}
+
+/* Description : Store 4x4 byte block to destination memory from input vector
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : 'Idx0' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst)
+                 'Idx1' word element from input vector 'in0' is copied to the
+                 GP register and stored to (pdst + stride)
+                 'Idx2' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 'Idx3' word element from input vector 'in1' is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) {  \
+  uint32_t out0_m, out1_m, out2_m, out3_m;                          \
+  uint8_t *pblk_4x4_m = (uint8_t *)(pdst);                          \
+                                                                    \
+  out0_m = __msa_copy_u_w((v4i32)in0, idx0);                        \
+  out1_m = __msa_copy_u_w((v4i32)in0, idx1);                        \
+  out2_m = __msa_copy_u_w((v4i32)in1, idx2);                        \
+  out3_m = __msa_copy_u_w((v4i32)in1, idx3);                        \
+                                                                    \
+  SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);          \
+}
+#define ST4x8_UB(in0, in1, pdst, stride) {                        \
+  uint8_t *pblk_4x8 = (uint8_t *)(pdst);                          \
+                                                                  \
+  ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
+  ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
+}
+
+/* Description : Store 8x1 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+*/
+#define ST8x1_UB(in, pdst) {              \
+  uint64_t out0_m;                        \
+                                          \
+  out0_m = __msa_copy_u_d((v2i64)in, 0);  \
+  SD(out0_m, pdst);                       \
+}
+
+/* Description : Store 8x2 byte block to destination memory from input vector
+   Arguments   : Inputs - in, pdst, stride
+   Details     : Index 0 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in' vector is copied to the
+                 GP register and stored to (pdst + stride)
+*/
+#define ST8x2_UB(in, pdst, stride) {        \
+  uint64_t out0_m, out1_m;                  \
+  uint8_t *pblk_8x2_m = (uint8_t *)(pdst);  \
+                                            \
+  out0_m = __msa_copy_u_d((v2i64)in, 0);    \
+  out1_m = __msa_copy_u_d((v2i64)in, 1);    \
+                                            \
+  SD(out0_m, pblk_8x2_m);                   \
+  SD(out1_m, pblk_8x2_m + stride);          \
+}
+
+/* Description : Store 8x4 byte block to destination memory from input
+                 vectors
+   Arguments   : Inputs - in0, in1, pdst, stride
+   Details     : Index 0 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst)
+                 Index 1 double word element from 'in0' vector is copied to the
+                 GP register and stored to (pdst + stride)
+                 Index 0 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 2 * stride)
+                 Index 1 double word element from 'in1' vector is copied to the
+                 GP register and stored to (pdst + 3 * stride)
+*/
+#define ST8x4_UB(in0, in1, pdst, stride) {                  \
+  uint64_t out0_m, out1_m, out2_m, out3_m;                  \
+  uint8_t *pblk_8x4_m = (uint8_t *)(pdst);                  \
+                                                            \
+  out0_m = __msa_copy_u_d((v2i64)in0, 0);                   \
+  out1_m = __msa_copy_u_d((v2i64)in0, 1);                   \
+  out2_m = __msa_copy_u_d((v2i64)in1, 0);                   \
+  out3_m = __msa_copy_u_d((v2i64)in1, 1);                   \
+                                                            \
+  SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
+}
+
 /* Description : average with rounding (in0 + in1 + 1) / 2.
    Arguments   : Inputs  - in0, in1, in2, in3,
                  Outputs - out0, out1
@@ -138,6 +597,27 @@
 }
 #define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
 
+/* Description : Immediate number of elements to slide with zero
+   Arguments   : Inputs  - in0, in1, slide_val
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
+                 value specified in the 'slide_val'
+*/
+#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val) {          \
+  v16i8 zero_m = { 0 };                                              \
+  out0 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in0, slide_val);  \
+  out1 = (RTYPE)__msa_sldi_b((v16i8)zero_m, (v16i8)in1, slide_val);  \
+}
+#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)
+
+#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
+                  out0, out1, out2, out3, slide_val) {  \
+  SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);    \
+  SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);    \
+}
+#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
+
 /* Description : Immediate number of elements to slide
    Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                  Outputs - out0, out1
@@ -150,6 +630,148 @@
   out1 = (RTYPE)__msa_sldi_b((v16i8)in0_1, (v16i8)in1_1, slide_val);         \
 }
 #define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
+#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
+
+#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,      \
+                out0, out1, out2, slide_val) {                        \
+  SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)   \
+  out2 = (RTYPE)__msa_sldi_b((v16i8)in0_2, (v16i8)in1_2, slide_val);  \
+}
+#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
+#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
+
+/* Description : Shuffle byte vector elements as per mask vector
+   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Byte elements from 'in0' & 'in1' are copied selectively to
+                 'out0' as per control vector 'mask0'
+*/
+#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) {  \
+  out0 = (RTYPE)__msa_vshf_b((v16i8)mask0, (v16i8)in1, (v16i8)in0);     \
+  out1 = (RTYPE)__msa_vshf_b((v16i8)mask1, (v16i8)in3, (v16i8)in2);     \
+}
+#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
+#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
+#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
+
+#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,     \
+                out0, out1, out2, out3) {                        \
+  VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
+  VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
+}
+#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
+#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Unsigned byte elements from 'mult0' are multiplied with
+                 unsigned byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. unsigned halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_u_h((v16u8)mult0, (v16u8)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_u_h((v16u8)mult1, (v16u8)cnst1);        \
+}
+#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)
+
+#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,         \
+                 cnst0, cnst1, cnst2, cnst3,                \
+                 out0, out1, out2, out3) {                  \
+  DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+  DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)
+
+/* Description : Dot product of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_h((v16i8)mult0, (v16i8)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_h((v16i8)mult1, (v16i8)cnst1);        \
+}
+#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)
+
+#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
+                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
+  DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
+  DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
+}
+#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)
+
+/* Description : Dot product of halfword vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'mult0' are multiplied with
+                 signed halfword elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_w((v8i16)mult0, (v8i16)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_w((v8i16)mult1, (v8i16)cnst1);        \
+}
+#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)
+
+#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,         \
+                 cnst0, cnst1, cnst2, cnst3,                \
+                 out0, out1, out2, out3) {                  \
+  DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
+  DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
+}
+#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
+
+/* Description : Dot product of word vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed word elements from 'mult0' are multiplied with
+                 signed word elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed double word.
+                 The multiplication result of adjacent odd-even elements
+                 are added together and written to the 'out0' vector
+*/
+#define DOTP_SW2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {  \
+  out0 = (RTYPE)__msa_dotp_s_d((v4i32)mult0, (v4i32)cnst0);        \
+  out1 = (RTYPE)__msa_dotp_s_d((v4i32)mult1, (v4i32)cnst1);        \
+}
+#define DOTP_SW2_SD(...) DOTP_SW2(v2i64, __VA_ARGS__)
+
+/* Description : Dot product & addition of byte vector elements
+   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed byte elements from 'mult0' are multiplied with
+                 signed byte elements from 'cnst0' producing a result
+                 twice the size of input i.e. signed halfword.
+                 The multiplication result of adjacent odd-even elements
+                 are added to the 'out0' vector
+*/
+#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) {         \
+  out0 = (RTYPE)__msa_dpadd_s_h((v8i16)out0, (v16i8)mult0, (v16i8)cnst0);  \
+  out1 = (RTYPE)__msa_dpadd_s_h((v8i16)out1, (v16i8)mult1, (v16i8)cnst1);  \
+}
+#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)
+
+#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                     \
+                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) {  \
+  DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);              \
+  DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);              \
+}
+#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
 
 /* Description : Dot product & addition of halfword vector elements
    Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
@@ -172,7 +794,7 @@
                  Outputs - out0, out1
                  Return Type - as per RTYPE
    Details     : Each signed word element from 'mult0' is multiplied with itself
-                 producing an intermediate result twice the size of it
+                 producing an intermediate result twice the size of input
                  i.e. signed double word
                  The multiplication result of adjacent odd-even elements
                  are added to the 'out0' vector
@@ -183,6 +805,49 @@
 }
 #define DPADD_SD2_SD(...) DPADD_SD2(v2i64, __VA_ARGS__)
 
+/* Description : Minimum values between unsigned elements of
+                 either vector are copied to the output vector
+   Arguments   : Inputs  - in0, in1, min_vec
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Minimum of unsigned halfword element values from 'in0' and
+                 'min_vec' are written to output vector 'in0'
+*/
+#define MIN_UH2(RTYPE, in0, in1, min_vec) {         \
+  in0 = (RTYPE)__msa_min_u_h((v8u16)in0, min_vec);  \
+  in1 = (RTYPE)__msa_min_u_h((v8u16)in1, min_vec);  \
+}
+#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)
+
+#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) {  \
+  MIN_UH2(RTYPE, in0, in1, min_vec);                   \
+  MIN_UH2(RTYPE, in2, in3, min_vec);                   \
+}
+#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Clips all signed halfword elements of input vector
+                 between 0 & 255
+   Arguments   : Input  - in
+                 Output - out_m
+                 Return Type - signed halfword
+*/
+#define CLIP_SH_0_255(in) ({                          \
+  v8i16 max_m = __msa_ldi_h(255);                     \
+  v8i16 out_m;                                        \
+                                                      \
+  out_m = __msa_maxi_s_h((v8i16)in, 0);               \
+  out_m = __msa_min_s_h((v8i16)max_m, (v8i16)out_m);  \
+  out_m;                                              \
+})
+#define CLIP_SH2_0_255(in0, in1) {  \
+  in0 = CLIP_SH_0_255(in0);         \
+  in1 = CLIP_SH_0_255(in1);         \
+}
+#define CLIP_SH4_0_255(in0, in1, in2, in3) {  \
+  CLIP_SH2_0_255(in0, in1);                   \
+  CLIP_SH2_0_255(in2, in3);                   \
+}
+
 /* Description : Horizontal addition of 4 signed word elements of input vector
    Arguments   : Input  - in       (signed word vector)
                  Output - sum_m    (i32 sum)
@@ -221,6 +886,26 @@
   sum_m;                                             \
 })
 
+/* Description : Horizontal addition of unsigned byte vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each unsigned odd byte element from 'in0' is added to
+                 even unsigned byte element from 'in0' (pairwise) and the
+                 halfword result is written to 'out0'
+*/
+#define HADD_UB2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hadd_u_h((v16u8)in0, (v16u8)in0);  \
+  out1 = (RTYPE)__msa_hadd_u_h((v16u8)in1, (v16u8)in1);  \
+}
+#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)
+
+#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  HADD_UB2(RTYPE, in0, in1, out0, out1);                               \
+  HADD_UB2(RTYPE, in2, in3, out2, out3);                               \
+}
+#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
+
 /* Description : Horizontal subtraction of unsigned byte vector elements
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -256,12 +941,32 @@
   sad_m;                                                    \
 })
 
+/* Description : Horizontal subtraction of signed halfword vector elements
+   Arguments   : Inputs  - in0, in1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Each signed odd halfword element from 'in0' is subtracted from
+                 even signed halfword element from 'in0' (pairwise) and the
+                 word result is written to 'out0'
+*/
+#define HSUB_UH2(RTYPE, in0, in1, out0, out1) {          \
+  out0 = (RTYPE)__msa_hsub_s_w((v8i16)in0, (v8i16)in0);  \
+  out1 = (RTYPE)__msa_hsub_s_w((v8i16)in1, (v8i16)in1);  \
+}
+#define HSUB_UH2_SW(...) HSUB_UH2(v4i32, __VA_ARGS__)
+
 /* Description : Set element n input vector to GPR value
    Arguments   : Inputs - in0, in1, in2, in3
                  Output - out
                  Return Type - as per RTYPE
    Details     : Set element 0 in vector 'out' to value specified in 'in0'
 */
+#define INSERT_W2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);  \
+}
+#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)
+
 #define INSERT_W4(RTYPE, in0, in1, in2, in3, out) {  \
   out = (RTYPE)__msa_insert_w((v4i32)out, 0, in0);   \
   out = (RTYPE)__msa_insert_w((v4i32)out, 1, in1);   \
@@ -271,6 +976,218 @@
 #define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
 #define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
 
+#define INSERT_D2(RTYPE, in0, in1, out) {           \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 0, in0);  \
+  out = (RTYPE)__msa_insert_d((v2i64)out, 1, in1);  \
+}
+#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
+#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_b((v16i8)in1, (v16i8)in0);     \
+  out1 = (RTYPE)__msa_ilvev_b((v16i8)in3, (v16i8)in2);     \
+}
+#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
+#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave even halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_h((v8i16)in1, (v8i16)in0);     \
+  out1 = (RTYPE)__msa_ilvev_h((v8i16)in3, (v8i16)in2);     \
+}
+#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
+#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
+#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
+
+/* Description : Interleave even word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_w((v4i32)in1, (v4i32)in0);     \
+  out1 = (RTYPE)__msa_ilvev_w((v4i32)in3, (v4i32)in2);     \
+}
+#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
+
+/* Description : Interleave even double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even double word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'
+*/
+#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvev_d((v2i64)in1, (v2i64)in0);     \
+  out1 = (RTYPE)__msa_ilvev_d((v2i64)in3, (v2i64)in2);     \
+}
+#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
+
+/* Description : Interleave left half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);     \
+  out1 = (RTYPE)__msa_ilvl_b((v16i8)in2, (v16i8)in3);     \
+}
+#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
+#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
+#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
+#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
+
+#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
+#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
+
+/* Description : Interleave left half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);     \
+  out1 = (RTYPE)__msa_ilvl_h((v8i16)in2, (v8i16)in3);     \
+}
+#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave left half of word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
+                 and written to 'out0'.
+*/
+#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);     \
+  out1 = (RTYPE)__msa_ilvl_w((v4i32)in2, (v4i32)in3);     \
+}
+#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
+#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
+
+/* Description : Interleave right half of byte elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of byte elements of 'in0' and 'in1' are interleaved
+                 and written to out0.
+*/
+#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvr_b((v16i8)in0, (v16i8)in1);     \
+  out1 = (RTYPE)__msa_ilvr_b((v16i8)in2, (v16i8)in3);     \
+}
+#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
+#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
+#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
+#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
+
+#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
+#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
+#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
+#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
+
+#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                in8, in9, in10, in11, in12, in13, in14, in15,      \
+                out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
+          out0, out1, out2, out3);                                 \
+  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,     \
+          out4, out5, out6, out7);                                 \
+}
+#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
+
+/* Description : Interleave right half of halfword elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of halfword elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);     \
+  out1 = (RTYPE)__msa_ilvr_h((v8i16)in2, (v8i16)in3);     \
+}
+#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
+
+#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
+
+#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);     \
+  out1 = (RTYPE)__msa_ilvr_w((v4i32)in2, (v4i32)in3);     \
+}
+#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
+#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
+
+#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
+
+/* Description : Interleave right half of double word elements from vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Right half of double word elements of 'in0' and 'in1' are
+                 interleaved and written to 'out0'.
+*/
+#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) {   \
+  out0 = (RTYPE)__msa_ilvr_d((v2i64)(in0), (v2i64)(in1));  \
+  out1 = (RTYPE)__msa_ilvr_d((v2i64)(in2), (v2i64)(in3));  \
+}
+#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
+#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
+#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
+
+#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) {  \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                         \
+  out2 = (RTYPE)__msa_ilvr_d((v2i64)(in4), (v2i64)(in5));                 \
+}
+#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
+
+#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
+#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
+
 /* Description : Interleave both left and right half of input vectors
    Arguments   : Inputs  - in0, in1
                  Outputs - out0, out1
@@ -283,13 +1200,138 @@
   out1 = (RTYPE)__msa_ilvl_b((v16i8)in0, (v16i8)in1);  \
 }
 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
+#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
+#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
+#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
 
 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) {        \
   out0 = (RTYPE)__msa_ilvr_h((v8i16)in0, (v8i16)in1);  \
   out1 = (RTYPE)__msa_ilvl_h((v8i16)in0, (v8i16)in1);  \
 }
+#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
 
+#define ILVRL_W2(RTYPE, in0, in1, out0, out1) {        \
+  out0 = (RTYPE)__msa_ilvr_w((v4i32)in0, (v4i32)in1);  \
+  out1 = (RTYPE)__msa_ilvl_w((v4i32)in0, (v4i32)in1);  \
+}
+#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
+#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 unsigned value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range.
+                 The results are written in place
+*/
+#define SAT_UH2(RTYPE, in0, in1, sat_val) {         \
+  in0 = (RTYPE)__msa_sat_u_h((v8u16)in0, sat_val);  \
+  in1 = (RTYPE)__msa_sat_u_h((v8u16)in1, sat_val);  \
+}
+#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
+
+#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
+  SAT_UH2(RTYPE, in0, in1, sat_val);                   \
+  SAT_UH2(RTYPE, in2, in3, sat_val)                    \
+}
+#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
+
+/* Description : Saturate the halfword element values to the max
+                 signed value of (sat_val + 1) bits
+                 The element data width remains unchanged
+   Arguments   : Inputs  - in0, in1, sat_val
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is saturated to the
+                 value generated with (sat_val + 1) bit range
+                 The results are written in place
+*/
+#define SAT_SH2(RTYPE, in0, in1, sat_val) {         \
+  in0 = (RTYPE)__msa_sat_s_h((v8i16)in0, sat_val);  \
+  in1 = (RTYPE)__msa_sat_s_h((v8i16)in1, sat_val);  \
+}
+#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
+
+#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) {  \
+  SAT_SH2(RTYPE, in0, in1, sat_val);                   \
+  SAT_SH2(RTYPE, in2, in3, sat_val);                   \
+}
+#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Indexed halfword element values are replicated to all
+                 elements in output vector
+   Arguments   : Inputs  - in, idx0, idx1
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : 'idx0' element value from 'in' vector is replicated to all
+                  elements in 'out0' vector
+                  Valid index range for halfword operation is 0-7
+*/
+#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) {  \
+  out0 = (RTYPE)__msa_splati_h((v8i16)in, idx0);        \
+  out1 = (RTYPE)__msa_splati_h((v8i16)in, idx1);        \
+}
+#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
+
+#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
+                  out0, out1, out2, out3) {           \
+  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);       \
+  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);       \
+}
+#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
+#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even byte elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even byte elements of 'in0' are copied to the left half of
+                 'out0' & even byte elements of 'in1' are copied to the right
+                 half of 'out0'.
+*/
+#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_pckev_b((v16i8)in0, (v16i8)in1);     \
+  out1 = (RTYPE)__msa_pckev_b((v16i8)in2, (v16i8)in3);     \
+}
+#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
+#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
+#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
+
+#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
+#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
+#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
+
+/* Description : Pack even halfword elements of vector pairs
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Even halfword elements of 'in0' are copied to the left half of
+                 'out0' & even halfword elements of 'in1' are copied to the
+                 right half of 'out0'.
+*/
+#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_pckev_h((v8i16)in0, (v8i16)in1);     \
+  out1 = (RTYPE)__msa_pckev_h((v8i16)in2, (v8i16)in3);     \
+}
+#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
+#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
+
+#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
+
 /* Description : Pack even double word elements of vector pairs
    Arguments   : Inputs  - in0, in1, in2, in3
                  Outputs - out0, out1
@@ -303,6 +1345,7 @@
   out1 = (RTYPE)__msa_pckev_d((v2i64)in2, (v2i64)in3);     \
 }
 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
+#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
 
 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                  out0, out1, out2, out3) {                       \
@@ -311,6 +1354,256 @@
 }
 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
 
+/* Description : Each byte element is logically xor'ed with immediate 128
+   Arguments   : Inputs  - in0, in1
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each unsigned byte element from input vector 'in0' is
+                 logically xor'ed with 128 and the result is stored in-place.
+*/
+#define XORI_B2_128(RTYPE, in0, in1) {         \
+  in0 = (RTYPE)__msa_xori_b((v16u8)in0, 128);  \
+  in1 = (RTYPE)__msa_xori_b((v16u8)in1, 128);  \
+}
+#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
+#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
+
+#define XORI_B3_128(RTYPE, in0, in1, in2) {    \
+  XORI_B2_128(RTYPE, in0, in1);                \
+  in2 = (RTYPE)__msa_xori_b((v16u8)in2, 128);  \
+}
+#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
+
+#define XORI_B4_128(RTYPE, in0, in1, in2, in3) {  \
+  XORI_B2_128(RTYPE, in0, in1);                   \
+  XORI_B2_128(RTYPE, in2, in3);                   \
+}
+#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
+#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
+
+#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) {  \
+  XORI_B4_128(RTYPE, in0, in1, in2, in3);                        \
+  XORI_B3_128(RTYPE, in4, in5, in6);                             \
+}
+#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
+
+/* Description : Average of signed halfword elements -> (a + b) / 2
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each signed halfword element from 'in0' is added to each
+                 signed halfword element of 'in1' with full precision resulting
+                 in one extra bit in the result. The result is then divided by
+                 2 and written to 'out0'
+*/
+#define AVE_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                out0, out1, out2, out3) {                       \
+  out0 = (RTYPE)__msa_ave_s_h((v8i16)in0, (v8i16)in1);          \
+  out1 = (RTYPE)__msa_ave_s_h((v8i16)in2, (v8i16)in3);          \
+  out2 = (RTYPE)__msa_ave_s_h((v8i16)in4, (v8i16)in5);          \
+  out3 = (RTYPE)__msa_ave_s_h((v8i16)in6, (v8i16)in7);          \
+}
+#define AVE_SH4_SH(...) AVE_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Addition of signed halfword elements and signed saturation
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+                 Return Type - as per RTYPE
+   Details     : Signed halfword elements from 'in0' are added to signed
+                 halfword elements of 'in1'. The result is then signed saturated
+                 between halfword data type range
+*/
+#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) {  \
+  out0 = (RTYPE)__msa_adds_s_h((v8i16)in0, (v8i16)in1);    \
+  out1 = (RTYPE)__msa_adds_s_h((v8i16)in2, (v8i16)in3);    \
+}
+#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
+
+#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
+                 out0, out1, out2, out3) {                       \
+  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);               \
+  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);               \
+}
+#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
+
+/* Description : Shift left all elements of vector (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vector 'in0' is left shifted by 'shift' and
+                 the result is written in-place.
+*/
+#define SLLI_4V(in0, in1, in2, in3, shift) {  \
+  in0 = in0 << shift;                         \
+  in1 = in1 << shift;                         \
+  in2 = in2 << shift;                         \
+  in3 = in3 << shift;                         \
+}
+
+/* Description : Arithmetic shift right all elements of vector
+                 (generic for all data types)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - in place operation
+                 Return Type - as per input vector RTYPE
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written in-place. 'shift' is a GP variable.
+*/
+#define SRA_4V(in0, in1, in2, in3, shift) {  \
+  in0 = in0 >> shift;                        \
+  in1 = in1 >> shift;                        \
+  in2 = in2 >> shift;                        \
+  in3 = in3 >> shift;                        \
+}
+
+/* Description : Shift right arithmetic rounded words
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the number of bits in the corresponding element in the vector
+                 'shift'. The last discarded bit is added to shifted value for
+                 rounding and the result is written in-place.
+                 'shift' is a vector.
+*/
+#define SRAR_W2(RTYPE, in0, in1, shift) {               \
+  in0 = (RTYPE)__msa_srar_w((v4i32)in0, (v4i32)shift);  \
+  in1 = (RTYPE)__msa_srar_w((v4i32)in1, (v4i32)shift);  \
+}
+
+#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRAR_W2(RTYPE, in0, in1, shift)                    \
+  SRAR_W2(RTYPE, in2, in3, shift)                    \
+}
+#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
+
+/* Description : Shift right arithmetic rounded (immediate)
+   Arguments   : Inputs  - in0, in1, shift
+                 Outputs - in place operation
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is shifted right arithmetically by
+                 the value in 'shift'. The last discarded bit is added to the
+                 shifted value for rounding and the result is written in-place.
+                 'shift' is an immediate value.
+*/
+#define SRARI_H2(RTYPE, in0, in1, shift) {        \
+  in0 = (RTYPE)__msa_srari_h((v8i16)in0, shift);  \
+  in1 = (RTYPE)__msa_srari_h((v8i16)in1, shift);  \
+}
+#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
+#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
+
+#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRARI_H2(RTYPE, in0, in1, shift);                   \
+  SRARI_H2(RTYPE, in2, in3, shift);                   \
+}
+#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
+#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
+
+#define SRARI_W2(RTYPE, in0, in1, shift) {        \
+  in0 = (RTYPE)__msa_srari_w((v4i32)in0, shift);  \
+  in1 = (RTYPE)__msa_srari_w((v4i32)in1, shift);  \
+}
+#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
+
+#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) {  \
+  SRARI_W2(RTYPE, in0, in1, shift);                   \
+  SRARI_W2(RTYPE, in2, in3, shift);                   \
+}
+#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
+
+/* Description : Logical shift right all elements of vector (immediate)
+   Arguments   : Inputs  - in0, in1, in2, in3, shift
+                 Outputs - out0, out1, out2, out3
+                 Return Type - as per RTYPE
+   Details     : Each element of vector 'in0' is right shifted by 'shift' and
+                 the result is written to 'out0'. 'shift' is an immediate value.
+*/
+#define SRLI_H4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3, shift) {  \
+  out0 = (RTYPE)__msa_srli_h((v8i16)in0, shift);                             \
+  out1 = (RTYPE)__msa_srli_h((v8i16)in1, shift);                             \
+  out2 = (RTYPE)__msa_srli_h((v8i16)in2, shift);                             \
+  out3 = (RTYPE)__msa_srli_h((v8i16)in3, shift);                             \
+}
+#define SRLI_H4_SH(...) SRLI_H4(v8i16, __VA_ARGS__)
+
+/* Description : Multiplication of pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element from 'in0' is multiplied with elements from 'in1'
+                 and the result is written to 'out0'
+*/
+#define MUL2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = in0 * in1;                             \
+  out1 = in2 * in3;                             \
+}
+#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  MUL2(in0, in1, in2, in3, out0, out1);               \
+  MUL2(in4, in5, in6, in7, out2, out3);               \
+}
+
+/* Description : Addition of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in0' is added to 'in1' and result is written
+                 to 'out0'.
+*/
+#define ADD2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = in0 + in1;                             \
+  out1 = in2 + in3;                             \
+}
+#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  ADD2(in0, in1, in2, in3, out0, out1);               \
+  ADD2(in4, in5, in6, in7, out2, out3);               \
+}
+
+/* Description : Subtraction of 2 pairs of vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1
+   Details     : Each element in 'in1' is subtracted from 'in0' and result is
+                 written to 'out0'.
+*/
+#define SUB2(in0, in1, in2, in3, out0, out1) {  \
+  out0 = in0 - in1;                             \
+  out1 = in2 - in3;                             \
+}
+#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7,  \
+             out0, out1, out2, out3) {                \
+  out0 = in0 - in1;                                   \
+  out1 = in2 - in3;                                   \
+  out2 = in4 - in5;                                   \
+  out3 = in6 - in7;                                   \
+}
+
+/* Description : Sign extend halfword elements from right half of the vector
+   Arguments   : Input  - in    (halfword vector)
+                 Output - out   (sign extended word vector)
+                 Return Type - signed word
+   Details     : Sign bit of halfword elements from input vector 'in' is
+                 extracted and interleaved with same vector 'in0' to generate
+                 4 word elements keeping sign intact
+*/
+#define UNPCK_R_SH_SW(in, out) {                 \
+  v8i16 sign_m;                                  \
+                                                 \
+  sign_m = __msa_clti_s_h((v8i16)in, 0);         \
+  out = (v4i32)__msa_ilvr_h(sign_m, (v8i16)in);  \
+}
+
+/* Description : Zero extend unsigned byte elements to halfword elements
+   Arguments   : Input   - in          (unsigned byte vector)
+                 Outputs - out0, out1  (unsigned  halfword vectors)
+                 Return Type - signed halfword
+   Details     : Zero extended right half of vector is returned in 'out0'
+                 Zero extended left half of vector is returned in 'out1'
+*/
+#define UNPCK_UB_SH(in, out0, out1) {   \
+  v16i8 zero_m = { 0 };                 \
+                                        \
+  ILVRL_B2_SH(zero_m, in, out0, out1);  \
+}
+
 /* Description : Sign extend halfword elements from input vector and return
                  the result in pair of vectors
    Arguments   : Input   - in            (halfword vector)
@@ -328,4 +1621,313 @@
   tmp_m = __msa_clti_s_h((v8i16)in, 0);  \
   ILVRL_H2_SW(tmp_m, in, out0, out1);    \
 }
+
+/* Description : Butterfly of 4 input vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+   Details     : Sums of mirrored input pairs first (in0+in3, in1+in2),
+                 then the matching differences in reverse order
+*/
+#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  out0 = in0 + in3;                                                \
+  out1 = in1 + in2;                                                \
+                                                                   \
+  out2 = in1 - in2;                                                \
+  out3 = in0 - in3;                                                \
+}
+
+/* Description : Butterfly of 8 input vectors
+   Arguments   : Inputs  - in0 ...  in7
+                 Outputs - out0 .. out7
+   Details     : Sums of mirrored input pairs (in0+in7 .. in3+in4) in the
+                 first half, matching differences in reverse in the second
+*/
+#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                    out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  out0 = in0 + in7;                                                    \
+  out1 = in1 + in6;                                                    \
+  out2 = in2 + in5;                                                    \
+  out3 = in3 + in4;                                                    \
+                                                                       \
+  out4 = in3 - in4;                                                    \
+  out5 = in2 - in5;                                                    \
+  out6 = in1 - in6;                                                    \
+  out7 = in0 - in7;                                                    \
+}
+
+/* Description : Butterfly of 16 input vectors
+   Arguments   : Inputs  - in0 ...  in15
+                 Outputs - out0 .. out15
+   Details     : Sums of mirrored input pairs (in0+in15 .. in7+in8) in the
+                 first half, matching differences in reverse in the second
+*/
+#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                  \
+                     in8, in9,  in10, in11, in12, in13, in14, in15,           \
+                     out0, out1, out2, out3, out4, out5, out6, out7,          \
+                     out8, out9, out10, out11, out12, out13, out14, out15) {  \
+  out0 = in0 + in15;                                                          \
+  out1 = in1 + in14;                                                          \
+  out2 = in2 + in13;                                                          \
+  out3 = in3 + in12;                                                          \
+  out4 = in4 + in11;                                                          \
+  out5 = in5 + in10;                                                          \
+  out6 = in6 + in9;                                                           \
+  out7 = in7 + in8;                                                           \
+                                                                              \
+  out8 = in7 - in8;                                                           \
+  out9 = in6 - in9;                                                           \
+  out10 = in5 - in10;                                                         \
+  out11 = in4 - in11;                                                         \
+  out12 = in3 - in12;                                                         \
+  out13 = in2 - in13;                                                         \
+  out14 = in1 - in14;                                                         \
+  out15 = in0 - in15;                                                         \
+}
+
+/* Description : Transpose input 8x8 byte block
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+   Details     : Only the low (right) 8 bytes of each input row take part
+                 in the transpose (inputs go through ILVR_B4 first)
+*/
+#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                        out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                           \
+  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                       \
+             tmp0_m, tmp1_m, tmp2_m, tmp3_m);                              \
+  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                             \
+  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                             \
+  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                             \
+  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                             \
+  SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                             \
+  SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                             \
+}
+#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
+
+/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
+                           in8, in9, in10, in11, in12, in13, in14, in15
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - unsigned byte
+   Details     : The output registers double as scratch during the
+                 interleave stages; each is fully overwritten before use.
+                 (Review: dropped two back-to-back duplicate tmp2_m/tmp3_m
+                 ilvod_h assignments - idempotent, pure redundant work.)
+*/
+#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                            in8, in9, in10, in11, in12, in13, in14, in15,      \
+                            out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                        \
+  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                        \
+                                                                               \
+  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                                 \
+  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                               \
+  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                               \
+  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                               \
+                                                                               \
+  tmp0_m = (v16u8)__msa_ilvev_b((v16i8)out6, (v16i8)out7);                     \
+  tmp4_m = (v16u8)__msa_ilvod_b((v16i8)out6, (v16i8)out7);                     \
+  tmp1_m = (v16u8)__msa_ilvev_b((v16i8)out4, (v16i8)out5);                     \
+  tmp5_m = (v16u8)__msa_ilvod_b((v16i8)out4, (v16i8)out5);                     \
+  out5 = (v16u8)__msa_ilvev_b((v16i8)out2, (v16i8)out3);                       \
+  tmp6_m = (v16u8)__msa_ilvod_b((v16i8)out2, (v16i8)out3);                     \
+  out7 = (v16u8)__msa_ilvev_b((v16i8)out0, (v16i8)out1);                       \
+  tmp7_m = (v16u8)__msa_ilvod_b((v16i8)out0, (v16i8)out1);                     \
+                                                                               \
+  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                     \
+  out0 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out4 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp1_m, (v8i16)tmp0_m);                 \
+  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)out7, (v8i16)out5);                     \
+  out2 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out6 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);                 \
+  out1 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out5 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+                                                                               \
+  tmp2_m = (v16u8)__msa_ilvod_h((v8i16)tmp5_m, (v8i16)tmp4_m);                 \
+  tmp3_m = (v16u8)__msa_ilvod_h((v8i16)tmp7_m, (v8i16)tmp6_m);                 \
+  out3 = (v16u8)__msa_ilvev_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+  out7 = (v16u8)__msa_ilvod_w((v4i32)tmp3_m, (v4i32)tmp2_m);                   \
+}
+
+/* Description : Transpose 4x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+   Details     : out1 and out3 are assembled from the upper doublewords of
+                 out0/out2 after the word-level interleave
+*/
+#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 s0_m, s1_m;                                                       \
+                                                                          \
+  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                             \
+  ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                    \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out0);                   \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)out0, (v2i64)out2);                   \
+}
+
+/* Description : Transpose 4x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - signed halfword
+   Details     : A 4x8 source yields only four meaningful transposed rows;
+                 out4..out7 are explicitly cleared to zero
+*/
+#define TRANSPOSE4X8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,            \
+                           out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                       \
+  v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                       \
+  v8i16 zero_m = { 0 };                                                       \
+                                                                              \
+  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,                          \
+             tmp0_n, tmp1_n, tmp2_n, tmp3_n);                                 \
+  ILVRL_W2_SH(tmp1_n, tmp0_n, tmp0_m, tmp2_m);                                \
+  ILVRL_W2_SH(tmp3_n, tmp2_n, tmp1_m, tmp3_m);                                \
+                                                                              \
+  out0 = (v8i16)__msa_ilvr_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
+  out1 = (v8i16)__msa_ilvl_d((v2i64)tmp1_m, (v2i64)tmp0_m);                   \
+  out2 = (v8i16)__msa_ilvr_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
+  out3 = (v8i16)__msa_ilvl_d((v2i64)tmp3_m, (v2i64)tmp2_m);                   \
+                                                                              \
+  out4 = zero_m;                                                              \
+  out5 = zero_m;                                                              \
+  out6 = zero_m;                                                              \
+  out7 = zero_m;                                                              \
+}
+
+/* Description : Transpose 8x4 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed halfword
+*/
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+                                                                          \
+  ILVR_H2_SH(in1, in0, in3, in2, tmp0_m, tmp1_m);                         \
+  ILVL_H2_SH(in1, in0, in3, in2, tmp2_m, tmp3_m);                         \
+  ILVR_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2);                 \
+  ILVL_W2_SH(tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3);                 \
+}
+
+/* Description : Transpose 8x8 block with half word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+                 Return Type - as per RTYPE
+   Details     : Halfword interleaves build 4-column groups, then doubleword
+                 pack-even/pack-odd pairs assemble the final rows
+*/
+#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
+                       out0, out1, out2, out3, out4, out5, out6, out7) {  \
+  v8i16 s0_m, s1_m;                                                       \
+  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
+  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                   \
+                                                                          \
+  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                                \
+  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                                \
+  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                                \
+  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                             \
+  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                                \
+  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,         \
+           tmp3_m, tmp7_m, out0, out2, out4, out6);                       \
+  out1 = (RTYPE)__msa_pckod_d((v2i64)tmp0_m, (v2i64)tmp4_m);              \
+  out3 = (RTYPE)__msa_pckod_d((v2i64)tmp1_m, (v2i64)tmp5_m);              \
+  out5 = (RTYPE)__msa_pckod_d((v2i64)tmp2_m, (v2i64)tmp6_m);              \
+  out7 = (RTYPE)__msa_pckod_d((v2i64)tmp3_m, (v2i64)tmp7_m);              \
+}
+#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
+
+/* Description : Transpose 4x4 block with word elements in vectors
+   Arguments   : Inputs  - in0, in1, in2, in3
+                 Outputs - out0, out1, out2, out3
+                 Return Type - signed word
+   Details     : Word-interleave the row pairs, then doubleword-interleave
+                 the intermediate vectors to form the transposed rows
+*/
+#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) {  \
+  v4i32 s0_m, s1_m, s2_m, s3_m;                                           \
+                                                                          \
+  ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                      \
+  ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                      \
+                                                                          \
+  out0 = (v4i32)__msa_ilvr_d((v2i64)s2_m, (v2i64)s0_m);                   \
+  out1 = (v4i32)__msa_ilvl_d((v2i64)s2_m, (v2i64)s0_m);                   \
+  out2 = (v4i32)__msa_ilvr_d((v2i64)s3_m, (v2i64)s1_m);                   \
+  out3 = (v4i32)__msa_ilvl_d((v2i64)s3_m, (v2i64)s1_m);                   \
+}
+
+/* Description : Add block 4x4
+   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
+   Details     : Least significant 4 bytes from each input vector are added to
+                 the destination bytes, clipped between 0-255 and stored.
+*/
+#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) {     \
+  uint32_t src0_m, src1_m, src2_m, src3_m;                      \
+  v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
+  v16i8 dst0_m = { 0 };                                         \
+  v16i8 dst1_m = { 0 };                                         \
+  v16i8 zero_m = { 0 };                                         \
+                                                                \
+  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)  /* expands to a block; no ';' needed */ \
+  LW4(pdst, stride,  src0_m, src1_m, src2_m, src3_m);           \
+  INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
+  INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
+  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
+  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
+  CLIP_SH2_0_255(res0_m, res1_m);                               \
+  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
+  ST4x4_UB(dst0_m, dst1_m, 0, 1, 0, 1, pdst, stride);           \
+}
+
+/* Description : Pack even elements of input vectors & xor with 128
+   Arguments   : Inputs - in0, in1
+                 Output - out_m
+                 Return Type - unsigned byte
+   Details     : Signed byte even elements from 'in0' and 'in1' are packed
+                 together in one vector and the resulting vector is xor'ed with
+                 128 to shift the range from signed to unsigned byte
+   Note        : Implemented as a GNU statement expression so it can be used
+                 as a value on the right-hand side of an assignment
+*/
+#define PCKEV_XORI128_UB(in0, in1) ({                    \
+  v16u8 out_m;                                           \
+                                                         \
+  out_m = (v16u8)__msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+  out_m = (v16u8)__msa_xori_b((v16u8)out_m, 128);        \
+  out_m;                                                 \
+})
+
+/* Description : Converts inputs to unsigned bytes, interleave, average & store
+                 as 8x4 unsigned byte block
+   Arguments   : Inputs - in0, in1, in2, in3, dst0, dst1, dst2, dst3,
+                          pdst, stride
+   Details     : in0..in3 are packed/xor'ed down to unsigned bytes, averaged
+                 against the paired-up dst rows, then stored as four 8-byte rows
+*/
+#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                      \
+                                dst0, dst1, dst2, dst3, pdst, stride) {  \
+  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+  uint8_t *pdst_m = (uint8_t *)(pdst);                                   \
+                                                                         \
+  tmp0_m = PCKEV_XORI128_UB(in0, in1);                                   \
+  tmp1_m = PCKEV_XORI128_UB(in2, in3);                                   \
+  ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                    \
+  AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);           \
+  ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                              \
+}
+
+/* Description : Pack even byte elements and store byte vector in destination
+                 memory
+   Arguments   : Inputs - in0, in1, pdst
+   Details     : Even bytes of in0 fill the low half of the stored vector,
+                 even bytes of in1 the high half
+*/
+#define PCKEV_ST_SB(in0, in1, pdst) {             \
+  v16i8 tmp_m;                                    \
+                                                  \
+  tmp_m = __msa_pckev_b((v16i8)in1, (v16i8)in0);  \
+  ST_SB(tmp_m, (pdst));                           \
+}
+
+/* Description : Horizontal 2 tap filter kernel code
+   Arguments   : Inputs - in0, in1, mask, coeff, shift
+   Return      : Rounded, saturated unsigned halfword filter output
+   Note        : 'shift' serves double duty - rounding shift amount for
+                 srari and saturation bit position for sat_u
+*/
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) ({    \
+  v16i8 tmp0_m;                                                \
+  v8u16 tmp1_m;                                                \
+                                                               \
+  tmp0_m = __msa_vshf_b((v16i8)mask, (v16i8)in1, (v16i8)in0);  \
+  tmp1_m = __msa_dotp_u_h((v16u8)tmp0_m, (v16u8)coeff);        \
+  tmp1_m = (v8u16)__msa_srari_h((v8i16)tmp1_m, shift);         \
+  tmp1_m = __msa_sat_u_h(tmp1_m, shift);                       \
+}
+  tmp1_m;                                                      \
+})
 #endif  /* VPX_DSP_MIPS_MACROS_MSA_H_ */
diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c
index 19ac47e..3bdec28 100644
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -11,6 +11,14 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/mips/macros_msa.h"
 
+/* Insert word element 0 of in0..in3 into word lanes 0..3 of 'out' */
+#define SAD_INSVE_W4(RTYPE, in0, in1, in2, in3, out) {    \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 0, (v4i32)in0);  \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 1, (v4i32)in1);  \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 2, (v4i32)in2);  \
+  out = (RTYPE)__msa_insve_w((v4i32)out, 3, (v4i32)in3);  \
+}
+#define SAD_INSVE_W4_UB(...) SAD_INSVE_W4(v16u8, __VA_ARGS__)
+
 static uint32_t sad_4width_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *ref_ptr, int32_t ref_stride,
                                int32_t height) {
@@ -150,6 +158,640 @@
   return sad;
 }
 
+/* SAD of a 4-wide block against the reference at x offsets 0, 1 and 2;
+   results land in sad_array[0..2]. Processes 4 rows per iteration. */
+static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v16u8 ref0, ref1, ref2, ref3, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    src_ptr += (4 * src_stride);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    /* slide each ref row left by one byte -> x offset 1 */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    /* slide again (cumulative) -> x offset 2 */
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+/* SAD of an 8-wide block against the reference at x offsets 0, 1 and 2;
+   two 8-byte rows are packed per vector, 4 rows handled per iteration. */
+static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *ref, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+    ref += (4 * ref_stride);
+    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
+                src0, src1, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    /* each SLDI pass shifts the ref rows one more byte to the left */
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+/* SAD of a 16-wide block against the reference at x offsets 0, 1 and 2.
+   Loads 32 ref bytes per row so sldi can produce the shifted windows;
+   loop is unrolled two rows per iteration. */
+static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src, ref, ref0, ref1, diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+/* SAD of a 32-wide block against the reference at x offsets 0, 1 and 2.
+   Loads 48 ref bytes per row for the sliding windows; unrolled x2. */
+static void sad_32width_x3_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, ref0_0, ref0_1, ref0_2, ref0, ref1;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+
+  for (ht_cnt = height >> 1; ht_cnt--;) {
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+}
+
+/* SAD of a 64-wide block against the reference at x offsets 0, 1 and 2.
+   Loads 80 ref bytes per row (64 + one extra vector for sliding); each
+   offset keeps two v8u16 accumulators which are widened to words at the
+   end so tall blocks cannot overflow the halfword lanes. */
+static void sad_64width_x3_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4, ref0, ref1, ref2, ref3;
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v4u32 sad;
+
+  for (ht_cnt = height; ht_cnt--;) {
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB4(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3);
+    ref0_4 = LD_UB(ref + 64);
+    ref += ref_stride;
+
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_SW_S32((v4i32)sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_SW_S32((v4i32)sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_SW_S32((v4i32)sad);
+}
+
+/* SAD of a 4-wide block against the reference at x offsets 0..7; each
+   cumulative one-byte slide of the ref rows advances the offset by one.
+   Results land in sad_array[0..7]. */
+static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                              const uint8_t *ref_ptr, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  uint32_t src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref2, ref3, diff;
+  v16u8 src = { 0 };
+  v16u8 ref = { 0 };
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LW4(src_ptr, src_stride, src0, src1, src2, src3);
+    INSERT_W4_UB(src0, src1, src2, src3, src);
+    src_ptr += (4 * src_stride);
+    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
+    ref_ptr += (4 * ref_stride);
+
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
+    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
+    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+/* SAD of an 8-wide block against the reference at x offsets 0..7; two
+   8-byte rows are packed per vector and each cumulative one-byte slide
+   of the ref rows advances the offset by one. */
+static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *ref, int32_t ref_stride,
+                              int32_t height, uint32_t *sad_array) {
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    LD_UB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
+    ref += (4 * ref_stride);
+    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22,
+                src0, src1, ref0, ref1);
+    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
+    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
+    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
+    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *ref_ptr, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {  /* SAD of a 16-wide block at 8 consecutive horizontal ref offsets */
+  int32_t ht_cnt;
+  v16u8 src, ref0, ref1, ref;
+  v16u8 diff;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = (height >> 1); ht_cnt--;) {  /* 2 rows per iteration (manually unrolled below) */
+    src = LD_UB(src_ptr);
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);  /* 32 contiguous ref bytes: enough for offsets up to +7 */
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);  /* offset 0 */
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);  /* ref bytes [k, k+16) for offset k */
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+
+    src = LD_UB(src_ptr);  /* second row of the pair: same 8 offsets */
+    src_ptr += src_stride;
+    LD_UB2(ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+
+    diff = __msa_asub_u_b(src, ref0);
+    sad0 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
+    diff = __msa_asub_u_b(src, ref);
+    sad1 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
+    diff = __msa_asub_u_b(src, ref);
+    sad2 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
+    diff = __msa_asub_u_b(src, ref);
+    sad3 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
+    diff = __msa_asub_u_b(src, ref);
+    sad4 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
+    diff = __msa_asub_u_b(src, ref);
+    sad5 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
+    diff = __msa_asub_u_b(src, ref);
+    sad6 += __msa_hadd_u_h(diff, diff);
+
+    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
+    diff = __msa_asub_u_b(src, ref);
+    sad7 += __msa_hadd_u_h(diff, diff);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);  /* horizontal reduction of per-lane u16 partial sums */
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_32width_x8_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {  /* SAD of a 32-wide block at 8 consecutive horizontal ref offsets */
+  int32_t ht_cnt;
+  v16u8 src0, src1;
+  v16u8 ref0, ref1, ref0_0, ref0_1, ref0_2;
+  v8u16 sad0 = { 0 };
+  v8u16 sad1 = { 0 };
+  v8u16 sad2 = { 0 };
+  v8u16 sad3 = { 0 };
+  v8u16 sad4 = { 0 };
+  v8u16 sad5 = { 0 };
+  v8u16 sad6 = { 0 };
+  v8u16 sad7 = { 0 };
+
+  for (ht_cnt = height; ht_cnt--;) {  /* one row per iteration */
+    LD_UB2(src, 16, src0, src1);
+    src += src_stride;
+    LD_UB3(ref, 16, ref0_0, ref0_1, ref0_2);  /* 48 ref bytes: covers 32-wide block at offsets up to +7 */
+    ref += ref_stride;
+
+    sad0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);  /* offset 0 */
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);  /* ref window shifted right by the offset */
+    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
+  }
+
+  sad_array[0] = HADD_UH_U32(sad0);  /* horizontal reduction of per-lane u16 partial sums */
+  sad_array[1] = HADD_UH_U32(sad1);
+  sad_array[2] = HADD_UH_U32(sad2);
+  sad_array[3] = HADD_UH_U32(sad3);
+  sad_array[4] = HADD_UH_U32(sad4);
+  sad_array[5] = HADD_UH_U32(sad5);
+  sad_array[6] = HADD_UH_U32(sad6);
+  sad_array[7] = HADD_UH_U32(sad7);
+}
+
+static void sad_64width_x8_msa(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height, uint32_t *sad_array) {  /* SAD of a 64-wide block at 8 consecutive horizontal ref offsets */
+  const uint8_t *src_dup, *ref_dup;
+  int32_t ht_cnt;
+  v16u8 src0, src1, src2, src3;
+  v16u8 ref0_0, ref0_1, ref0_2, ref0_3, ref0_4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 sad0_0 = { 0 };
+  v8u16 sad0_1 = { 0 };
+  v8u16 sad1_0 = { 0 };
+  v8u16 sad1_1 = { 0 };
+  v8u16 sad2_0 = { 0 };
+  v8u16 sad2_1 = { 0 };
+  v8u16 sad3_0 = { 0 };
+  v8u16 sad3_1 = { 0 };
+  v4u32 sad;
+
+  src_dup = src;  /* remember block start for the second pass */
+  ref_dup = ref;
+
+  for (ht_cnt = height; ht_cnt--;) {  /* pass 1: offsets 0..3 */
+    LD_UB4(src, 16, src0, src1, src2, src3);
+    src += src_stride;
+    LD_UB5(ref, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);  /* 80 ref bytes: covers 64-wide block at offsets up to +7 */
+    ref += ref_stride;
+
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0_0, ref0_1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref0_2, ref0_3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 1);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 1);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 2);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 2);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 3);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 3);
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);  /* widen u16 partial sums to u32 before reducing */
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[0] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[1] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[2] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[3] = HADD_SW_S32(sad);
+
+  sad0_0 = (v8u16)__msa_ldi_h(0);  /* reset accumulators for pass 2 (offsets 4..7) */
+  sad0_1 = (v8u16)__msa_ldi_h(0);
+  sad1_0 = (v8u16)__msa_ldi_h(0);
+  sad1_1 = (v8u16)__msa_ldi_h(0);
+  sad2_0 = (v8u16)__msa_ldi_h(0);
+  sad2_1 = (v8u16)__msa_ldi_h(0);
+  sad3_0 = (v8u16)__msa_ldi_h(0);
+  sad3_1 = (v8u16)__msa_ldi_h(0);
+
+  for (ht_cnt = height; ht_cnt--;) {  /* fix: was hardcoded 64 — over-read/wrong sads[4..7] for 64x32 */
+    LD_UB4(src_dup, 16, src0, src1, src2, src3);
+    src_dup += src_stride;
+    LD_UB5(ref_dup, 16, ref0_0, ref0_1, ref0_2, ref0_3, ref0_4);
+    ref_dup += ref_stride;
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 4);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 4);
+    sad0_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad0_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 5);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 5);
+    sad1_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad1_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 6);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 6);
+    sad2_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad2_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+
+    SLDI_B2_UB(ref0_1, ref0_2, ref0_0, ref0_1, ref0, ref1, 7);
+    SLDI_B2_UB(ref0_3, ref0_4, ref0_2, ref0_3, ref2, ref3, 7);
+    sad3_0 += SAD_UB2_UH(src0, src1, ref0, ref1);
+    sad3_1 += SAD_UB2_UH(src2, src3, ref2, ref3);
+  }
+
+  sad = __msa_hadd_u_w(sad0_0, sad0_0);
+  sad += __msa_hadd_u_w(sad0_1, sad0_1);
+  sad_array[4] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad1_0, sad1_0);
+  sad += __msa_hadd_u_w(sad1_1, sad1_1);
+  sad_array[5] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad2_0, sad2_0);
+  sad += __msa_hadd_u_w(sad2_1, sad2_1);
+  sad_array[6] = HADD_SW_S32(sad);
+
+  sad = __msa_hadd_u_w(sad3_0, sad3_0);
+  sad += __msa_hadd_u_w(sad3_1, sad3_1);
+  sad_array[7] = HADD_SW_S32(sad);
+}
+
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t * const aref_ptr[],
                                int32_t ref_stride,
@@ -646,6 +1288,76 @@
   return sad_64width_msa(src, src_stride, ref, ref_stride, height);          \
 }
 
+#define VPX_SAD_4xHEIGHTx3_MSA(height) /* emits vpx_sad4x<height>x3_msa */  \
+void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_8xHEIGHTx3_MSA(height) /* emits vpx_sad8x<height>x3_msa */  \
+void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_16xHEIGHTx3_MSA(height) /* emits vpx_sad16x<height>x3_msa */  \
+void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_32xHEIGHTx3_MSA(height) /* emits vpx_sad32x<height>x3_msa */  \
+void vpx_sad32x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_32width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_64xHEIGHTx3_MSA(height) /* emits vpx_sad64x<height>x3_msa */  \
+void vpx_sad64x##height##x3_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_64width_x3_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_4xHEIGHTx8_MSA(height) /* emits vpx_sad4x<height>x8_msa */  \
+void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_8xHEIGHTx8_MSA(height) /* emits vpx_sad8x<height>x8_msa */  \
+void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                               const uint8_t *ref, int32_t ref_stride,  \
+                               uint32_t *sads) {                        \
+  sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_16xHEIGHTx8_MSA(height) /* emits vpx_sad16x<height>x8_msa */  \
+void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_32xHEIGHTx8_MSA(height) /* emits vpx_sad32x<height>x8_msa */  \
+void vpx_sad32x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_32width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
+#define VPX_SAD_64xHEIGHTx8_MSA(height) /* emits vpx_sad64x<height>x8_msa */  \
+void vpx_sad64x##height##x8_msa(const uint8_t *src, int32_t src_stride,  \
+                                const uint8_t *ref, int32_t ref_stride,  \
+                                uint32_t *sads) {                        \
+  sad_64width_x8_msa(src, src_stride, ref, ref_stride, height, sads);    \
+}
+
 #define VPX_SAD_4xHEIGHTx4D_MSA(height)                                  \
 void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride,  \
                                 const uint8_t *const refs[],             \
@@ -723,65 +1435,91 @@
 
 // 64x64
 VPX_SAD_64xHEIGHT_MSA(64);
+VPX_SAD_64xHEIGHTx3_MSA(64);
+VPX_SAD_64xHEIGHTx8_MSA(64);
 VPX_SAD_64xHEIGHTx4D_MSA(64);
 VPX_AVGSAD_64xHEIGHT_MSA(64);
 
 // 64x32
 VPX_SAD_64xHEIGHT_MSA(32);
+VPX_SAD_64xHEIGHTx3_MSA(32);
+VPX_SAD_64xHEIGHTx8_MSA(32);
 VPX_SAD_64xHEIGHTx4D_MSA(32);
 VPX_AVGSAD_64xHEIGHT_MSA(32);
 
 // 32x64
 VPX_SAD_32xHEIGHT_MSA(64);
+VPX_SAD_32xHEIGHTx3_MSA(64);
+VPX_SAD_32xHEIGHTx8_MSA(64);
 VPX_SAD_32xHEIGHTx4D_MSA(64);
 VPX_AVGSAD_32xHEIGHT_MSA(64);
 
 // 32x32
 VPX_SAD_32xHEIGHT_MSA(32);
+VPX_SAD_32xHEIGHTx3_MSA(32);
+VPX_SAD_32xHEIGHTx8_MSA(32);
 VPX_SAD_32xHEIGHTx4D_MSA(32);
 VPX_AVGSAD_32xHEIGHT_MSA(32);
 
 // 32x16
 VPX_SAD_32xHEIGHT_MSA(16);
+VPX_SAD_32xHEIGHTx3_MSA(16);
+VPX_SAD_32xHEIGHTx8_MSA(16);
 VPX_SAD_32xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_32xHEIGHT_MSA(16);
 
 // 16x32
 VPX_SAD_16xHEIGHT_MSA(32);
+VPX_SAD_16xHEIGHTx3_MSA(32);
+VPX_SAD_16xHEIGHTx8_MSA(32);
 VPX_SAD_16xHEIGHTx4D_MSA(32);
 VPX_AVGSAD_16xHEIGHT_MSA(32);
 
 // 16x16
 VPX_SAD_16xHEIGHT_MSA(16);
+VPX_SAD_16xHEIGHTx3_MSA(16);
+VPX_SAD_16xHEIGHTx8_MSA(16);
 VPX_SAD_16xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_16xHEIGHT_MSA(16);
 
 // 16x8
 VPX_SAD_16xHEIGHT_MSA(8);
+VPX_SAD_16xHEIGHTx3_MSA(8);
+VPX_SAD_16xHEIGHTx8_MSA(8);
 VPX_SAD_16xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_16xHEIGHT_MSA(8);
 
 // 8x16
 VPX_SAD_8xHEIGHT_MSA(16);
+VPX_SAD_8xHEIGHTx3_MSA(16);
+VPX_SAD_8xHEIGHTx8_MSA(16);
 VPX_SAD_8xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_8xHEIGHT_MSA(16);
 
 // 8x8
 VPX_SAD_8xHEIGHT_MSA(8);
+VPX_SAD_8xHEIGHTx3_MSA(8);
+VPX_SAD_8xHEIGHTx8_MSA(8);
 VPX_SAD_8xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_8xHEIGHT_MSA(8);
 
 // 8x4
 VPX_SAD_8xHEIGHT_MSA(4);
+VPX_SAD_8xHEIGHTx3_MSA(4);
+VPX_SAD_8xHEIGHTx8_MSA(4);
 VPX_SAD_8xHEIGHTx4D_MSA(4);
 VPX_AVGSAD_8xHEIGHT_MSA(4);
 
 // 4x8
 VPX_SAD_4xHEIGHT_MSA(8);
+VPX_SAD_4xHEIGHTx3_MSA(8);
+VPX_SAD_4xHEIGHTx8_MSA(8);
 VPX_SAD_4xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_4xHEIGHT_MSA(8);
 
 // 4x4
 VPX_SAD_4xHEIGHT_MSA(4);
+VPX_SAD_4xHEIGHTx3_MSA(4);
+VPX_SAD_4xHEIGHTx8_MSA(4);
 VPX_SAD_4xHEIGHTx4D_MSA(4);
 VPX_AVGSAD_4xHEIGHT_MSA(4);
diff --git a/vpx_dsp/mips/sub_pixel_variance_msa.c b/vpx_dsp/mips/sub_pixel_variance_msa.c
new file mode 100644
index 0000000..5168192
--- /dev/null
+++ b/vpx_dsp/mips/sub_pixel_variance_msa.c
@@ -0,0 +1,767 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/mips/macros_msa.h"
+#include "vpx_dsp/variance.h"
+
+static const uint8_t bilinear_filters_msa[8][2] = {  /* 2-tap bilinear taps per 1/8-pel offset; each pair sums to 128 */
+  { 128,   0, },
+  { 112,  16, },
+  {  96,  32, },
+  {  80,  48, },
+  {  64,  64, },
+  {  48,  80, },
+  {  32,  96, },
+  {  16, 112, },
+};
+
+#define CALC_MSE_AVG_B(src, ref, var, sub) { /* var += sum((src-ref)^2); sub += sum(src-ref) */ \
+  v16u8 src_l0_m, src_l1_m;                                        \
+  v8i16 res_l0_m, res_l1_m;                                        \
+                                                                   \
+  ILVRL_B2_UB(src, ref, src_l0_m, src_l1_m); /* interleave src/ref bytes */ \
+  HSUB_UB2_SH(src_l0_m, src_l1_m, res_l0_m, res_l1_m); /* per-pixel signed diff */ \
+  DPADD_SH2_SW(res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var); /* dot(diff, diff) */ \
+                                                                   \
+  sub += res_l0_m + res_l1_m;                                      \
+}
+
+#define VARIANCE_WxH(sse, diff, shift) /* variance = sse - (sum_diff^2 >> log2(N)); args parenthesized for macro safety */ \
+  ((sse) - (((uint32_t)(diff) * (diff)) >> (shift)))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) /* 64-bit square so large-block diff sums cannot overflow */ \
+  ((sse) - (((int64_t)(diff) * (diff)) >> (shift)))
+
+static uint32_t sub_pixel_sse_diff_4width_h_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {  /* horizontal 2-tap bilinear filter, then SSE (returned) and diff-sum (*diff), 4-wide */
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 filt0, ref = { 0 };
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 const255;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);  /* both taps loaded as one halfword and splatted */
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  const255 = (v8u16)__msa_ldi_h(255);  /* 8-bit clamp bound */
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);  /* four 4-pel ref rows in one vector */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);  /* pair adjacent pixels for the 2-tap dot product */
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);  /* round-shift the filter output */
+    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    ILVEV_W2_SB(src0, src1, src2, src3, src0, src2);
+    src0 = (v16i8)__msa_ilvev_d((v2i64)src2, (v2i64)src0);  /* all 16 filtered pels in one vector */
+    CALC_MSE_AVG_B(src0, ref, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);  /* total signed difference (caller derives variance from it) */
+
+  return HADD_SW_S32(var);  /* sum of squared errors */
+}
+
+static uint32_t sub_pixel_sse_diff_8width_h_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {  /* horizontal 2-tap bilinear filter, then SSE (returned) and diff-sum (*diff), 8-wide */
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 filt0, out, ref0, ref1, ref2, ref3;
+  v16i8 src0, src1, src2, src3;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v8u16 vec0, vec1, vec2, vec3, const255;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);  /* both taps loaded as one halfword and splatted */
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  const255 = (v8u16)__msa_ldi_h(255);  /* 8-bit clamp bound */
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
+    LD_SB4(src, src_stride, src0, src1, src2, src3);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);  /* pack two 8-pel ref rows per vector */
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);  /* pair adjacent pixels for the 2-tap dot product */
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                vec0, vec1, vec2, vec3);
+    SRARI_H4_UH(vec0, vec1, vec2, vec3, FILTER_BITS);  /* round-shift the filter output */
+    MIN_UH4_UH(vec0, vec1, vec2, vec3, const255);
+    PCKEV_B4_SB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
+                src0, src1, src2, src3);
+    out = (v16u8)__msa_ilvev_d((v2i64)src1, (v2i64)src0);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = (v16u8)__msa_ilvev_d((v2i64)src3, (v2i64)src2);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);  /* total signed difference */
+
+  return HADD_SW_S32(var);  /* sum of squared errors */
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {  /* horizontal 2-tap bilinear filter, then SSE (returned) and diff-sum (*diff), 16-wide */
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
+  v16i8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
+  v16u8 dst0, dst1, dst2, dst3, filt0;
+  v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 out0, out1, out2, out3, out4, out5, out6, out7;
+  v8u16 const255;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);  /* both taps loaded as one halfword and splatted */
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  const255 = (v8u16)__msa_ldi_h(255);  /* 8-bit clamp bound */
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
+    LD_SB4(src, src_stride, src0, src2, src4, src6);
+    LD_SB4(src + 8, src_stride, src1, src3, src5, src7);  /* upper halves: filter needs pixel at column 16 */
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
+    dst += (4 * dst_stride);
+
+    VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);  /* pair adjacent pixels for the 2-tap dot product */
+    VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
+    VSHF_B2_UH(src4, src4, src5, src5, mask, mask, vec4, vec5);
+    VSHF_B2_UH(src6, src6, src7, src7, mask, mask, vec6, vec7);
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                out0, out1, out2, out3);
+    DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
+                out4, out5, out6, out7);
+    SRARI_H4_UH(out0, out1, out2, out3, FILTER_BITS);  /* round-shift the filter output */
+    SRARI_H4_UH(out4, out5, out6, out7, FILTER_BITS);
+    MIN_UH4_UH(out0, out1, out2, out3, const255);
+    MIN_UH4_UH(out4, out5, out6, out7, const255);
+    PCKEV_B4_SB(out1, out0, out3, out2, out5, out4, out7, out6,
+                src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, dst0, var, avg);
+    CALC_MSE_AVG_B(src1, dst1, var, avg);
+    CALC_MSE_AVG_B(src2, dst2, var, avg);
+    CALC_MSE_AVG_B(src3, dst3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);  /* total signed difference */
+
+  return HADD_SW_S32(var);  /* sum of squared errors */
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {  /* 32-wide: two independent 16-wide column passes */
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;  /* advance to next 16-pixel column */
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];  /* combine per-column diff sums */
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_h_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {  /* 64-wide: four independent 16-wide column passes */
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_h_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;  /* advance to next 16-pixel column */
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];  /* combine per-column diff sums */
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_4width_v_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {  /* vertical 2-tap bilinear filter, then SSE (returned) and diff-sum (*diff), 4-wide */
+  int16_t filtval;
+  uint32_t loop_cnt;
+  uint32_t ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4, out;
+  v16u8 src10_r, src32_r, src21_r, src43_r;
+  v16u8 ref = { 0 };
+  v16u8 src2110, src4332;
+  v16u8 filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+  v8u16 tmp0, tmp1;
+
+  filtval = LH(filter);  /* both taps loaded as one halfword and splatted */
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);  /* prime the row-above operand */
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);  /* four 4-pel ref rows in one vector */
+    ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
+               src10_r, src21_r, src32_r, src43_r);  /* interleave vertically adjacent rows for the 2-tap dot product */
+    ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
+    DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);  /* round-shift the filter output */
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    CALC_MSE_AVG_B(out, ref, var, avg);
+    src0 = src4;  /* carry last row into next iteration */
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);  /* total signed difference */
+
+  return HADD_SW_S32(var);  /* sum of squared errors */
+}
+
+static uint32_t sub_pixel_sse_diff_8width_v_msa(const uint8_t *src,
+                                                int32_t src_stride,
+                                                const uint8_t *dst,
+                                                int32_t dst_stride,
+                                                const uint8_t *filter,
+                                                int32_t height,
+                                                int32_t *diff) {  /* vertical 2-tap bilinear filter, then SSE (returned) and diff-sum (*diff), 8-wide */
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 ref0, ref1, ref2, ref3;
+  v8u16 vec0, vec1, vec2, vec3;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);  /* both taps loaded as one halfword and splatted */
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);  /* prime the row-above operand */
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);  /* pack two 8-pel ref rows per vector */
+    ILVR_B4_UH(src1, src0, src2, src1, src3, src2, src4, src3,
+               vec0, vec1, vec2, vec3);  /* interleave vertically adjacent rows for the 2-tap dot product */
+    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
+                tmp0, tmp1, tmp2, tmp3);
+    SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, FILTER_BITS);  /* round-shift the filter output */
+    SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
+    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    src0 = src4;  /* carry last row into next iteration */
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);  /* total signed difference */
+
+  return HADD_SW_S32(var);  /* sum of squared errors */
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {  /* vertical 2-tap bilinear filter, then SSE (returned) and diff-sum (*diff), 16-wide */
+  int16_t filtval;
+  uint32_t loop_cnt;
+  v16u8 ref0, ref1, ref2, ref3;
+  v16u8 src0, src1, src2, src3, src4;
+  v16u8 out0, out1, out2, out3;
+  v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  v8u16 tmp0, tmp1, tmp2, tmp3;
+  v16u8 filt0;
+  v8i16 avg = { 0 };
+  v4i32 vec, var = { 0 };
+
+  filtval = LH(filter);  /* both taps loaded as one halfword and splatted */
+  filt0 = (v16u8)__msa_fill_h(filtval);
+
+  src0 = LD_UB(src);  /* prime the row-above operand */
+  src += src_stride;
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {  /* 4 rows per iteration */
+    LD_UB4(src, src_stride, src1, src2, src3, src4);
+    src += (4 * src_stride);
+    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
+    dst += (4 * dst_stride);
+
+    ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);  /* interleave rows (n, n+1): right/left halves */
+    ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
+    DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);  /* round-shift the filter output */
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);  /* filtered row 0 */
+
+    ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
+    ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
+    DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);  /* filtered row 1 (from vec2/vec3 above) */
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    out1 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);  /* filtered row 2 */
+    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
+    SAT_UH2_UH(tmp0, tmp1, 7);
+    out2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
+    DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);  /* filtered row 3 */
+    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
+    SAT_UH2_UH(tmp2, tmp3, 7);
+    out3 = (v16u8)__msa_pckev_b((v16i8)tmp3, (v16i8)tmp2);
+
+    src0 = src4;  /* carry last row into next iteration */
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+
+  vec = __msa_hadd_s_w(avg, avg);
+  *diff = HADD_SW_S32(vec);  /* total signed difference */
+
+  return HADD_SW_S32(var);  /* sum of squared errors */
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_v_msa(const uint8_t *src,
+                                                 int32_t src_stride,
+                                                 const uint8_t *dst,
+                                                 int32_t dst_stride,
+                                                 const uint8_t *filter,
+                                                 int32_t height,
+                                                 int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_v_msa(src, src_stride, dst, dst_stride,
+                                            filter, height, &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
// SSE and signed pixel-difference sum between a 4-wide block, bilinearly
// interpolated in both directions (2-tap horizontal then 2-tap vertical),
// and the reference in dst. Four rows are produced per loop iteration;
// only (height >> 2) * 4 rows are processed, so height is expected to be a
// multiple of 4. Returns the SSE; *diff receives the difference sum.
static uint32_t sub_pixel_sse_diff_4width_hv_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter_horiz,
                                                 const uint8_t *filter_vert,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  uint32_t ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out, ref = { 0 };
  v16u8 filt_vt, filt_hz, vec0, vec1;
  // Shuffle pattern feeding HORIZ_2TAP_FILT_UH: pairs adjacent bytes of two
  // source registers (indices >= 16 select the second operand).
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  // Each filter points at a pair of bilinear taps; load the two bytes as one
  // halfword and broadcast it across every lane.
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  // Prime the pipeline: the vertical filter needs the row above the first
  // output row.
  src0 = LD_UB(src);
  src += src_stride;

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    // Four 4-byte reference rows are packed into one 16-byte vector.
    LW4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);
    INSERT_W4_UB(ref0, ref1, ref2, ref3, ref);
    hz_out0 = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src2, src3, mask, filt_hz, FILTER_BITS);
    hz_out4 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    // Rebuild the intermediate rows (1 and 3) from the even-row results so
    // each vertical tap pair lines up.
    hz_out1 = (v8u16)__msa_sldi_b((v16i8)hz_out2, (v16i8)hz_out0, 8);
    hz_out3 = (v8u16)__msa_pckod_d((v2i64)hz_out4, (v2i64)hz_out2);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    out = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);
    // CALC_MSE_AVG_B accumulates squared error into var and the signed
    // difference into avg (project macro -- see macros_msa.h).
    CALC_MSE_AVG_B(out, ref, var, avg);
    // Last input row of this group seeds the next group's vertical filter.
    src0 = src4;
  }

  // Horizontal reductions of the two accumulators.
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
+
// SSE and signed pixel-difference sum for an 8-wide block interpolated with
// 2-tap bilinear filters in both directions. Four rows per iteration;
// height is expected to be a multiple of 4. Returns the SSE; *diff receives
// the difference sum.
static uint32_t sub_pixel_sse_diff_8width_hv_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 const uint8_t *dst,
                                                 int32_t dst_stride,
                                                 const uint8_t *filter_horiz,
                                                 const uint8_t *filter_vert,
                                                 int32_t height,
                                                 int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 src0, src1, src2, src3, src4;
  v16u8 out0, out1;
  // Adjacent-pixel pairing pattern for the 2-tap horizontal filter.
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1;
  v8u16 tmp0, tmp1, tmp2, tmp3;
  v16u8 filt_vt, filt_hz, vec0;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  // Broadcast each 2-tap filter (loaded as one halfword) across all lanes.
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  // Prime: horizontally filter the row above the first output row.
  src0 = LD_UB(src);
  src += src_stride;
  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    LD_UB4(src, src_stride, src1, src2, src3, src4);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    // Pack two 8-byte reference rows per 16-byte vector.
    PCKEV_D2_UB(ref1, ref0, ref3, ref2, ref0, ref1);
    // hz_out0/hz_out1 ping-pong: each new horizontal result is combined with
    // the previous row's result by the vertical filter.
    hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp0 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp1 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    hz_out1 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out1, (v16i8)hz_out0);
    tmp2 = __msa_dotp_u_h(vec0, filt_vt);
    hz_out0 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    vec0 = (v16u8)__msa_ilvev_b((v16i8)hz_out0, (v16i8)hz_out1);
    tmp3 = __msa_dotp_u_h(vec0, filt_vt);
    SRARI_H2_UH(tmp2, tmp3, FILTER_BITS);
    SAT_UH2_UH(tmp2, tmp3, 7);
    PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
    // Accumulate squared error into var and signed difference into avg.
    CALC_MSE_AVG_B(out0, ref0, var, avg);
    CALC_MSE_AVG_B(out1, ref1, var, avg);
  }

  // Reduce the vector accumulators to scalars.
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
+
// SSE and signed pixel-difference sum for a 16-wide block interpolated with
// 2-tap bilinear filters horizontally and vertically. Each 16-pixel row is
// handled as two 8-pixel halves (src even/odd register pairs); four rows per
// iteration, so height is expected to be a multiple of 4. Returns the SSE;
// *diff receives the difference sum.
static uint32_t sub_pixel_sse_diff_16width_hv_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  const uint8_t *dst,
                                                  int32_t dst_stride,
                                                  const uint8_t *filter_horiz,
                                                  const uint8_t *filter_vert,
                                                  int32_t height,
                                                  int32_t *diff) {
  int16_t filtval;
  uint32_t loop_cnt;
  v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
  v16u8 ref0, ref1, ref2, ref3;
  v16u8 filt_hz, filt_vt, vec0, vec1;
  // Adjacent-pixel pairing pattern for the 2-tap horizontal filter.
  v16u8 mask = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
  v8u16 hz_out0, hz_out1, hz_out2, hz_out3;
  v8u16 tmp0, tmp1;
  v8i16 avg = { 0 };
  v4i32 vec, var = { 0 };

  // Broadcast each 2-tap filter (loaded as one halfword) across all lanes.
  filtval = LH(filter_horiz);
  filt_hz = (v16u8)__msa_fill_h(filtval);
  filtval = LH(filter_vert);
  filt_vt = (v16u8)__msa_fill_h(filtval);

  // Prime: horizontally filter both halves of the row above the first
  // output row.
  LD_UB2(src, 8, src0, src1);
  src += src_stride;

  hz_out0 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
  hz_out2 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);

  for (loop_cnt = (height >> 2); loop_cnt--;) {
    // Even registers hold the left 8 pixels, odd the right 8, of 4 rows.
    LD_UB4(src, src_stride, src0, src2, src4, src6);
    LD_UB4(src + 8, src_stride, src1, src3, src5, src7);
    src += (4 * src_stride);
    LD_UB4(dst, dst_stride, ref0, ref1, ref2, ref3);
    dst += (4 * dst_stride);

    // Row 0: vertical filter combines the primed (or previous) horizontal
    // outputs with this row's. src0..src3 are reused below to hold the
    // four finished 16-pixel output rows.
    hz_out1 = HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    src0 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    // Row 1 (hz_out0/hz_out2 and hz_out1/hz_out3 ping-pong between rows).
    hz_out0 = HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    src1 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    // Row 2.
    hz_out1 = HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS);
    hz_out3 = HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    src2 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    // Row 3; hz_out0/hz_out2 now carry this row's horizontal outputs into
    // the next loop iteration.
    hz_out0 = HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS);
    hz_out2 = HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS);
    ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
    SRARI_H2_UH(tmp0, tmp1, FILTER_BITS);
    SAT_UH2_UH(tmp0, tmp1, 7);
    src3 = (v16u8)__msa_pckev_b((v16i8)tmp1, (v16i8)tmp0);

    // Accumulate squared error into var and signed difference into avg.
    CALC_MSE_AVG_B(src0, ref0, var, avg);
    CALC_MSE_AVG_B(src1, ref1, var, avg);
    CALC_MSE_AVG_B(src2, ref2, var, avg);
    CALC_MSE_AVG_B(src3, ref3, var, avg);
  }

  // Reduce the vector accumulators to scalars.
  vec = __msa_hadd_s_w(avg, avg);
  *diff = HADD_SW_S32(vec);

  return HADD_SW_S32(var);
}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[2];
+
+  for (loop_cnt = 0; loop_cnt < 2; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+                                             filter_horiz, filter_vert, height,
+                                             &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_sse_diff_64width_hv_msa(const uint8_t *src,
+                                                  int32_t src_stride,
+                                                  const uint8_t *dst,
+                                                  int32_t dst_stride,
+                                                  const uint8_t *filter_horiz,
+                                                  const uint8_t *filter_vert,
+                                                  int32_t height,
+                                                  int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += sub_pixel_sse_diff_16width_hv_msa(src, src_stride, dst, dst_stride,
+                                             filter_horiz, filter_vert, height,
+                                             &diff0[loop_cnt]);
+    src += 16;
+    dst += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
// Per-block-size variance finalizers. The third argument is log2(W * H)
// for the given size (4x4 -> 4, ..., 64x64 -> 12); presumably it is used by
// VARIANCE_WxH / VARIANCE_LARGE_WxH (defined elsewhere) to divide sum^2 by
// the pixel count with a shift -- confirm against those macros.
#define VARIANCE_4Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 4);
#define VARIANCE_4Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx4H(sse, diff) VARIANCE_WxH(sse, diff, 5);
#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6);
#define VARIANCE_8Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 7);
#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8);

// Larger sizes use the _LARGE_ variant (wider intermediates for the bigger
// pixel counts -- see VARIANCE_LARGE_WxH).
#define VARIANCE_16Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx16H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 9);
#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10);
#define VARIANCE_32Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 11);
#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12);
+
// Emits vpx_sub_pixel_variance<wd>x<ht>_msa(). xoffset/yoffset (0..7) index
// bilinear_filters_msa; an offset of 0 means whole-pel in that direction,
// so the function dispatches to the h+v, v-only, h-only, or plain-variance
// kernel accordingly, then folds sse and the difference sum into the
// variance via the matching VARIANCE_* macro.
#define VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(wd, ht)                         \
uint32_t vpx_sub_pixel_variance##wd##x##ht##_msa(const uint8_t *src,     \
                                                 int32_t src_stride,     \
                                                 int32_t xoffset,        \
                                                 int32_t yoffset,        \
                                                 const uint8_t *ref,     \
                                                 int32_t ref_stride,     \
                                                 uint32_t *sse) {        \
  int32_t diff;                                                          \
  uint32_t var;                                                          \
  const uint8_t *h_filter = bilinear_filters_msa[xoffset];               \
  const uint8_t *v_filter = bilinear_filters_msa[yoffset];               \
                                                                         \
  if (yoffset) {                                                         \
    if (xoffset) {                                                       \
      *sse = sub_pixel_sse_diff_##wd##width_hv_msa(src, src_stride,      \
                                                   ref, ref_stride,      \
                                                   h_filter, v_filter,   \
                                                   ht, &diff);           \
    } else {                                                             \
      *sse = sub_pixel_sse_diff_##wd##width_v_msa(src, src_stride,       \
                                                  ref, ref_stride,       \
                                                  v_filter, ht, &diff);  \
    }                                                                    \
                                                                         \
    var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                          \
  } else {                                                               \
    if (xoffset) {                                                       \
      *sse = sub_pixel_sse_diff_##wd##width_h_msa(src, src_stride,       \
                                                  ref, ref_stride,       \
                                                  h_filter, ht, &diff);  \
                                                                         \
      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                        \
    } else {                                                             \
      var = vpx_variance##wd##x##ht##_msa(src, src_stride,               \
                                          ref, ref_stride, sse);         \
    }                                                                    \
  }                                                                      \
                                                                         \
  return var;                                                            \
}
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(4, 8);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 4);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(8, 16);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 8);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(16, 32);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 16);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(32, 64);
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 32);
+VPX_SUB_PIXEL_VARIANCE_WDXHT_MSA(64, 64);
diff --git a/vp9/encoder/mips/msa/vp9_subtract_msa.c b/vpx_dsp/mips/subtract_msa.c
similarity index 96%
rename from vp9/encoder/mips/msa/vp9_subtract_msa.c
rename to vpx_dsp/mips/subtract_msa.c
index 1b8b694..9ac43c5 100644
--- a/vp9/encoder/mips/msa/vp9_subtract_msa.c
+++ b/vpx_dsp/mips/subtract_msa.c
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#include "./vp9_rtcd.h"
-#include "vp9/common/mips/msa/vp9_macros_msa.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/mips/macros_msa.h"
 
 static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride,
                             const uint8_t *pred_ptr, int32_t pred_stride,
@@ -226,7 +226,7 @@
   }
 }
 
-void vp9_subtract_block_msa(int32_t rows, int32_t cols,
+void vpx_subtract_block_msa(int32_t rows, int32_t cols,
                             int16_t *diff_ptr, ptrdiff_t diff_stride,
                             const uint8_t *src_ptr, ptrdiff_t src_stride,
                             const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
@@ -253,12 +253,12 @@
                           diff_ptr, diff_stride);
         break;
       default:
-        vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
                              src_stride, pred_ptr, pred_stride);
         break;
     }
   } else {
-    vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
                          pred_ptr, pred_stride);
   }
 }
diff --git a/vpx_dsp/subtract.c b/vpx_dsp/subtract.c
new file mode 100644
index 0000000..556e013
--- /dev/null
+++ b/vpx_dsp/subtract.c
@@ -0,0 +1,56 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
// Computes the pixel-wise residual src - pred for a rows x cols block and
// writes it to diff as int16_t. All strides are in elements of their
// respective buffers.
void vpx_subtract_block_c(int rows, int cols,
                          int16_t *diff, ptrdiff_t diff_stride,
                          const uint8_t *src, ptrdiff_t src_stride,
                          const uint8_t *pred, ptrdiff_t pred_stride) {
  int row, col;

  for (row = 0; row < rows; ++row) {
    for (col = 0; col < cols; ++col) {
      diff[row * diff_stride + col] =
          src[row * src_stride + col] - pred[row * pred_stride + col];
    }
  }
}
+
#if CONFIG_VP9_HIGHBITDEPTH
// High-bit-depth residual: src8/pred8 are pointers packed by
// CONVERT_TO_SHORTPTR and actually address uint16_t samples. The bit depth
// does not change the arithmetic, so bd is ignored.
void vpx_highbd_subtract_block_c(int rows, int cols,
                                 int16_t *diff, ptrdiff_t diff_stride,
                                 const uint8_t *src8, ptrdiff_t src_stride,
                                 const uint8_t *pred8, ptrdiff_t pred_stride,
                                 int bd) {
  int row, col;
  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  (void) bd;

  for (row = 0; row < rows; ++row) {
    for (col = 0; col < cols; ++col) {
      diff[row * diff_stride + col] =
          src[row * src_stride + col] - pred[row * pred_stride + col];
    }
  }
}
#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index 084dd7b..e8bddb0 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -14,13 +14,26 @@
 #include "vpx_ports/mem.h"
 #include "vpx/vpx_integer.h"
 
-unsigned int vpx_get4x4sse_cs_c(const unsigned char *a, int  a_stride,
-                                const unsigned char *b, int  b_stride) {
+#include "vpx_dsp/variance.h"
+
// 2-tap bilinear filter coefficients for the 8 sub-pel offsets (index =
// x/8-pel offset). Each tap pair sums to 128, matching the FILTER_BITS
// rounding used by the filter passes below.
static const uint8_t bilinear_filters[8][2] = {
  { 128,   0  },
  { 112,  16  },
  {  96,  32  },
  {  80,  48  },
  {  64,  64  },
  {  48,  80  },
  {  32,  96  },
  {  16, 112  },
};
+
+uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int  a_stride,
+                            const uint8_t *b, int  b_stride) {
   int distortion = 0;
   int r, c;
 
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++) {
+  for (r = 0; r < 4; ++r) {
+    for (c = 0; c < 4; ++c) {
       int diff = a[c] - b[c];
       distortion += diff * diff;
     }
@@ -32,7 +45,7 @@
   return distortion;
 }
 
-unsigned int vpx_get_mb_ss_c(const int16_t *a) {
+uint32_t vpx_get_mb_ss_c(const int16_t *a) {
   unsigned int i, sum = 0;
 
   for (i = 0; i < 256; ++i) {
@@ -42,16 +55,38 @@
   return sum;
 }
 
// 16x16 variance at a half-pel horizontal offset: half-pel corresponds to
// sub-pel offset 4 (of 8) in the x direction.
uint32_t vpx_variance_halfpixvar16x16_h_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  const int half_pel = 4;
  return vpx_sub_pixel_variance16x16_c(a, a_stride, half_pel, 0,
                                       b, b_stride, sse);
}
+
+
// 16x16 variance at a half-pel vertical offset: half-pel corresponds to
// sub-pel offset 4 (of 8) in the y direction.
uint32_t vpx_variance_halfpixvar16x16_v_c(const uint8_t *a, int a_stride,
                                          const uint8_t *b, int b_stride,
                                          uint32_t *sse) {
  const int half_pel = 4;
  return vpx_sub_pixel_variance16x16_c(a, a_stride, 0, half_pel,
                                       b, b_stride, sse);
}
+
// 16x16 variance at a half-pel offset in both directions (sub-pel offset 4
// of 8 for x and y).
uint32_t vpx_variance_halfpixvar16x16_hv_c(const uint8_t *a, int a_stride,
                                           const uint8_t *b, int b_stride,
                                           uint32_t *sse) {
  const int half_pel = 4;
  return vpx_sub_pixel_variance16x16_c(a, a_stride, half_pel, half_pel,
                                       b, b_stride, sse);
}
+
 static void variance(const uint8_t *a, int  a_stride,
                      const uint8_t *b, int  b_stride,
-                     int  w, int  h, unsigned int *sse, int *sum) {
+                     int  w, int  h, uint32_t *sse, int *sum) {
   int i, j;
 
   *sum = 0;
   *sse = 0;
 
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
       const int diff = a[j] - b[j];
       *sum += diff;
       *sse += diff * diff;
@@ -62,15 +97,113 @@
   }
 }
 
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the first-pass of 2-D separable filter.
+//
+// Produces int16_t output to retain precision for the next pass. Two filter
+// taps should sum to FILTER_WEIGHT. pixel_step defines whether the filter is
+// applied horizontally (pixel_step = 1) or vertically (pixel_step = stride).
+// It defines the offset required to move from one input to the next.
// First pass of the separable 2-D bilinear filter. Applies a 1-D 2-tap
// filter to 8-bit input, producing 16-bit output to retain precision for
// the second pass. pixel_step selects the direction: 1 filters
// horizontally, the stride filters vertically; it is the offset from one
// filter input to the other. The two taps sum to the filter weight.
static void var_filter_block2d_bil_first_pass(const uint8_t *a, uint16_t *b,
                                              unsigned int src_pixels_per_line,
                                              int pixel_step,
                                              unsigned int output_height,
                                              unsigned int output_width,
                                              const uint8_t *filter) {
  unsigned int row, col;

  for (row = 0; row < output_height; ++row) {
    // Input rows advance by the source pitch; output rows are packed.
    const uint8_t *src_row = a + row * src_pixels_per_line;
    uint16_t *out_row = b + row * output_width;

    for (col = 0; col < output_width; ++col) {
      out_row[col] = ROUND_POWER_OF_TWO(
          (int)src_row[col] * filter[0] +
              (int)src_row[col + pixel_step] * filter[1],
          FILTER_BITS);
    }
  }
}
+
+// Applies a 1-D 2-tap bilinear filter to the source block in either horizontal
+// or vertical direction to produce the filtered output block. Used to implement
+// the second-pass of 2-D separable filter.
+//
+// Requires 16-bit input as produced by filter_block2d_bil_first_pass. Two
+// filter taps should sum to FILTER_WEIGHT. pixel_step defines whether the
+// filter is applied horizontally (pixel_step = 1) or vertically
+// (pixel_step = stride). It defines the offset required to move from one input
+// to the next. Output is 8-bit.
// Second pass of the separable 2-D bilinear filter. Consumes the 16-bit
// intermediate produced by the first pass and emits 8-bit output.
// pixel_step is the offset between the two filter inputs (1 = horizontal,
// stride = vertical); the two taps sum to the filter weight.
static void var_filter_block2d_bil_second_pass(const uint16_t *a, uint8_t *b,
                                               unsigned int src_pixels_per_line,
                                               unsigned int pixel_step,
                                               unsigned int output_height,
                                               unsigned int output_width,
                                               const uint8_t *filter) {
  unsigned int row, col;

  for (row = 0; row < output_height; ++row) {
    const uint16_t *src_row = a + row * src_pixels_per_line;
    uint8_t *out_row = b + row * output_width;

    for (col = 0; col < output_width; ++col) {
      out_row[col] = ROUND_POWER_OF_TWO(
          (int)src_row[col] * filter[0] +
              (int)src_row[col + pixel_step] * filter[1],
          FILTER_BITS);
    }
  }
}
+
 #define VAR(W, H) \
-unsigned int vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                       const uint8_t *b, int b_stride, \
-                                       unsigned int *sse) { \
+uint32_t vpx_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                   const uint8_t *b, int b_stride, \
+                                   uint32_t *sse) { \
   int sum; \
   variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
   return *sse - (((int64_t)sum * sum) / (W * H)); \
 }
 
// SUBPIX_VAR(W, H) emits vpx_sub_pixel_varianceWxH_c(): bilinearly filters
// the source at (xoffset, yoffset) eighth-pel offsets, then measures the
// variance against b. fdata3 holds the 16-bit first-pass intermediate and
// needs one extra row (H + 1) for the vertical taps.
#define SUBPIX_VAR(W, H) \
uint32_t vpx_sub_pixel_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
                                             int xoffset, int  yoffset, \
                                             const uint8_t *b, int b_stride, \
                                             uint32_t *sse) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint8_t temp2[H * W]; \
\
  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                    bilinear_filters[xoffset]); \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     bilinear_filters[yoffset]); \
\
  return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
}
+
// SUBPIX_AVG_VAR(W, H) emits vpx_sub_pixel_avg_varianceWxH_c(): like
// SUBPIX_VAR but the filtered block is first averaged with second_pred
// (compound prediction) before the variance is computed. temp3 is aligned
// for the benefit of optimized vpx_comp_avg_pred implementations.
#define SUBPIX_AVG_VAR(W, H) \
uint32_t vpx_sub_pixel_avg_variance##W##x##H##_c(const uint8_t *a, \
                                                 int  a_stride, \
                                                 int xoffset, int  yoffset, \
                                                 const uint8_t *b, \
                                                 int b_stride, \
                                                 uint32_t *sse, \
                                                 const uint8_t *second_pred) { \
  uint16_t fdata3[(H + 1) * W]; \
  uint8_t temp2[H * W]; \
  DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
\
  var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
                                    bilinear_filters[xoffset]); \
  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
                                     bilinear_filters[yoffset]); \
\
  vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
\
  return vpx_variance##W##x##H##_c(temp3, W, b, b_stride, sse); \
}
+
 /* Identical to the variance call except it takes an additional parameter, sum,
  * and returns that value using pass-by-reference instead of returning
  * sse - sum^2 / w*h
@@ -78,7 +211,7 @@
 #define GET_VAR(W, H) \
 void vpx_get##W##x##H##var_c(const uint8_t *a, int a_stride, \
                              const uint8_t *b, int b_stride, \
-                             unsigned int *sse, int *sum) { \
+                             uint32_t *sse, int *sum) { \
   variance(a, a_stride, b, b_stride, W, H, sse, sum); \
 }
 
@@ -87,27 +220,33 @@
  * variable.
  */
 #define MSE(W, H) \
-unsigned int vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
-                                  const uint8_t *b, int b_stride, \
-                                  unsigned int *sse) { \
+uint32_t vpx_mse##W##x##H##_c(const uint8_t *a, int a_stride, \
+                              const uint8_t *b, int b_stride, \
+                              uint32_t *sse) { \
   int sum; \
   variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
   return *sse; \
 }
 
-VAR(64, 64)
-VAR(64, 32)
-VAR(32, 64)
-VAR(32, 32)
-VAR(32, 16)
-VAR(16, 32)
-VAR(16, 16)
-VAR(16, 8)
-VAR(8, 16)
-VAR(8, 8)
-VAR(8, 4)
-VAR(4, 8)
-VAR(4, 4)
/* All three forms of the variance are available in the same sizes. */
// VARIANCES(W, H) instantiates vpx_varianceWxH_c,
// vpx_sub_pixel_varianceWxH_c and vpx_sub_pixel_avg_varianceWxH_c for one
// block size; the list below covers every size used by the codecs.
#define VARIANCES(W, H) \
    VAR(W, H) \
    SUBPIX_VAR(W, H) \
    SUBPIX_AVG_VAR(W, H)

VARIANCES(64, 64)
VARIANCES(64, 32)
VARIANCES(32, 64)
VARIANCES(32, 32)
VARIANCES(32, 16)
VARIANCES(16, 32)
VARIANCES(16, 16)
VARIANCES(16, 8)
VARIANCES(8, 16)
VARIANCES(8, 8)
VARIANCES(8, 4)
VARIANCES(4, 8)
VARIANCES(4, 4)
 
 GET_VAR(16, 16)
 GET_VAR(8, 8)
@@ -117,12 +256,13 @@
 MSE(8, 16)
 MSE(8, 8)
 
-void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred, int width,
-                         int height, const uint8_t *ref, int ref_stride) {
+void vpx_comp_avg_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                         int width, int height,
+                         const uint8_t *ref, int ref_stride) {
   int i, j;
 
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
       const int tmp = pred[j] + ref[j];
       comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
     }
@@ -143,8 +283,8 @@
   *sum = 0;
   *sse = 0;
 
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
       const int diff = a[j] - b[j];
       *sum += diff;
       *sse += diff * diff;
@@ -156,60 +296,60 @@
 
 static void highbd_8_variance(const uint8_t *a8, int  a_stride,
                               const uint8_t *b8, int  b_stride,
-                              int w, int h, unsigned int *sse, int *sum) {
+                              int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
   uint64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
+  *sse = (uint32_t)sse_long;
   *sum = (int)sum_long;
 }
 
 static void highbd_10_variance(const uint8_t *a8, int  a_stride,
                                const uint8_t *b8, int  b_stride,
-                               int w, int h, unsigned int *sse, int *sum) {
+                               int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
   uint64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 4);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 4);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 2);
 }
 
 static void highbd_12_variance(const uint8_t *a8, int  a_stride,
                                const uint8_t *b8, int  b_stride,
-                               int w, int h, unsigned int *sse, int *sum) {
+                               int w, int h, uint32_t *sse, int *sum) {
   uint64_t sse_long = 0;
   uint64_t sum_long = 0;
   highbd_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
-  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse_long, 8);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse_long, 8);
   *sum = (int)ROUND_POWER_OF_TWO(sum_long, 4);
 }
 
 #define HIGHBD_VAR(W, H) \
-unsigned int vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
-                                                int a_stride, \
-                                                const uint8_t *b, \
-                                                int b_stride, \
-                                                unsigned int *sse) { \
+uint32_t vpx_highbd_8_variance##W##x##H##_c(const uint8_t *a, \
+                                            int a_stride, \
+                                            const uint8_t *b, \
+                                            int b_stride, \
+                                            uint32_t *sse) { \
   int sum; \
   highbd_8_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
   return *sse - (((int64_t)sum * sum) / (W * H)); \
 } \
 \
-unsigned int vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
-                                                 int a_stride, \
-                                                 const uint8_t *b, \
-                                                 int b_stride, \
-                                                 unsigned int *sse) { \
+uint32_t vpx_highbd_10_variance##W##x##H##_c(const uint8_t *a, \
+                                             int a_stride, \
+                                             const uint8_t *b, \
+                                             int b_stride, \
+                                             uint32_t *sse) { \
   int sum; \
   highbd_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
   return *sse - (((int64_t)sum * sum) / (W * H)); \
 } \
 \
-unsigned int vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
-                                                 int a_stride, \
-                                                 const uint8_t *b, \
-                                                 int b_stride, \
-                                                 unsigned int *sse) { \
+uint32_t vpx_highbd_12_variance##W##x##H##_c(const uint8_t *a, \
+                                             int a_stride, \
+                                             const uint8_t *b, \
+                                             int b_stride, \
+                                             uint32_t *sse) { \
   int sum; \
   highbd_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
   return *sse - (((int64_t)sum * sum) / (W * H)); \
@@ -217,54 +357,243 @@
 
 #define HIGHBD_GET_VAR(S) \
 void vpx_highbd_8_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
-                                    const uint8_t *ref, int ref_stride, \
-                                    unsigned int *sse, int *sum) { \
+                                      const uint8_t *ref, int ref_stride, \
+                                      uint32_t *sse, int *sum) { \
   highbd_8_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
 } \
 \
 void vpx_highbd_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse, int *sum) { \
+                                       uint32_t *sse, int *sum) { \
   highbd_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
 } \
 \
 void vpx_highbd_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
                                        const uint8_t *ref, int ref_stride, \
-                                       unsigned int *sse, int *sum) { \
+                                       uint32_t *sse, int *sum) { \
   highbd_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
 }
 
 #define HIGHBD_MSE(W, H) \
-unsigned int vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
-                                         int src_stride, \
-                                         const uint8_t *ref, \
-                                         int ref_stride, \
-                                         unsigned int *sse) { \
+uint32_t vpx_highbd_8_mse##W##x##H##_c(const uint8_t *src, \
+                                       int src_stride, \
+                                       const uint8_t *ref, \
+                                       int ref_stride, \
+                                       uint32_t *sse) { \
   int sum; \
   highbd_8_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
   return *sse; \
 } \
 \
-unsigned int vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
-                                            int src_stride, \
-                                            const uint8_t *ref, \
-                                            int ref_stride, \
-                                            unsigned int *sse) { \
+uint32_t vpx_highbd_10_mse##W##x##H##_c(const uint8_t *src, \
+                                        int src_stride, \
+                                        const uint8_t *ref, \
+                                        int ref_stride, \
+                                        uint32_t *sse) { \
   int sum; \
   highbd_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
   return *sse; \
 } \
 \
-unsigned int vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
-                                            int src_stride, \
-                                            const uint8_t *ref, \
-                                            int ref_stride, \
-                                            unsigned int *sse) { \
+uint32_t vpx_highbd_12_mse##W##x##H##_c(const uint8_t *src, \
+                                        int src_stride, \
+                                        const uint8_t *ref, \
+                                        int ref_stride, \
+                                        uint32_t *sse) { \
   int sum; \
   highbd_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
   return *sse; \
 }
 
+static void highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int i, j;
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+                             (int)src_ptr[pixel_step] * filter[1],
+                             FILTER_BITS);
+
+      ++src_ptr;
+    }
+
+    // Next row...
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+static void highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; ++i) {
+    for (j = 0; j < output_width; ++j) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * filter[0] +
+                             (int)src_ptr[pixel_step] * filter[1],
+                             FILTER_BITS);
+      ++src_ptr;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+#define HIGHBD_SUBPIX_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                          dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                             W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                             W, dst, dst_stride, sse); \
+}
+
+#define HIGHBD_SUBPIX_AVG_VAR(W, H) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                          dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+                                             W, dst, dst_stride, sse); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  uint32_t *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
+\
+  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                           W, bilinear_filters[xoffset]); \
+  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                            bilinear_filters[yoffset]); \
+\
+  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
+                           CONVERT_TO_BYTEPTR(temp2), W); \
+\
+  return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
+                                             W, dst, dst_stride, sse); \
+}
+
+/* All three forms of the variance are available in the same sizes. */
+#define HIGHBD_VARIANCES(W, H) \
+    HIGHBD_VAR(W, H) \
+    HIGHBD_SUBPIX_VAR(W, H) \
+    HIGHBD_SUBPIX_AVG_VAR(W, H)
+
+HIGHBD_VARIANCES(64, 64)
+HIGHBD_VARIANCES(64, 32)
+HIGHBD_VARIANCES(32, 64)
+HIGHBD_VARIANCES(32, 32)
+HIGHBD_VARIANCES(32, 16)
+HIGHBD_VARIANCES(16, 32)
+HIGHBD_VARIANCES(16, 16)
+HIGHBD_VARIANCES(16, 8)
+HIGHBD_VARIANCES(8, 16)
+HIGHBD_VARIANCES(8, 8)
+HIGHBD_VARIANCES(8, 4)
+HIGHBD_VARIANCES(4, 8)
+HIGHBD_VARIANCES(4, 4)
+
 HIGHBD_GET_VAR(8)
 HIGHBD_GET_VAR(16)
 
@@ -273,28 +602,14 @@
 HIGHBD_MSE(8, 16)
 HIGHBD_MSE(8, 8)
 
-HIGHBD_VAR(64, 64)
-HIGHBD_VAR(64, 32)
-HIGHBD_VAR(32, 64)
-HIGHBD_VAR(32, 32)
-HIGHBD_VAR(32, 16)
-HIGHBD_VAR(16, 32)
-HIGHBD_VAR(16, 16)
-HIGHBD_VAR(16, 8)
-HIGHBD_VAR(8, 16)
-HIGHBD_VAR(8, 8)
-HIGHBD_VAR(8, 4)
-HIGHBD_VAR(4, 8)
-HIGHBD_VAR(4, 4)
-
 void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
                               int width, int height, const uint8_t *ref8,
                               int ref_stride) {
   int i, j;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-  for (i = 0; i < height; i++) {
-    for (j = 0; j < width; j++) {
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
       const int tmp = pred[j] + ref[j];
       comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
     }
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
new file mode 100644
index 0000000..c18d9b4
--- /dev/null
+++ b/vpx_dsp/variance.h
@@ -0,0 +1,94 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_VARIANCE_H_
+#define VPX_DSP_VARIANCE_H_
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define FILTER_BITS 7
+#define FILTER_WEIGHT 128
+
+typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
+                                    const uint8_t *b_ptr, int b_stride);
+
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
+                                        const uint8_t *b_ptr, int b_stride,
+                                        const uint8_t *second_pred);
+
+typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
+                                  uint8_t *b, int b_stride, int n);
+
+typedef void (*vpx_sad_multi_fn_t)(const uint8_t *a, int a_stride,
+                                   const uint8_t *b, int b_stride,
+                                   unsigned int *sad_array);
+
+typedef void (*vpx_sad_multi_d_fn_t)(const uint8_t *a, int a_stride,
+                                     const uint8_t *const b_array[],
+                                     int b_stride,
+                                     unsigned int *sad_array);
+
+typedef unsigned int (*vpx_variance_fn_t)(const uint8_t *a, int a_stride,
+                                          const uint8_t *b, int b_stride,
+                                          unsigned int *sse);
+
+typedef unsigned int (*vpx_subpixvariance_fn_t)(const uint8_t *a, int a_stride,
+                                                int xoffset, int yoffset,
+                                                const uint8_t *b, int b_stride,
+                                                unsigned int *sse);
+
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+                                                   int a_stride,
+                                                   int xoffset, int yoffset,
+                                                   const uint8_t *b_ptr,
+                                                   int b_stride,
+                                                   unsigned int *sse,
+                                                   const uint8_t *second_pred);
+#if CONFIG_VP8
+typedef struct variance_vtable {
+  vpx_sad_fn_t            sdf;
+  vpx_variance_fn_t       vf;
+  vpx_subpixvariance_fn_t svf;
+  vpx_variance_fn_t       svf_halfpix_h;
+  vpx_variance_fn_t       svf_halfpix_v;
+  vpx_variance_fn_t       svf_halfpix_hv;
+  vpx_sad_multi_fn_t      sdx3f;
+  vpx_sad_multi_fn_t      sdx8f;
+  vpx_sad_multi_d_fn_t    sdx4df;
+#if ARCH_X86 || ARCH_X86_64
+  vp8_copy32xn_fn_t       copymem;
+#endif
+} vp8_variance_fn_ptr_t;
+#endif  // CONFIG_VP8
+
+#if CONFIG_VP9
+typedef struct vp9_variance_vtable {
+  vpx_sad_fn_t               sdf;
+  vpx_sad_avg_fn_t           sdaf;
+  vpx_variance_fn_t          vf;
+  vpx_subpixvariance_fn_t    svf;
+  vpx_subp_avg_variance_fn_t svaf;
+  vpx_sad_multi_fn_t         sdx3f;
+  vpx_sad_multi_fn_t         sdx8f;
+  vpx_sad_multi_d_fn_t       sdx4df;
+} vp9_variance_fn_ptr_t;
+#endif  // CONFIG_VP9
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_DSP_VARIANCE_H_
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index d8ab108..70a131c 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -10,15 +10,19 @@
 
 DSP_SRCS-yes += vpx_dsp.mk
 
+DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
+DSP_SRCS-yes            += subtract.c
 
 DSP_SRCS-$(HAVE_MEDIA)  += arm/sad_media$(ASM)
 DSP_SRCS-$(HAVE_NEON)   += arm/sad4d_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/sad_neon.c
+DSP_SRCS-$(HAVE_NEON)   += arm/subtract_neon.c
 
-DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
 DSP_SRCS-$(HAVE_MMX)    += x86/sad_mmx.asm
 DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
@@ -30,6 +34,7 @@
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2)   += x86/sad_sse2.asm
+DSP_SRCS-$(HAVE_SSE2)   += x86/subtract_sse2.asm
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
@@ -41,21 +46,36 @@
 
 ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
 DSP_SRCS-yes            += variance.c
+DSP_SRCS-yes            += variance.h
 
+DSP_SRCS-$(HAVE_MEDIA)  += arm/bilinear_filter_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/subpel_variance_media.c
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_h_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_hv_media$(ASM)
+DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_halfpixvar16x16_v_media$(ASM)
 DSP_SRCS-$(HAVE_MEDIA)  += arm/variance_media$(ASM)
+DSP_SRCS-$(HAVE_NEON)   += arm/subpel_variance_neon.c
 DSP_SRCS-$(HAVE_NEON)   += arm/variance_neon.c
 
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
+DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
 DSP_SRCS-$(HAVE_MMX)    += x86/variance_mmx.c
 DSP_SRCS-$(HAVE_MMX)    += x86/variance_impl_mmx.asm
-DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c
+DSP_SRCS-$(HAVE_SSE2)   += x86/variance_sse2.c  # Contains SSE2 and SSSE3
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/variance_impl_avx2.c
 
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/subpel_variance_sse2.asm  # Contains SSE2 and SSSE3
+endif  # CONFIG_USE_X86INC
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
+ifeq ($(CONFIG_USE_X86INC),yes)
+DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
+endif  # CONFIG_USE_X86INC
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index f12270c..8e4e966 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -37,6 +37,12 @@
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
+# Block subtraction
+#
+add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
+specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
+
+#
 # Single block SAD
 #
 add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -125,47 +131,53 @@
 #
 # Blocks of 3
 add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x3 msa/;
 
 add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x3 msa/;
 
 add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x3 sse3 ssse3/;
+specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
 
 add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x3 sse3 ssse3/;
+specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
 
 add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x3 sse3/;
+specialize qw/vpx_sad8x16x3 sse3 msa/;
 
 add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3/;
+specialize qw/vpx_sad8x8x3 sse3 msa/;
 
 add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3/;
+specialize qw/vpx_sad4x4x3 sse3 msa/;
 
 # Blocks of 8
 add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad64x64x8 msa/;
 
 add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad32x32x8 msa/;
 
 add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x8 sse4_1/;
+specialize qw/vpx_sad16x16x8 sse4_1 msa/;
 
 add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x8 sse4_1/;
+specialize qw/vpx_sad16x8x8 sse4_1 msa/;
 
 add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x8 sse4_1/;
+specialize qw/vpx_sad8x16x8 sse4_1 msa/;
 
 add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1/;
+specialize qw/vpx_sad8x8x8 sse4_1 msa/;
 
 add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad8x4x8 msa/;
 
 add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+specialize qw/vpx_sad4x8x8 msa/;
 
 add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1/;
+specialize qw/vpx_sad4x4x8 sse4_1 msa/;
 
 #
 # Multi-block SAD, comparing a reference to N independent blocks
@@ -211,6 +223,12 @@
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
+  # Block subtraction
+  #
+  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vpx_highbd_subtract_block/;
+
+  #
   # Single block SAD
   #
   add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
@@ -394,6 +412,9 @@
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
 
+#
+# Variance
+#
 add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance64x64 sse2 avx2 neon msa/;
 
@@ -433,7 +454,9 @@
 add_proto qw/unsigned int vpx_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance4x4 mmx sse2 msa/;
 
-
+#
+# Specialty Variance
+#
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
   specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
 
@@ -460,6 +483,99 @@
 
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 
+#
+# Subpixel Variance
+#
+add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x16 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance16x8 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x16 mmx msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x8 mmx media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_sub_pixel_variance4x4 mmx msa/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x64 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance64x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x64/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x32 avx2/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance32x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x32/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance16x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x16/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x8/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance8x4/, "$sse2_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x8/, "$sse_x86inc", "$ssse3_x86inc";
+
+add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
+
+#
+# Specialty Subpixel
+#
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_h mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_v mmx media/;
+
+add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
+  specialize qw/vpx_variance_halfpixvar16x16_hv mmx media/;
+
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance64x64 sse2/;
@@ -597,6 +713,226 @@
   specialize qw/vpx_highbd_12_mse8x8 sse2/;
 
   add_proto qw/void vpx_highbd_comp_avg_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+
+  #
+  # Subpixel Variance
+  #
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_12_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_10_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  specialize qw/vpx_highbd_8_sub_pixel_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_12_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_10_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance64x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x64/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance32x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x32/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance16x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x16/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x8/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  specialize qw/vpx_highbd_8_sub_pixel_avg_variance8x4/, "$sse2_x86inc";
+
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+  add_proto qw/uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+
 }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC
 
diff --git a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
similarity index 98%
rename from vp9/encoder/x86/vp9_highbd_subpel_variance.asm
rename to vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
index 4594bb1..2be6446 100644
--- a/vp9/encoder/x86/vp9_highbd_subpel_variance.asm
+++ b/vpx_dsp/x86/highbd_subpel_variance_impl_sse2.asm
@@ -8,6 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+%define program_name vpx
+
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
@@ -30,7 +32,7 @@
 
 SECTION .text
 
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
 ;                               int x_offset, int y_offset,
 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
 ;                               int height, unsigned int *sse);
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 343c047..fe35c1e 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -8,9 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 #include "./vpx_config.h"
-#include "vp9/common/vp9_common.h"
 
-#include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
 typedef uint32_t (*high_variance_fn_t) (const uint16_t *src, int src_stride,
@@ -243,3 +241,341 @@
                           sse, &sum, vpx_highbd_calc8x8var_sse2, 8);
   return *sse;
 }
+
+#if CONFIG_USE_X86INC
+#define DECL(w, opt) \
+  int vpx_highbd_sub_pixel_variance##w##xh_##opt(const uint16_t *src, \
+                                                 ptrdiff_t src_stride, \
+                                                 int x_offset, int y_offset, \
+                                                 const uint16_t *dst, \
+                                                 ptrdiff_t dst_stride, \
+                                                 int height, unsigned int *sse);
+#define DECLS(opt1, opt2) \
+  DECL(8, opt1); \
+  DECL(16, opt1)
+
+DECLS(sse2, sse);
+// TODO(johannkoenig): enable the ssse3 or delete
+// DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src8, \
+                                                          int src_stride, \
+                                                          int x_offset, \
+                                                          int y_offset, \
+                                                          const uint8_t *dst8, \
+                                                          int dst_stride, \
+                                                          uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, h, \
+                                                       &sse); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 48, src_stride, x_offset, y_offset, \
+          dst + 48, dst_stride, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  int se = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst, dst_stride, \
+                                                       h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 16, \
+                                                          src_stride, \
+                                                          x_offset, y_offset, \
+                                                          dst + 16, \
+                                                          dst_stride, \
+                                                          h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 32, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                        x_offset, y_offset, \
+                                                        dst + 48, dst_stride, \
+                                                        h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  for (start_row = 0; start_row < h; start_row +=16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+        src + (start_row * src_stride), src_stride, \
+        x_offset, y_offset, dst + (start_row * dst_stride), \
+        dst_stride, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+          src + 16 + (start_row * src_stride), src_stride, \
+          x_offset, y_offset, dst + 16 + (start_row * dst_stride), \
+          dst_stride, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 32 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 32 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vpx_highbd_sub_pixel_variance##wf##xh_##opt( \
+            src + 48 + (start_row * src_stride), src_stride, \
+            x_offset, y_offset, dst + 48 + (start_row * dst_stride), \
+            dst_stride, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      }\
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+
+FNS(sse2, sse);
+
+#undef FNS
+#undef FN
+
+#define DECL(w, opt) \
+int vpx_highbd_sub_pixel_avg_variance##w##xh_##opt(const uint16_t *src, \
+                                                   ptrdiff_t src_stride, \
+                                                   int x_offset, int y_offset, \
+                                                   const uint16_t *dst, \
+                                                   ptrdiff_t dst_stride, \
+                                                   const uint16_t *sec, \
+                                                   ptrdiff_t sec_stride, \
+                                                   int height, \
+                                                   unsigned int *sse);
+#define DECLS(opt1) \
+DECL(16, opt1) \
+DECL(8, opt1)
+
+DECLS(sse2);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+uint32_t vpx_highbd_8_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+               src, src_stride, x_offset, \
+               y_offset, dst, dst_stride, sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                  src + 16, src_stride, x_offset, y_offset, \
+                  dst + 16, dst_stride, sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32, src_stride, x_offset, y_offset, \
+                dst + 32, dst_stride, sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48, src_stride, x_offset, y_offset, \
+                dst + 48, dst_stride, sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_10_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  uint32_t sse; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  int se = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src, src_stride, x_offset, \
+                                            y_offset, dst, dst_stride, \
+                                            sec, w, h, &sse); \
+  if (w > wf) { \
+    uint32_t sse2; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 16, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 16, dst_stride, \
+                                            sec + 16, w, h, &sse2); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 32, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 32, dst_stride, \
+                                            sec + 32, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                                            src + 48, src_stride, \
+                                            x_offset, y_offset, \
+                                            dst + 48, dst_stride, \
+                                            sec + 48, w, h, &sse2); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 2); \
+  sse = ROUND_POWER_OF_TWO(sse, 4); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+} \
+\
+uint32_t vpx_highbd_12_sub_pixel_avg_variance##w##x##h##_##opt( \
+    const uint8_t *src8, int src_stride, int x_offset, int y_offset, \
+    const uint8_t *dst8, int dst_stride, uint32_t *sse_ptr, \
+    const uint8_t *sec8) { \
+  int start_row; \
+  uint32_t sse; \
+  int se = 0; \
+  uint64_t long_sse = 0; \
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+  uint16_t *sec = CONVERT_TO_SHORTPTR(sec8); \
+  for (start_row = 0; start_row < h; start_row +=16) { \
+    uint32_t sse2; \
+    int height = h - start_row < 16 ? h - start_row : 16; \
+    int se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + (start_row * src_stride), src_stride, x_offset, \
+                y_offset, dst + (start_row * dst_stride), dst_stride, \
+                sec + (start_row * w), w, height, &sse2); \
+    se += se2; \
+    long_sse += sse2; \
+    if (w > wf) { \
+      se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 16 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 16 + (start_row * dst_stride), dst_stride, \
+                sec + 16 + (start_row * w), w, height, &sse2); \
+      se += se2; \
+      long_sse += sse2; \
+      if (w > wf * 2) { \
+        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 32 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 32 + (start_row * dst_stride), dst_stride, \
+                sec + 32 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+        se2 = vpx_highbd_sub_pixel_avg_variance##wf##xh_##opt( \
+                src + 48 + (start_row * src_stride), src_stride, \
+                x_offset, y_offset, \
+                dst + 48 + (start_row * dst_stride), dst_stride, \
+                sec + 48 + (start_row * w), w, height, &sse2); \
+        se += se2; \
+        long_sse += sse2; \
+      } \
+    } \
+  } \
+  se = ROUND_POWER_OF_TWO(se, 4); \
+  sse = ROUND_POWER_OF_TWO(long_sse, 8); \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+
+#define FNS(opt1) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
+FN(16, 8, 16, 4, 3, opt1, (int64_t)); \
+FN(8, 16, 8, 3, 4, opt1, (int64_t)); \
+FN(8, 8, 8, 3, 3, opt1, (int64_t)); \
+FN(8, 4, 8, 3, 2, opt1, (int64_t));
+
+FNS(sse2);
+
+#undef FNS
+#undef FN
+#endif  // CONFIG_USE_X86INC
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vpx_dsp/x86/subpel_variance_sse2.asm
similarity index 98%
rename from vp9/encoder/x86/vp9_subpel_variance.asm
rename to vpx_dsp/x86/subpel_variance_sse2.asm
index 292cf34..294f54f 100644
--- a/vp9/encoder/x86/vp9_subpel_variance.asm
+++ b/vpx_dsp/x86/subpel_variance_sse2.asm
@@ -8,6 +8,8 @@
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
 
+%define program_name vpx
+
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION_RODATA
@@ -39,7 +41,7 @@
 
 SECTION .text
 
-; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int vpx_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
 ;                               int x_offset, int y_offset,
 ;                               const uint8_t *dst, ptrdiff_t dst_stride,
 ;                               int height, unsigned int *sse);
diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm
similarity index 97%
rename from vp9/encoder/x86/vp9_subtract_sse2.asm
rename to vpx_dsp/x86/subtract_sse2.asm
index 9824080..a186451 100644
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -7,12 +7,13 @@
 ;  in the file PATENTS.  All contributing project authors may
 ;  be found in the AUTHORS file in the root of the source tree.
 ;
+%define program_name vpx
 
 %include "third_party/x86inc/x86inc.asm"
 
 SECTION .text
 
-; void vp9_subtract_block(int rows, int cols,
+; void vpx_subtract_block(int rows, int cols,
 ;                         int16_t *diff, ptrdiff_t diff_stride,
 ;                         const uint8_t *src, ptrdiff_t src_stride,
 ;                         const uint8_t *pred, ptrdiff_t pred_stride)
diff --git a/vpx_dsp/x86/variance_avx2.c b/vpx_dsp/x86/variance_avx2.c
index 82cef4a..7851a98 100644
--- a/vpx_dsp/x86/variance_avx2.c
+++ b/vpx_dsp/x86/variance_avx2.c
@@ -91,3 +91,93 @@
                 sse, &sum, vpx_get32x32var_avx2, 32);
   return *sse - (((int64_t)sum * sum) >> 11);
 }
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sseptr);
+
+unsigned int vpx_sub_pixel_variance64x64_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  unsigned int sse1;
+  const int se1 = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                  y_offset, dst, dst_stride,
+                                                  64, &sse1);
+  unsigned int sse2;
+  const int se2 = vpx_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_variance32x32_avx2(const uint8_t *src,
+                                              int src_stride,
+                                              int x_offset,
+                                              int y_offset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  const int se = vpx_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
+}
+
+unsigned int vpx_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  unsigned int sse1;
+  const int se1 = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                      y_offset, dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
+  unsigned int sse2;
+  const int se2 =
+    vpx_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                        y_offset, dst + 32, dst_stride,
+                                        sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
+
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
+}
+
+unsigned int vpx_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
+                                                  int src_stride,
+                                                  int x_offset,
+                                                  int y_offset,
+                                                  const uint8_t *dst,
+                                                  int dst_stride,
+                                                  unsigned int *sse,
+                                                  const uint8_t *sec) {
+  // Process 32 elements in parallel.
+  const int se = vpx_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                     y_offset, dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
+}
diff --git a/vpx_dsp/x86/variance_impl_avx2.c b/vpx_dsp/x86/variance_impl_avx2.c
index 0e40959..b289e9a 100644
--- a/vpx_dsp/x86/variance_impl_avx2.c
+++ b/vpx_dsp/x86/variance_impl_avx2.c
@@ -11,6 +11,27 @@
 #include <immintrin.h>  // AVX2
 
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
+};
+
 
 void vpx_get16x16var_avx2(const unsigned char *src_ptr,
                           int source_stride,
@@ -213,3 +234,494 @@
       _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_ref_src, 1));
     }
 }
+
+#define FILTER_SRC(filter) \
+  /* filter the source */ \
+  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
+  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
+  \
+  /* add 8 to source */ \
+  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
+  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
+  \
+  /* divide source by 16 */ \
+  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
+  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
+
+#define MERGE_WITH_SRC(src_reg, reg) \
+  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
+  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
+
+#define LOAD_SRC_DST \
+  /* load source and destination */ \
+  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
+  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
+
+#define AVG_NEXT_SRC(src_reg, size_stride) \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  /* average between current and next stride source */ \
+  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC(src_reg, size_stride) \
+  src_next_reg = _mm256_loadu_si256((__m256i const *) \
+                                   (src + size_stride)); \
+  MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define CALC_SUM_SSE_INSIDE_LOOP \
+  /* expand each byte to 2 bytes */ \
+  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
+  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
+  /* source - dest */ \
+  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
+  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
+  /* calculate sum */ \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
+  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
+  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
+  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
+  /* calculate sse */ \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
+  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
+
+// final calculation to sum and sse
+#define CALC_SUM_AND_SSE \
+  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
+  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
+  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
+  \
+  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
+  \
+  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
+                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
+  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
+  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
+  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
+        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
+
+
+unsigned int vpx_sub_pixel_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             int height,
+                                             unsigned int *sse) {
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        // expend each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        // save current source average
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src_pack = src_reg;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;
+}
+
+unsigned int vpx_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                             int src_stride,
+                                             int x_offset,
+                                             int y_offset,
+                                             const uint8_t *dst,
+                                             int dst_stride,
+                                             const uint8_t *sec,
+                                             int sec_stride,
+                                             int height,
+                                             unsigned int *sse) {
+  __m256i sec_reg;
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    } else if (y_offset == 8) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, src_stride)
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+                 (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, src_stride)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = 8  and y_offset = 0
+  } else if (x_offset == 8) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i src_next_reg, src_avg;
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        // average between previous average to current average
+        src_avg = _mm256_avg_epu8(src_avg, src_reg);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        sec+= sec_stride;
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = 8  and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg, src_avg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      AVG_NEXT_SRC(src_reg, 1)
+      for (i = 0; i < height ; i++) {
+        // save current source average
+        src_avg = src_reg;
+        src+= src_stride;
+        LOAD_SRC_DST
+        AVG_NEXT_SRC(src_reg, 1)
+        MERGE_WITH_SRC(src_avg, src_reg)
+        FILTER_SRC(filter)
+        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_avg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height ; i++) {
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        src+= src_stride;
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = 8
+    } else if (y_offset == 8) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256((__m256i const *)
+               (bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+      FILTER_SRC(filter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(filter)
+        src_reg =  _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // average between previous pack to the current
+        src_pack = _mm256_avg_epu8(src_pack, src_reg);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        sec+= sec_stride;
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    // x_offset = bilin interpolation and y_offset = bilin interpolation
+    } else {
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256((__m256i const *)
+                (bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load source and another source starting from the next
+      // following byte
+      src_reg = _mm256_loadu_si256((__m256i const *) (src));
+      MERGE_NEXT_SRC(src_reg, 1)
+
+      FILTER_SRC(xfilter)
+      // convert each 16 bit to 8 bit to each low and high lane source
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      for (i = 0; i < height ; i++) {
+        src+= src_stride;
+        LOAD_SRC_DST
+        MERGE_NEXT_SRC(src_reg, 1)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        // merge previous pack to current pack source
+        MERGE_WITH_SRC(src_pack, src_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
+        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        src_pack = src_reg;
+        sec+= sec_stride;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst+= dst_stride;
+      }
+    }
+  }
+  CALC_SUM_AND_SSE
+  return sum;
+}
diff --git a/vpx_dsp/x86/variance_impl_mmx.asm b/vpx_dsp/x86/variance_impl_mmx.asm
index a8d7d99..b8ba79b 100644
--- a/vpx_dsp/x86/variance_impl_mmx.asm
+++ b/vpx_dsp/x86/variance_impl_mmx.asm
@@ -11,6 +11,8 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
+%define mmx_filter_shift            7
+
 ;unsigned int vpx_get_mb_ss_mmx( short *src_ptr )
 global sym(vpx_get_mb_ss_mmx) PRIVATE
 sym(vpx_get_mb_ss_mmx):
@@ -52,7 +54,6 @@
         movsxd      rcx, dword ptr [rsp+4]
         add         rax, rcx
 
-
     ; begin epilog
     add rsp, 8
     pop rdi
@@ -62,7 +63,6 @@
     pop         rbp
     ret
 
-
 ;void vpx_get8x8var_mmx
 ;(
 ;    unsigned char *src_ptr,
@@ -83,7 +83,6 @@
     sub         rsp, 16
     ; end prolog
 
-
         pxor        mm5, mm5                    ; Blank mmx6
         pxor        mm6, mm6                    ; Blank mmx7
         pxor        mm7, mm7                    ; Blank mmx7
@@ -117,7 +116,6 @@
         paddd       mm7, mm0                    ; accumulate in mm7
         paddd       mm7, mm2                    ; accumulate in mm7
 
-
         ; Row 2
         movq        mm0, [rax]                  ; Copy eight bytes to mm0
         movq        mm2, mm0                    ; Take copies
@@ -298,7 +296,6 @@
         mov         dword ptr [rdi], edx
         xor         rax, rax    ; return 0
 
-
     ; begin epilog
     add rsp, 16
     pop rbx
@@ -308,8 +305,6 @@
     pop         rbp
     ret
 
-
-
 ;void
 ;vpx_get4x4var_mmx
 ;(
@@ -331,7 +326,6 @@
     sub         rsp, 16
     ; end prolog
 
-
         pxor        mm5, mm5                    ; Blank mmx6
         pxor        mm6, mm6                    ; Blank mmx7
         pxor        mm7, mm7                    ; Blank mmx7
@@ -354,7 +348,6 @@
         movd        mm1, [rbx]                  ; Copy four bytes to mm1
         paddd       mm7, mm0                    ; accumulate in mm7
 
-
         ; Row 2
         movd        mm0, [rax]                  ; Copy four bytes to mm0
         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
@@ -393,7 +386,6 @@
         pmaddwd     mm0, mm0                    ; square and accumulate
         paddd       mm7, mm0                    ; accumulate in mm7
 
-
         ; Now accumulate the final results.
         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
@@ -413,7 +405,6 @@
         mov         dword ptr [rdi], edx
         xor         rax, rax    ; return 0
 
-
     ; begin epilog
     add rsp, 16
     pop rbx
@@ -422,3 +413,332 @@
     UNSHADOW_ARGS
     pop         rbp
     ret
+
+;void vpx_filter_block2d_bil4x4_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vpx_filter_block2d_bil4x4_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil4x4_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+
+        mov             rax,            arg(4) ;HFilter             ;
+        mov             rdx,            arg(5) ;VFilter             ;
+
+        mov             rsi,            arg(0) ;ref_ptr              ;
+        mov             rdi,            arg(2) ;src_ptr              ;
+
+        mov             rcx,            4                   ;
+        pxor            mm0,            mm0                 ;
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+%if ABI_IS_32BIT
+        add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
+%else
+        movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
+        add             rsi, r8
+%endif
+
+.filter_block2d_bil4x4_var_mmx_loop:
+
+        movd            mm1,            [rsi]               ;
+        movd            mm3,            [rsi+1]             ;
+
+        punpcklbw       mm1,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        movq            mm3,            mm5                 ;
+
+        movq            mm5,            mm1                 ;
+        pmullw          mm3,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        paddw           mm1,            mm3                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        movd            mm3,            [rdi]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        paddw           mm6,            mm1                 ;
+
+        pmaddwd         mm1,            mm1                 ;
+        paddd           mm7,            mm1                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(6) ;sum
+        mov             rsi,            arg(7) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+;void vpx_filter_block2d_bil_var_mmx
+;(
+;    unsigned char *ref_ptr,
+;    int ref_pixels_per_line,
+;    unsigned char *src_ptr,
+;    int src_pixels_per_line,
+;    unsigned int Height,
+;    unsigned short *HFilter,
+;    unsigned short *VFilter,
+;    int *sum,
+;    unsigned int *sumsquared
+;)
+global sym(vpx_filter_block2d_bil_var_mmx) PRIVATE
+sym(vpx_filter_block2d_bil_var_mmx):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 9
+    GET_GOT     rbx
+    push rsi
+    push rdi
+    sub         rsp, 16
+    ; end prolog
+
+        pxor            mm6,            mm6                 ;
+        pxor            mm7,            mm7                 ;
+        mov             rax,            arg(5) ;HFilter             ;
+
+        mov             rdx,            arg(6) ;VFilter             ;
+        mov             rsi,            arg(0) ;ref_ptr              ;
+
+        mov             rdi,            arg(2) ;src_ptr              ;
+        movsxd          rcx,            dword ptr arg(4) ;Height              ;
+
+        pxor            mm0,            mm0                 ;
+        movq            mm1,            [rsi]               ;
+
+        movq            mm3,            [rsi+1]             ;
+        movq            mm2,            mm1                 ;
+
+        movq            mm4,            mm3                 ;
+        punpcklbw       mm1,            mm0                 ;
+
+        punpckhbw       mm2,            mm0                 ;
+        pmullw          mm1,            [rax]               ;
+
+        pmullw          mm2,            [rax]               ;
+        punpcklbw       mm3,            mm0                 ;
+
+        punpckhbw       mm4,            mm0                 ;
+        pmullw          mm3,            [rax+8]             ;
+
+        pmullw          mm4,            [rax+8]             ;
+        paddw           mm1,            mm3                 ;
+
+        paddw           mm2,            mm4                 ;
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm2,            mmx_filter_shift    ;
+        movq            mm5,            mm1
+
+        packuswb        mm5,            mm2                 ;
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
+        add             rsi,            r8
+%endif
+
+.filter_block2d_bil_var_mmx_loop:
+
+        movq            mm1,            [rsi]               ;
+        movq            mm3,            [rsi+1]             ;
+
+        movq            mm2,            mm1                 ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm1,            mm0                 ;
+        punpckhbw       mm2,            mm0                 ;
+
+        pmullw          mm1,            [rax]               ;
+        pmullw          mm2,            [rax]               ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        pmullw          mm3,            [rax+8]             ;
+        pmullw          mm4,            [rax+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm1,            mmx_filter_shift    ;
+
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            mm5                 ;
+        movq            mm4,            mm5                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        movq            mm5,            mm1                 ;
+        packuswb        mm5,            mm2                 ;
+
+        pmullw          mm3,            [rdx]               ;
+        pmullw          mm4,            [rdx]               ;
+
+        pmullw          mm1,            [rdx+8]             ;
+        pmullw          mm2,            [rdx+8]             ;
+
+        paddw           mm1,            mm3                 ;
+        paddw           mm2,            mm4                 ;
+
+        paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
+        paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
+
+        psraw           mm1,            mmx_filter_shift    ;
+        psraw           mm2,            mmx_filter_shift    ;
+
+        movq            mm3,            [rdi]               ;
+        movq            mm4,            mm3                 ;
+
+        punpcklbw       mm3,            mm0                 ;
+        punpckhbw       mm4,            mm0                 ;
+
+        psubw           mm1,            mm3                 ;
+        psubw           mm2,            mm4                 ;
+
+        paddw           mm6,            mm1                 ;
+        pmaddwd         mm1,            mm1                 ;
+
+        paddw           mm6,            mm2                 ;
+        pmaddwd         mm2,            mm2                 ;
+
+        paddd           mm7,            mm1                 ;
+        paddd           mm7,            mm2                 ;
+
+%if ABI_IS_32BIT
+        add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
+        add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
+%else
+        movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
+        movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
+        add             rsi,            r8
+        add             rdi,            r9
+%endif
+        sub             rcx,            1                   ;
+        jnz             .filter_block2d_bil_var_mmx_loop       ;
+
+        pxor            mm3,            mm3                 ;
+        pxor            mm2,            mm2                 ;
+
+        punpcklwd       mm2,            mm6                 ;
+        punpckhwd       mm3,            mm6                 ;
+
+        paddd           mm2,            mm3                 ;
+        movq            mm6,            mm2                 ;
+
+        psrlq           mm6,            32                  ;
+        paddd           mm2,            mm6                 ;
+
+        psrad           mm2,            16                  ;
+        movq            mm4,            mm7                 ;
+
+        psrlq           mm4,            32                  ;
+        paddd           mm4,            mm7                 ;
+
+        mov             rdi,            arg(7) ;sum
+        mov             rsi,            arg(8) ;sumsquared
+
+        movd            dword ptr [rdi],          mm2                 ;
+        movd            dword ptr [rsi],          mm4                 ;
+
+    ; begin epilog
+    add rsp, 16
+    pop rdi
+    pop rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+;short mmx_bi_rd[4] = { 64, 64, 64, 64};
+align 16
+mmx_bi_rd:
+    times 4 dw 64
diff --git a/vpx_dsp/x86/variance_mmx.c b/vpx_dsp/x86/variance_mmx.c
index 99dd741..f04f4e2 100644
--- a/vpx_dsp/x86/variance_mmx.c
+++ b/vpx_dsp/x86/variance_mmx.c
@@ -10,12 +10,45 @@
 
 #include "./vpx_dsp_rtcd.h"
 
+#include "vpx_ports/mem.h"
+
+DECLARE_ALIGNED(16, static const int16_t, bilinear_filters_mmx[8][8]) = {
+  { 128, 128, 128, 128,   0,   0,   0,   0 },
+  { 112, 112, 112, 112,  16,  16,  16,  16 },
+  {  96,  96,  96,  96,  32,  32,  32,  32 },
+  {  80,  80,  80,  80,  48,  48,  48,  48 },
+  {  64,  64,  64,  64,  64,  64,  64,  64 },
+  {  48,  48,  48,  48,  80,  80,  80,  80 },
+  {  32,  32,  32,  32,  96,  96,  96,  96 },
+  {  16,  16,  16,  16, 112, 112, 112, 112 }
+};
+
 extern void vpx_get4x4var_mmx(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               unsigned int *sse, int *sum);
 
-unsigned int vpx_variance4x4_mmx(const unsigned char *a, int  a_stride,
-                                 const unsigned char *b, int  b_stride,
+extern void vpx_filter_block2d_bil4x4_var_mmx(const unsigned char *ref_ptr,
+                                              int ref_pixels_per_line,
+                                              const unsigned char *src_ptr,
+                                              int src_pixels_per_line,
+                                              const int16_t *HFilter,
+                                              const int16_t *VFilter,
+                                              int *sum,
+                                              unsigned int *sumsquared);
+
+extern void vpx_filter_block2d_bil_var_mmx(const unsigned char *ref_ptr,
+                                           int ref_pixels_per_line,
+                                           const unsigned char *src_ptr,
+                                           int src_pixels_per_line,
+                                           unsigned int Height,
+                                           const int16_t *HFilter,
+                                           const int16_t *VFilter,
+                                           int *sum,
+                                           unsigned int *sumsquared);
+
+
+unsigned int vpx_variance4x4_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
     unsigned int var;
     int avg;
@@ -25,8 +58,8 @@
     return (var - (((unsigned int)avg * avg) >> 4));
 }
 
-unsigned int vpx_variance8x8_mmx(const unsigned char *a, int  a_stride,
-                                 const unsigned char *b, int  b_stride,
+unsigned int vpx_variance8x8_mmx(const unsigned char *a, int a_stride,
+                                 const unsigned char *b, int b_stride,
                                  unsigned int *sse) {
     unsigned int var;
     int avg;
@@ -37,8 +70,8 @@
     return (var - (((unsigned int)avg * avg) >> 6));
 }
 
-unsigned int vpx_mse16x16_mmx(const unsigned char *a, int  a_stride,
-                              const unsigned char *b, int  b_stride,
+unsigned int vpx_mse16x16_mmx(const unsigned char *a, int a_stride,
+                              const unsigned char *b, int b_stride,
                               unsigned int *sse) {
     unsigned int sse0, sse1, sse2, sse3, var;
     int sum0, sum1, sum2, sum3;
@@ -55,8 +88,8 @@
     return var;
 }
 
-unsigned int vpx_variance16x16_mmx(const unsigned char *a, int  a_stride,
-                                   const unsigned char *b, int  b_stride,
+unsigned int vpx_variance16x16_mmx(const unsigned char *a, int a_stride,
+                                   const unsigned char *b, int b_stride,
                                    unsigned int *sse) {
     unsigned int sse0, sse1, sse2, sse3, var;
     int sum0, sum1, sum2, sum3, avg;
@@ -74,8 +107,8 @@
     return (var - (((unsigned int)avg * avg) >> 8));
 }
 
-unsigned int vpx_variance16x8_mmx(const unsigned char *a, int  a_stride,
-                                  const unsigned char *b, int  b_stride,
+unsigned int vpx_variance16x8_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
     unsigned int sse0, sse1, var;
     int sum0, sum1, avg;
@@ -89,8 +122,8 @@
     return (var - (((unsigned int)avg * avg) >> 7));
 }
 
-unsigned int vpx_variance8x16_mmx(const unsigned char *a, int  a_stride,
-                                  const unsigned char *b, int  b_stride,
+unsigned int vpx_variance8x16_mmx(const unsigned char *a, int a_stride,
+                                  const unsigned char *b, int b_stride,
                                   unsigned int *sse) {
     unsigned int sse0, sse1, var;
     int sum0, sum1, avg;
@@ -105,3 +138,112 @@
 
     return (var - (((unsigned int)avg * avg) >> 7));
 }
+
+uint32_t vpx_sub_pixel_variance4x4_mmx(const uint8_t *a, int a_stride,
+                                       int xoffset, int yoffset,
+                                       const uint8_t *b, int b_stride,
+                                       uint32_t *sse) {
+    int xsum;
+    unsigned int xxsum;
+    vpx_filter_block2d_bil4x4_var_mmx(a, a_stride, b, b_stride,
+                                      bilinear_filters_mmx[xoffset],
+                                      bilinear_filters_mmx[yoffset],
+                                      &xsum, &xxsum);
+    *sse = xxsum;
+    return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+}
+
+
+uint32_t vpx_sub_pixel_variance8x8_mmx(const uint8_t *a, int a_stride,
+                                       int xoffset, int yoffset,
+                                       const uint8_t *b, int b_stride,
+                                       uint32_t *sse) {
+    int xsum;
+    uint32_t xxsum;
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum, &xxsum);
+    *sse = xxsum;
+    return (xxsum - (((uint32_t)xsum * xsum) >> 6));
+}
+
+uint32_t vpx_sub_pixel_variance16x16_mmx(const uint8_t *a, int a_stride,
+                                         int xoffset, int yoffset,
+                                         const uint8_t *b, int b_stride,
+                                         uint32_t *sse) {
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum0, &xxsum0);
+
+    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 16,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 8));
+}
+
+uint32_t vpx_sub_pixel_variance16x8_mmx(const uint8_t *a, int a_stride,
+                                        int xoffset, int yoffset,
+                                        const uint8_t *b, int b_stride,
+                                        uint32_t *sse) {
+    int xsum0, xsum1;
+    unsigned int xxsum0, xxsum1;
+
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 8,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum0, &xxsum0);
+
+    vpx_filter_block2d_bil_var_mmx(a + 8, a_stride, b + 8, b_stride, 8,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum1, &xxsum1);
+
+    xsum0 += xsum1;
+    xxsum0 += xxsum1;
+
+    *sse = xxsum0;
+    return (xxsum0 - (((uint32_t)xsum0 * xsum0) >> 7));
+}
+
+uint32_t vpx_sub_pixel_variance8x16_mmx(const uint8_t *a, int a_stride,
+                                        int xoffset, int yoffset,
+                                        const uint8_t *b, int b_stride,
+                                        uint32_t *sse) {
+    int xsum;
+    unsigned int xxsum;
+    vpx_filter_block2d_bil_var_mmx(a, a_stride, b, b_stride, 16,
+                                   bilinear_filters_mmx[xoffset],
+                                   bilinear_filters_mmx[yoffset],
+                                   &xsum, &xxsum);
+    *sse = xxsum;
+    return (xxsum - (((uint32_t)xsum * xsum) >> 7));
+}
+
+uint32_t vpx_variance_halfpixvar16x16_h_mmx(const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 0, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_v_mmx(const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 0, 4, b, b_stride, sse);
+}
+
+uint32_t vpx_variance_halfpixvar16x16_hv_mmx(const uint8_t *a, int a_stride,
+                                             const uint8_t *b, int b_stride,
+                                             uint32_t *sse) {
+  return vpx_sub_pixel_variance16x16_mmx(a, a_stride, 4, 4, b, b_stride, sse);
+}
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 6256bc5..e6c9365 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -307,3 +307,171 @@
   vpx_variance16x16_sse2(src, src_stride, ref, ref_stride, sse);
   return *sse;
 }
+
+#if CONFIG_USE_X86INC
+// The 2 unused parameters are place holders for PIC enabled build.
+// These definitions are for functions defined in subpel_variance.asm
+#define DECL(w, opt) \
+  int vpx_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+                                          ptrdiff_t src_stride, \
+                                          int x_offset, int y_offset, \
+                                          const uint8_t *dst, \
+                                          ptrdiff_t dst_stride, \
+                                          int height, unsigned int *sse, \
+                                          void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+  DECL(4, opt2); \
+  DECL(8, opt1); \
+  DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                     int src_stride, \
+                                                     int x_offset, \
+                                                     int y_offset, \
+                                                     const uint8_t *dst, \
+                                                     int dst_stride, \
+                                                     unsigned int *sse_ptr) { \
+  unsigned int sse; \
+  int se = vpx_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                y_offset, dst, dst_stride, \
+                                                h, &sse, NULL, NULL); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                   x_offset, y_offset, \
+                                                   dst + 16, dst_stride, \
+                                                   h, &sse2, NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 32, dst_stride, \
+                                                 h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                 x_offset, y_offset, \
+                                                 dst + 48, dst_stride, \
+                                                 h, &sse2, NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sse_ptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+
+// The 2 unused parameters are place holders for PIC enabled build.
+#define DECL(w, opt) \
+int vpx_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+                                            ptrdiff_t src_stride, \
+                                            int x_offset, int y_offset, \
+                                            const uint8_t *dst, \
+                                            ptrdiff_t dst_stride, \
+                                            const uint8_t *sec, \
+                                            ptrdiff_t sec_stride, \
+                                            int height, unsigned int *sse, \
+                                            void *unused0, void *unused)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vpx_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+                                                         int src_stride, \
+                                                         int x_offset, \
+                                                         int y_offset, \
+                                                         const uint8_t *dst, \
+                                                         int dst_stride, \
+                                                         unsigned int *sseptr, \
+                                                         const uint8_t *sec) { \
+  unsigned int sse; \
+  int se = vpx_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+                                                    y_offset, dst, dst_stride, \
+                                                    sec, w, h, &sse, NULL, \
+                                                    NULL); \
+  if (w > wf) { \
+    unsigned int sse2; \
+    int se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+                                                       x_offset, y_offset, \
+                                                       dst + 16, dst_stride, \
+                                                       sec + 16, w, h, &sse2, \
+                                                       NULL, NULL); \
+    se += se2; \
+    sse += sse2; \
+    if (w > wf * 2) { \
+      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 32, dst_stride, \
+                                                     sec + 32, w, h, &sse2, \
+                                                     NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+      se2 = vpx_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+                                                     x_offset, y_offset, \
+                                                     dst + 48, dst_stride, \
+                                                     sec + 48, w, h, &sse2, \
+                                                     NULL, NULL); \
+      se += se2; \
+      sse += sse2; \
+    } \
+  } \
+  *sseptr = sse; \
+  return sse - ((cast se * se) >> (wlog2 + hlog2)); \
+}
+
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (uint32_t)); \
+FN(16,  8, 16, 4, 3, opt1, (uint32_t)); \
+FN(8,  16,  8, 3, 4, opt1, (uint32_t)); \
+FN(8,   8,  8, 3, 3, opt1, (uint32_t)); \
+FN(8,   4,  8, 3, 2, opt1, (uint32_t)); \
+FN(4,   8,  4, 2, 3, opt2, (uint32_t)); \
+FN(4,   4,  4, 2, 2, opt2, (uint32_t))
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
+#endif  // CONFIG_USE_X86INC
diff --git a/vpx_thread/vpx_thread.c b/vpx_util/vpx_thread.c
similarity index 100%
rename from vpx_thread/vpx_thread.c
rename to vpx_util/vpx_thread.c
diff --git a/vpx_thread/vpx_thread.h b/vpx_util/vpx_thread.h
similarity index 100%
rename from vpx_thread/vpx_thread.h
rename to vpx_util/vpx_thread.h
diff --git a/vpx_thread/vpx_thread.mk b/vpx_util/vpx_util.mk
similarity index 80%
rename from vpx_thread/vpx_thread.mk
rename to vpx_util/vpx_util.mk
index 0a4a364..1161125 100644
--- a/vpx_thread/vpx_thread.mk
+++ b/vpx_util/vpx_util.mk
@@ -8,6 +8,6 @@
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
 
-THREAD_SRCS-yes += vpx_thread.mk
-THREAD_SRCS-yes += vpx_thread.c
-THREAD_SRCS-yes += vpx_thread.h
+UTIL_SRCS-yes += vpx_util.mk
+UTIL_SRCS-yes += vpx_thread.c
+UTIL_SRCS-yes += vpx_thread.h
diff --git a/vpxenc.c b/vpxenc.c
index 8bbb9fc..cb9b8fe 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -396,6 +396,12 @@
     NULL, "gf-cbr-boost", 1, "Boost for Golden Frame in CBR mode (pct)");
 static const arg_def_t max_inter_rate_pct = ARG_DEF(
     NULL, "max-inter-rate", 1, "Max P-frame bitrate (pct)");
+static const arg_def_t min_gf_interval = ARG_DEF(
+    NULL, "min-gf-interval", 1,
+    "min gf/arf frame interval (default 0, indicating in-built behavior)");
+static const arg_def_t max_gf_interval = ARG_DEF(
+    NULL, "max-gf-interval", 1,
+    "max gf/arf frame interval (default 0, indicating in-built behavior)");
 
 static const struct arg_enum_list color_space_enum[] = {
   { "unknown", VPX_CS_UNKNOWN },
@@ -445,6 +451,7 @@
   &gf_cbr_boost_pct, &lossless,
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
   &noise_sens, &tune_content, &input_color_space,
+  &min_gf_interval, &max_gf_interval,
 #if CONFIG_VP9 && CONFIG_VP9_HIGHBITDEPTH
   &bitdeptharg, &inbitdeptharg,
 #endif
@@ -460,6 +467,7 @@
   VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE,
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
   VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
+  VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
   0
 };
 #endif