Merge "Fix for loop filter selection procedure" into nextgenv2
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 0000000..03e9b7d
--- /dev/null
+++ b/test/blend_a64_mask_1d_test.cc
@@ -0,0 +1,374 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
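+  // The hmask/vmask functions take a one-dimensional mask, so the mask
+  // buffer only needs a single row (or column) of entries.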
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth;
+
+  BlendA64Mask1DTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+  virtual ~BlendA64Mask1DTest() {}
+
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+
+  void Common() {
+    w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+
+    dst_offset_ = rng_(33);
+    dst_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = rng_(33);
+    src0_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = rng_(33);
+    src1_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    T *p_src0;
+    T *p_src1;
+
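+    // Randomly alias the destination with one of the sources to check that
+    // in-place blending works.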
+    switch (rng_(3)) {
+      case 0:   // Separate sources
+        p_src0 = src0_;
+        p_src1 = src1_;
+        break;
+      case 1:   // src0 == dst
+        p_src0 = dst_tst_;
+        src0_stride_ = dst_stride_;
+        src0_offset_ = dst_offset_;
+        p_src1 = src1_;
+        break;
+      case 2:   // src1 == dst
+        p_src0 = src0_;
+        p_src1 = dst_tst_;
+        src1_stride_ = dst_stride_;
+        src1_offset_ = dst_offset_;
+        break;
+      default:
+        FAIL();
+    }
+
+    Execute(p_src0, p_src1);
+
+    for (int r = 0 ; r < h_ ; ++r) {
+      for (int c = 0 ; c < w_ ; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+      }
+    }
+  }
+
+  ACMRandom rng_;
+
+  T dst_ref_[kBufSize];
+  T dst_tst_[kBufSize];
+  size_t dst_stride_;
+  size_t dst_offset_;
+
+  T src0_[kBufSize];
+  size_t src0_stride_;
+  size_t src0_offset_;
+
+  T src1_[kBufSize];
+  size_t src1_stride_;
+  size_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];
+
+  int w_;
+  int h_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
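+// Note: the mask holds w entries for the hmask functions (one per column)
+// and h entries for the vmask functions (one per row).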
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, int h, int w);
+
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+    ref_func_(dst_ref_ + dst_offset_, dst_stride_,
+              p_src0 + src0_offset_, src0_stride_,
+              p_src1 + src1_offset_, src1_stride_,
+              mask_, h_, w_);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(dst_tst_ + dst_offset_, dst_stride_,
+                p_src0 + src0_offset_, src0_stride_,
+                p_src1 + src1_offset_, src1_stride_,
+                mask_, h_, w_));
+  }
+};
+
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
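+    // Restrict pixels to 254/255 to stress rounding at the top of the 8-bit
+    // range.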
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(2) + 254;
+      dst_tst_[i] = rng_(2) + 254;
+      src0_[i] = rng_(2) + 254;
+      src1_[i] = rng_(2) + 254;
+    }
+
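+    // Mask values of VPX_BLEND_A64_MAX_ALPHA - 1 and VPX_BLEND_A64_MAX_ALPHA
+    // exercise the fully-src0-weighted edge case.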
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+static void blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
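+  // Build the reference by expanding the 1D horizontal mask into a full 2D
+  // mask and deferring to the already-verified 2D C implementation.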
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[col];
+
+  vpx_blend_a64_mask_c(dst, dst_stride,
+                       src0, src0_stride,
+                       src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+                       h, w, 0, 0);
+}
+
+static void blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[row];
+
+  vpx_blend_a64_mask_c(dst, dst_stride,
+                       src0, src0_stride,
+                       src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+                       h, w, 0, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(
+  C, BlendA64Mask1DTest8B,
+  ::testing::Values(
+    make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_c),
+    make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1, BlendA64Mask1DTest8B,
+  ::testing::Values(
+    make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_sse4_1),
+    make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, int h, int w, int bd);
+
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+    ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+              CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+              CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+              mask_, h_, w_, bit_depth_);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+                CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                mask_, h_, w_, bit_depth_));
+  }
+
+  int bit_depth_;
+};
+
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+    case 0:
+      bit_depth_ = 8;
+      break;
+    case 1:
+      bit_depth_ = 10;
+      break;
+    default:
+      bit_depth_ = 12;
+      break;
+    }
+
+    const int hi = 1 << bit_depth_;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi);
+      dst_tst_[i] = rng_(hi);
+      src0_[i] = rng_(hi);
+      src1_[i] = rng_(hi);
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+  for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+    case 0:
+      bit_depth_ = 8;
+      break;
+    case 1:
+      bit_depth_ = 10;
+      break;
+    default:
+      bit_depth_ = 12;
+      break;
+    }
+
+    const int hi = 1 << bit_depth_;
+    const int lo = hi - 2;
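+    // Samples take only the two largest values representable at this bit
+    // depth.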
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi - lo) + lo;
+      dst_tst_[i] = rng_(hi - lo) + lo;
+      src0_[i] = rng_(hi - lo) + lo;
+      src1_[i] = rng_(hi - lo) + lo;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+static void highbd_blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[col];
+
+  vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+                              src0, src0_stride,
+                              src1, src1_stride,
+                              &mask2d[0][0],
+                              BlendA64Mask1DTestHBD::kMaxMaskSize,
+                              h, w, 0, 0, bd);
+}
+
+static void highbd_blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[row];
+
+  vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+                              src0, src0_stride,
+                              src1, src1_stride,
+                              &mask2d[0][0],
+                              BlendA64Mask1DTestHBD::kMaxMaskSize,
+                              h, w, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_CASE_P(
+  C, BlendA64Mask1DTestHBD,
+  ::testing::Values(
+    make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_c),
+    make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1, BlendA64Mask1DTestHBD,
+  ::testing::Values(
+    make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_sse4_1),
+    make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/blend_mask6_test.cc b/test/blend_a64_mask_test.cc
similarity index 77%
rename from test/blend_mask6_test.cc
rename to test/blend_a64_mask_test.cc
index 6afaad7..08ee91d 100644
--- a/test/blend_mask6_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -26,6 +26,8 @@
 #include "test/acm_random.h"
 #include "vp10/common/enums.h"
 
+#include "vpx_dsp/blend.h"
+
 using libvpx_test::ACMRandom;
 using libvpx_test::FunctionEquivalenceTest;
 using std::tr1::make_tuple;
@@ -33,7 +35,7 @@
 namespace {
 
 template<typename F, typename T>
-class BlendMask6Test : public FunctionEquivalenceTest<F> {
+class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
  protected:
   static const int kIterations = 10000;
   static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
@@ -42,15 +44,15 @@
   static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
   static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
 
-  BlendMask6Test() : rng_(ACMRandom::DeterministicSeed()) {}
+  BlendA64MaskTest() : rng_(ACMRandom::DeterministicSeed()) {}
 
-  virtual ~BlendMask6Test() {}
+  virtual ~BlendA64MaskTest() {}
 
-  virtual void Execute(T *p_src0, T *p_src1) = 0;
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
 
   void Common() {
-    w_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
-    h_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
+    w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
 
     subx_ = rng_(2);
     suby_ = rng_(2);
@@ -131,14 +133,14 @@
 //////////////////////////////////////////////////////////////////////////////
 
 typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
-                      uint8_t *src0, uint32_t src0_stride,
-                      uint8_t *src1, uint32_t src1_stride,
-                      const uint8_t *mask, uint32_t mask_stride,
-                      int h, int w, int suby, int subx);
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, uint32_t mask_stride,
+                    int h, int w, int suby, int subx);
 
-class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
  protected:
-  void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
     ref_func_(dst_ref_ + dst_offset_, dst_stride_,
               p_src0 + src0_offset_, src0_stride_,
               p_src1 + src1_offset_, src1_stride_,
@@ -153,7 +155,7 @@
   }
 };
 
-TEST_P(BlendMask6Test8B, RandomValues) {
+TEST_P(BlendA64MaskTest8B, RandomValues) {
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
     for (int i = 0 ; i < kBufSize ; ++i) {
       dst_ref_[i] = rng_.Rand8();
@@ -164,13 +166,13 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(65);
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
 
     Common();
   }
 }
 
-TEST_P(BlendMask6Test8B, ExtremeValues) {
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
     for (int i = 0 ; i < kBufSize ; ++i) {
       dst_ref_[i] = rng_(2) + 254;
@@ -180,7 +182,7 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(2) + 63;
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
 
     Common();
   }
@@ -188,8 +190,9 @@
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
-  SSE4_1_C_COMPARE, BlendMask6Test8B,
-  ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+  SSE4_1_C_COMPARE, BlendA64MaskTest8B,
+  ::testing::Values(make_tuple(vpx_blend_a64_mask_c,
+                               vpx_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -198,14 +201,14 @@
 //////////////////////////////////////////////////////////////////////////////
 
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
-                       uint8_t *src0, uint32_t src0_stride,
-                       uint8_t *src1, uint32_t src1_stride,
-                       const uint8_t *mask, uint32_t mask_stride,
-                       int h, int w, int suby, int subx, int bd);
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, uint32_t mask_stride,
+                     int h, int w, int suby, int subx, int bd);
 
-class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
  protected:
-  void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
     ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
               CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
               CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
@@ -223,7 +226,7 @@
   int bit_depth_;
 };
 
-TEST_P(BlendMask6TestHBD, RandomValues) {
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
     switch (rng_(3)) {
     case 0:
@@ -247,13 +250,13 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(65);
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
 
     Common();
   }
 }
 
-TEST_P(BlendMask6TestHBD, ExtremeValues) {
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
   for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
     switch (rng_(3)) {
     case 0:
@@ -278,7 +281,7 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(65);
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
 
     Common();
   }
@@ -286,9 +289,9 @@
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
-  SSE4_1_C_COMPARE, BlendMask6TestHBD,
-  ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
-                               &vpx_highbd_blend_mask6_sse4_1)));
+  SSE4_1_C_COMPARE, BlendA64MaskTestHBD,
+  ::testing::Values(make_tuple(vpx_highbd_blend_a64_mask_c,
+                               vpx_highbd_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index cd0b136..753a7e4 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -188,8 +188,8 @@
 
   const unsigned int w_y = img1->d_w;
   const unsigned int h_y = img1->d_h;
-  const unsigned int w_uv = ROUNDZ_POWER_OF_TWO(w_y, img1->x_chroma_shift);
-  const unsigned int h_uv = ROUNDZ_POWER_OF_TWO(h_y, img1->y_chroma_shift);
+  const unsigned int w_uv = ROUND_POWER_OF_TWO(w_y, img1->x_chroma_shift);
+  const unsigned int h_uv = ROUND_POWER_OF_TWO(h_y, img1->y_chroma_shift);
 
   if (img1->fmt != img2->fmt
       || img1->cs != img2->cs
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
index 9d8c2a2..95d56ae 100644
--- a/test/obmc_sad_test.cc
+++ b/test/obmc_sad_test.cc
@@ -29,7 +29,7 @@
 static const int kIterations = 1000;
 static const int kMaskMax = 64;
 
-typedef unsigned int (*ObmcSadF)(const uint8_t *ref, int ref_stride,
+typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
                                  const int32_t *wsrc, const int32_t *mask);
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -45,42 +45,42 @@
 };
 
 TEST_P(ObmcSadTest, RandomValues) {
-  DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
 
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = rng_(MAX_SB_SIZE + 1);
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
 
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = rng_.Rand8();
+      pre[i] = rng_.Rand8();
       wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
       mask[i] = rng_(kMaskMax * kMaskMax + 1);
     }
 
-    const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask);
-    const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask);
+    const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask);
+    const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask);
 
     ASSERT_EQ(ref_res, tst_res);
   }
 }
 
 TEST_P(ObmcSadTest, ExtremeValues) {
-  DECLARE_ALIGNED(32, uint8_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
 
   for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = iter;
+    const int pre_stride = iter;
 
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = UINT8_MAX;
+      pre[i] = UINT8_MAX;
       wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
       mask[i] = kMaskMax * kMaskMax;
     }
 
-    const unsigned int ref_res = ref_func_(ref, ref_stride, wsrc, mask);
-    const unsigned int tst_res = tst_func_(ref, ref_stride, wsrc, mask);
+    const unsigned int ref_res = ref_func_(pre, pre_stride, wsrc, mask);
+    const unsigned int tst_res = tst_func_(pre, pre_stride, wsrc, mask);
 
     ASSERT_EQ(ref_res, tst_res);
   }
@@ -126,22 +126,22 @@
 };
 
 TEST_P(ObmcSadHBDTest, RandomValues) {
-  DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
 
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = rng_(MAX_SB_SIZE + 1);
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
 
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = rng_(1<<12);
+      pre[i] = rng_(1<<12);
       wsrc[i] = rng_(1<<12) * rng_(kMaskMax * kMaskMax + 1);
       mask[i] = rng_(kMaskMax * kMaskMax + 1);
     }
 
-    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
-    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
 
     ASSERT_EQ(ref_res, tst_res);
@@ -149,22 +149,22 @@
 }
 
 TEST_P(ObmcSadHBDTest, ExtremeValues) {
-  DECLARE_ALIGNED(32, uint16_t, ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
   DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
 
   for (int iter = 0 ; iter < MAX_SB_SIZE && !HasFatalFailure() ; ++iter) {
-    const int ref_stride = iter;
+    const int pre_stride = iter;
 
     for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
-      ref[i] = (1 << 12) - 1;
+      pre[i] = (1 << 12) - 1;
       wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
       mask[i] = kMaskMax * kMaskMax;
     }
 
-    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int ref_res = ref_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
-    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(ref), ref_stride,
+    const unsigned int tst_res = tst_func_(CONVERT_TO_BYTEPTR(pre), pre_stride,
                                            wsrc, mask);
 
     ASSERT_EQ(ref_res, tst_res);
diff --git a/test/test.mk b/test/test.mk
index fcd565c..67fe705 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -178,11 +178,12 @@
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_1d_test.cc
 
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
 endif
 
diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
index 7999087..ec77035 100644
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc
@@ -24,12 +24,25 @@
 typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
                               int, int, const InterpFilterParams,
                               const int, int, int);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*hbd_conv_filter_t)(const uint16_t*, int, uint16_t*, int,
+                                  int, int, const InterpFilterParams,
+                                  const int, int, int, int);
+#endif
+
 // Test parameter list:
 //  <convolve_horiz_func, convolve_vert_func,
 //  <width, height>, filter_params, subpel_x_q4, avg>
 typedef tuple<int, int> BlockDimension;
 typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
               int, int> ConvParams;
+#if CONFIG_VP9_HIGHBITDEPTH
+// Test parameter list:
+//  <convolve_horiz_func, convolve_vert_func,
+//  <width, height>, filter_params, subpel_x_q4, avg, bit_depth>
+typedef tuple<hbd_conv_filter_t, hbd_conv_filter_t, BlockDimension,
+              INTERP_FILTER, int, int, int> HbdConvParams;
+#endif
 
 // Note:
 //  src_ and src_ref_ have special boundary requirement
@@ -75,11 +88,8 @@
   void RunVertFilterBitExactCheck();
 
  private:
-  void PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
-                        uint8_t *dst, uint8_t *dst_ref,
-                        int w, int h);
-  void DiffFilterBuffer(const uint8_t *buf, const uint8_t *buf_ref,
-                        int w, int h, int fgroup, int findex);
+  void PrepFilterBuffer(int w, int h);
+  void DiffFilterBuffer();
   conv_filter_t conv_horiz_;
   conv_filter_t conv_vert_;
   uint8_t *alloc_;
@@ -94,18 +104,16 @@
   int avg_;
 };
 
-void VP10ConvolveOptimzTest::PrepFilterBuffer(uint8_t *src, uint8_t *src_ref,
-                                              uint8_t *dst, uint8_t *dst_ref,
-                                              int w, int h) {
+void VP10ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
   int r, c;
   ACMRandom rnd(ACMRandom::DeterministicSeed());
 
   memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
 
-  uint8_t *src_ptr = src;
-  uint8_t *dst_ptr = dst;
-  uint8_t *src_ref_ptr = src_ref;
-  uint8_t *dst_ref_ptr = dst_ref;
+  uint8_t *src_ptr = src_;
+  uint8_t *dst_ptr = dst_;
+  uint8_t *src_ref_ptr = src_ref_;
+  uint8_t *dst_ref_ptr = dst_ref_;
 
   for (r = 0; r < height_; ++r) {
     for (c = 0; c < width_; ++c) {
@@ -121,21 +129,17 @@
   }
 }
 
-void VP10ConvolveOptimzTest::DiffFilterBuffer(const uint8_t *buf,
-                                              const uint8_t *buf_ref,
-                                              int w, int h,
-                                              int filter_group,
-                                              int filter_index) {
+void VP10ConvolveOptimzTest::DiffFilterBuffer() {
   int r, c;
-  const uint8_t *dst_ptr = buf;
-  const uint8_t *dst_ref_ptr = buf_ref;
-  for (r = 0; r < h; ++r) {
-    for (c = 0; c < w; ++c) {
+  const uint8_t *dst_ptr = dst_;
+  const uint8_t *dst_ref_ptr = dst_ref_;
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
       EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
       << "Error at row: " << r << " col: " << c << " "
-      << "w = " << w << " " << "h = " << h << " "
-      << "filter group index = " << filter_group << " "
-      << "filter index = " << filter_index;
+      << "w = " << width_ << " " << "h = " << height_ << " "
+      << "filter group index = " << filter_ << " "
+      << "filter index = " << subpel_;
     }
     dst_ptr += stride;
     dst_ref_ptr += stride;
@@ -143,7 +147,7 @@
 }
 
 void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
 
   InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
 
@@ -153,14 +157,14 @@
   conv_horiz_(src_, stride, dst_, stride, width_, height_,
               filter_params, subpel_, x_step_q4, avg_);
 
-  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
+  DiffFilterBuffer();
 
   // Note:
   // Here we need to calculate a height different from the specified one and
   // test again.
   int intermediate_height =
       (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
 
   vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
                         intermediate_height, filter_params, subpel_, x_step_q4,
@@ -170,12 +174,11 @@
               intermediate_height, filter_params, subpel_, x_step_q4,
               avg_);
 
-  DiffFilterBuffer(dst_, dst_ref_, width_, intermediate_height, filter_,
-                   subpel_);
+  DiffFilterBuffer();
 }
 
 void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
-  PrepFilterBuffer(src_, src_ref_, dst_, dst_ref_, testMaxBlk, testMaxBlk);
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
 
   InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
 
@@ -185,7 +188,7 @@
   conv_vert_(src_, stride, dst_, stride, width_, height_,
              filter_params, subpel_, x_step_q4, avg_);
 
-  DiffFilterBuffer(dst_, dst_ref_, width_, height_, filter_, subpel_);
+  DiffFilterBuffer();
 }
 
 TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
@@ -197,6 +200,7 @@
 
 using std::tr1::make_tuple;
 
+#if (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_EXT_INTERP
 const BlockDimension kBlockDim[] = {
   make_tuple(2, 2),
   make_tuple(2, 4),
@@ -218,14 +222,15 @@
   make_tuple(128, 128),
 };
 
-#if HAVE_SSSE3 && CONFIG_EXT_INTERP
 // 10/12-tap filters
 const INTERP_FILTER kFilter[] = {6, 4, 2};
 
 const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
 
 const int kAvg[] = {0, 1};
+#endif
 
+#if HAVE_SSSE3 && CONFIG_EXT_INTERP
 INSTANTIATE_TEST_CASE_P(
     SSSE3, VP10ConvolveOptimzTest,
     ::testing::Combine(
@@ -236,4 +241,167 @@
          ::testing::ValuesIn(kSubpelQ4),
          ::testing::ValuesIn(kAvg)));
 #endif  // HAVE_SSSE3 && CONFIG_EXT_INTERP
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef ::testing::TestWithParam<HbdConvParams> TestWithHbdConvParams;
+class VP10HbdConvolveOptimzTest : public TestWithHbdConvParams {
+ public:
+  virtual ~VP10HbdConvolveOptimzTest() {}
+  virtual void SetUp() {
+    conv_horiz_ = GET_PARAM(0);
+    conv_vert_ = GET_PARAM(1);
+    BlockDimension block = GET_PARAM(2);
+    width_ = std::tr1::get<0>(block);
+    height_ = std::tr1::get<1>(block);
+    filter_ = GET_PARAM(3);
+    subpel_ = GET_PARAM(4);
+    avg_ = GET_PARAM(5);
+    bit_depth_ = GET_PARAM(6);
+
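+    // alloc_ holds four maxBlockSize blocks: src, src_ref, dst and dst_ref.
+    // src_ is offset into its block so that the filters can read the
+    // boundary pixels they require.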
+    alloc_ = new uint16_t[maxBlockSize * 4];
+    src_ = alloc_ + (vertiOffset * maxWidth);
+    src_ += horizOffset;
+    src_ref_ = src_ + maxBlockSize;
+
+    dst_ = alloc_ + 2 * maxBlockSize;
+    dst_ref_ = alloc_ + 3 * maxBlockSize;
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunHorizFilterBitExactCheck();
+  void RunVertFilterBitExactCheck();
+
+ private:
+  void PrepFilterBuffer(int w, int h);
+  void DiffFilterBuffer();
+  hbd_conv_filter_t conv_horiz_;
+  hbd_conv_filter_t conv_vert_;
+  uint16_t *alloc_;
+  uint16_t *src_;
+  uint16_t *dst_;
+  uint16_t *src_ref_;
+  uint16_t *dst_ref_;
+  int width_;
+  int height_;
+  int filter_;
+  int subpel_;
+  int avg_;
+  int bit_depth_;
+};
+
+void VP10HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+  int r, c;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
+
+  uint16_t *src_ptr = src_;
+  uint16_t *dst_ptr = dst_;
+  uint16_t *dst_ref_ptr = dst_ref_;
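+  // Constrain random pixel values to the active bit depth.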
+  uint16_t hbd_mask = (1 << bit_depth_) - 1;
+
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      src_ptr[c] = rnd.Rand16() & hbd_mask;
+      dst_ptr[c] = rnd.Rand16() & hbd_mask;
+      dst_ref_ptr[c] = dst_ptr[c];
+    }
+    src_ptr += stride;
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10HbdConvolveOptimzTest::DiffFilterBuffer() {
+  int r, c;
+  const uint16_t *dst_ptr = dst_;
+  const uint16_t *dst_ref_ptr = dst_ref_;
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      EXPECT_EQ((uint16_t)dst_ref_ptr[c], (uint16_t)dst_ptr[c])
+      << "Error at row: " << r << " col: " << c << " "
+      << "w = " << width_ << " " << "h = " << height_ << " "
+      << "filter group index = " << filter_ << " "
+      << "filter index = " << subpel_ << " "
+      << "bit depth = " << bit_depth_;
+    }
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
+                               height_, filter_params, subpel_, x_step_q4,
+                               avg_, bit_depth_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, height_,
+              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+
+  // Note:
+  // Here we need to calculate a height different from the specified one and
+  // test again.
+  int intermediate_height =
+      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
+                               intermediate_height, filter_params, subpel_,
+                               x_step_q4, avg_, bit_depth_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
+              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+}
+
+void VP10HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
+                              filter_params, subpel_, x_step_q4, avg_,
+                              bit_depth_);
+
+  conv_vert_(src_, stride, dst_, stride, width_, height_,
+             filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+}
+
+TEST_P(VP10HbdConvolveOptimzTest, HorizBitExactCheck) {
+  RunHorizFilterBitExactCheck();
+}
+TEST_P(VP10HbdConvolveOptimzTest, VertBitExactCheck) {
+  RunVertFilterBitExactCheck();
+}
+
+#if HAVE_SSE4_1 && CONFIG_EXT_INTERP
+
+const int kBitdepth[] = {10, 12};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HbdConvolveOptimzTest,
+    ::testing::Combine(
+         ::testing::Values(vp10_highbd_convolve_horiz_sse4_1),
+         ::testing::Values(vp10_highbd_convolve_vert_sse4_1),
+         ::testing::ValuesIn(kBlockDim),
+         ::testing::ValuesIn(kFilter),
+         ::testing::ValuesIn(kSubpelQ4),
+         ::testing::ValuesIn(kAvg),
+         ::testing::ValuesIn(kBitdepth)));
+#endif  // HAVE_SSE4_1 && CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/vp10_wedge_utils_test.cc b/test/vp10_wedge_utils_test.cc
index 930a598..7a541b2 100644
--- a/test/vp10_wedge_utils_test.cc
+++ b/test/vp10_wedge_utils_test.cc
@@ -104,7 +104,7 @@
       p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
     }
 
-    vpx_blend_mask6(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+    vpx_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
 
     vpx_subtract_block(h, w, r0, w, s, w, p0, w);
     vpx_subtract_block(h, w, r1, w, s, w, p1, w);
diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index 5dde3ab..8427237 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -342,3 +342,25 @@
   (void)index;
   return NULL;
 }
+
+#if CONFIG_VP9_HIGHBITDEPTH
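+// Returns the high bit-depth vertical filter coefficients in the layout
+// expected by the SSE4.1 signal-direction kernels, or NULL if no such
+// layout applies to the given filter.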
+HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index) {
+#if CONFIG_EXT_INTERP && HAVE_SSE4_1
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
+    return &sub_pel_filters_12sharp_highbd_ver_signal_dir[index][0];
+  }
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
+    return &sub_pel_filters_10sharp_highbd_ver_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP && HAVE_SSE4_1
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
+    return &sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+#endif
diff --git a/vp10/common/filter.h b/vp10/common/filter.h
index 591ac4d..5ebf2a5 100644
--- a/vp10/common/filter.h
+++ b/vp10/common/filter.h
@@ -95,6 +95,10 @@
 #if USE_TEMPORALFILTER_12TAP
 extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
 extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const
+int16_t sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8];
+#endif
 #endif
 
 #if CONFIG_EXT_INTERP
@@ -102,15 +106,26 @@
 extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
 extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
 extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8];
+extern const int16_t sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8];
+#endif
 #endif
 
 typedef const int8_t (*SubpelFilterCoeffs)[16];
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef const int16_t (*HbdSubpelFilterCoeffs)[8];
+#endif
 
 SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
     const InterpFilterParams p, int index);
 
 SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
     const InterpFilterParams p, int index);
+#if CONFIG_VP9_HIGHBITDEPTH
+HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index);
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/idct.h b/vp10/common/idct.h
index ffdad0c..5d52314 100644
--- a/vp10/common/idct.h
+++ b/vp10/common/idct.h
@@ -57,6 +57,12 @@
                      int eob);
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob);
+void vp10_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                      int eob);
+void vp10_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                        int eob);
+void vp10_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                        int eob);
 
 void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type, int lossless);
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 70cf5e7..53fd1a6 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/blend.h"
 
 #include "vp10/common/blockd.h"
 #include "vp10/common/reconinter.h"
@@ -448,8 +449,8 @@
 #if CONFIG_SUPERTX
 static void build_masked_compound_wedge_extend(
     uint8_t *dst, int dst_stride,
-    uint8_t *src0, int src0_stride,
-    uint8_t *src1, int src1_stride,
+    const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
     int wedge_index,
     int wedge_sign,
     BLOCK_SIZE sb_type,
@@ -459,18 +460,18 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
      wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  vpx_blend_mask6(dst, dst_stride,
-                  src0, src0_stride,
-                  src1, src1_stride,
-                  mask, MASK_MASTER_STRIDE,
-                  h, w, subh, subw);
+  vpx_blend_a64_mask(dst, dst_stride,
+                     src0, src0_stride,
+                     src1, src1_stride,
+                     mask, MASK_MASTER_STRIDE,
+                     h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void build_masked_compound_wedge_extend_highbd(
     uint8_t *dst_8, int dst_stride,
-    uint8_t *src0_8, int src0_stride,
-    uint8_t *src1_8, int src1_stride,
+    const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
     int wedge_index, int wedge_sign,
     BLOCK_SIZE sb_type,
     int wedge_offset_x, int wedge_offset_y,
@@ -479,52 +480,54 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
       wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  vpx_highbd_blend_mask6(dst_8, dst_stride,
-                         src0_8, src0_stride,
-                         src1_8, src1_stride,
-                         mask, MASK_MASTER_STRIDE,
-                         h, w, subh, subw, bd);
+  vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+                            src0_8, src0_stride,
+                            src1_8, src1_stride,
+                            mask, MASK_MASTER_STRIDE,
+                            h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_SUPERTX
 
-static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
-                                        uint8_t *src0, int src0_stride,
-                                        uint8_t *src1, int src1_stride,
-                                        int wedge_index, int wedge_sign,
-                                        BLOCK_SIZE sb_type,
-                                        int h, int w) {
+static void build_masked_compound_wedge(
+    uint8_t *dst, int dst_stride,
+    const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int h, int w) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
                                                       sb_type);
-  vpx_blend_mask6(dst, dst_stride,
-                  src0, src0_stride,
-                  src1, src1_stride,
-                  mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
-                  h, w, subh, subw);
+  vpx_blend_a64_mask(dst, dst_stride,
+                     src0, src0_stride,
+                     src1, src1_stride,
+                     mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                     h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride,
-                                               uint8_t *src0_8, int src0_stride,
-                                               uint8_t *src1_8, int src1_stride,
-                                               int wedge_index, int wedge_sign,
-                                               BLOCK_SIZE sb_type,
-                                               int h, int w, int bd) {
+static void build_masked_compound_wedge_highbd(
+    uint8_t *dst_8, int dst_stride,
+    const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int h, int w, int bd) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
                                                       sb_type);
-  vpx_highbd_blend_mask6(dst_8, dst_stride,
-                         src0_8, src0_stride,
-                         src1_8, src1_stride,
-                         mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
-                         h, w, subh, subw, bd);
+  vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+                            src0_8, src0_stride,
+                            src1_8, src1_stride,
+                            mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                            h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -1064,177 +1067,123 @@
   28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 };
 
-static void generate_1dmask(int length, uint8_t *mask, int plane) {
+static const uint8_t* get_supertx_mask(int length, int plane) {
   switch (length) {
     case 8:
-      memcpy(mask, plane ? mask_8_uv : mask_8, length);
-      break;
+      return plane ? mask_8_uv : mask_8;
     case 16:
-      memcpy(mask, plane ? mask_16_uv : mask_16, length);
-      break;
+      return plane ? mask_16_uv : mask_16;
     case 32:
-      memcpy(mask, plane ? mask_32_uv : mask_32, length);
-      break;
+      return plane ? mask_32_uv : mask_32;
     default:
       assert(0);
   }
+  return NULL;
 }
 
 void vp10_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
-    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
     int mi_row, int mi_col,
     int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
     PARTITION_TYPE partition, int plane) {
-  int i, j;
   const struct macroblockd_plane *pd = &xd->plane[plane];
-  uint8_t mask[MAX_TX_SIZE];
-  int top_w = 4 << b_width_log2_lookup[top_bsize];
-  int top_h = 4 << b_height_log2_lookup[top_bsize];
-  int w = 4 << b_width_log2_lookup[bsize];
-  int h = 4 << b_height_log2_lookup[bsize];
-  int w_offset = (mi_col - mi_col_ori) * MI_SIZE;
-  int h_offset = (mi_row - mi_row_ori) * MI_SIZE;
+  const int ssx = pd->subsampling_x;
+  const int ssy = pd->subsampling_y;
+  const int top_w = (4 << b_width_log2_lookup[top_bsize]) >> ssx;
+  const int top_h = (4 << b_height_log2_lookup[top_bsize]) >> ssy;
+  const int w = (4 << b_width_log2_lookup[bsize]) >> ssx;
+  const int h = (4 << b_height_log2_lookup[bsize]) >> ssy;
+  const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+  const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+  int w_remain, h_remain;
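+  // w_remain/h_remain describe the part of the top block that is not
+  // blended; it is copied verbatim from 'pre' by the loop at the end of
+  // this function.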
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
-  uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
-  int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   assert(bsize <= BLOCK_32X32);
-
-  top_w >>= pd->subsampling_x;
-  top_h >>= pd->subsampling_y;
-  w >>= pd->subsampling_x;
-  h >>= pd->subsampling_y;
-  w_offset >>= pd->subsampling_x;
-  h_offset >>= pd->subsampling_y;
+  assert(IMPLIES(plane == 0, ssx == 0));
+  assert(IMPLIES(plane == 0, ssy == 0));
 
   switch (partition) {
-    case PARTITION_HORZ:
-    {
+    case PARTITION_HORZ: {
+      const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+      w_remain = top_w;
+      h_remain = top_h - h_offset - h;
+      dst += h_offset * dst_stride;
+      pre += h_offset * pre_stride;
+
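+      // Blend in place: dst serves as both the first source and the
+      // destination.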
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (b_hdb) {
-        uint16_t *dst_tmp = dst16 + h_offset * dst_stride;
-        uint16_t *dst2_tmp = dst216 + h_offset * dst2_stride;
-        generate_1dmask(h, mask + h_offset,
-                        plane && xd->plane[plane].subsampling_y);
-
-        for (i = h_offset; i < h_offset + h; i++) {
-          for (j = 0; j < top_w; j++) {
-            const int m = mask[i];  assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-
-        for (; i < top_h; i ++) {
-          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint16_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-      } else {
+      if (is_hdb)
+        vpx_highbd_blend_a64_vmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, h, top_w, xd->bd);
+      else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        uint8_t *dst_tmp = dst + h_offset * dst_stride;
-        uint8_t *dst2_tmp = dst2 + h_offset * dst2_stride;
-        generate_1dmask(h, mask + h_offset,
-                        plane && xd->plane[plane].subsampling_y);
+        vpx_blend_a64_vmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, h, top_w);
 
-        for (i = h_offset; i < h_offset + h; i++) {
-          for (j = 0; j < top_w; j++) {
-            const int m = mask[i];  assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-
-        for (; i < top_h; i ++) {
-          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint8_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
-
+      dst += h * dst_stride;
+      pre += h * pre_stride;
       break;
-    case PARTITION_VERT:
-    {
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (b_hdb) {
-        uint16_t *dst_tmp = dst16;
-        uint16_t *dst2_tmp = dst216;
-        generate_1dmask(w, mask + w_offset,
-                        plane && xd->plane[plane].subsampling_x);
-
-        for (i = 0; i < top_h; i++) {
-          for (j = w_offset; j < w_offset + w; j++) {
-            const int m = mask[j];   assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          memcpy(dst_tmp + j, dst2_tmp + j,
-                     (top_w - w_offset - w) * sizeof(uint16_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-      } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        uint8_t *dst_tmp = dst;
-        uint8_t *dst2_tmp = dst2;
-        generate_1dmask(w, mask + w_offset,
-                        plane && xd->plane[plane].subsampling_x);
-
-        for (i = 0; i < top_h; i++) {
-          for (j = w_offset; j < w_offset + w; j++) {
-            const int m = mask[j];   assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-            memcpy(dst_tmp + j, dst2_tmp + j,
-                       (top_w - w_offset - w) * sizeof(uint8_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
+    case PARTITION_VERT: {
+      const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+      w_remain = top_w - w_offset - w;
+      h_remain = top_h;
+      dst += w_offset;
+      pre += w_offset;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (is_hdb)
+        vpx_highbd_blend_a64_hmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, top_h, w, xd->bd);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        vpx_blend_a64_hmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, top_h, w);
+
+      dst += w;
+      pre += w;
       break;
-    default:
+    }
+    default: {
       assert(0);
+      return;
+    }
   }
-  (void) xd;
+
+  if (w_remain == 0 || h_remain == 0) {
+    return;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (is_hdb) {
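+    // memcpy operates on bytes: recover the underlying uint16_t addresses
+    // and scale the strides and copy width from pixels to bytes.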
+    dst = (uint8_t*)CONVERT_TO_SHORTPTR(dst);
+    pre = (const uint8_t*)CONVERT_TO_SHORTPTR(pre);
+    dst_stride *= 2;
+    pre_stride *= 2;
+    w_remain *= 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  do {
+    memcpy(dst, pre, w_remain * sizeof(uint8_t));
+    dst += dst_stride;
+    pre += pre_stride;
+  } while (--h_remain);
 }
 
 void vp10_build_inter_predictors_sb_sub8x8_extend(
@@ -1878,12 +1827,10 @@
                                BLOCK_SIZE plane_bsize,
                                uint8_t *comppred,
                                int compstride,
-                               uint8_t *interpred,
+                               const uint8_t *interpred,
                                int interstride,
-                               uint8_t *intrapred,
+                               const uint8_t *intrapred,
                                int intrastride) {
-  const int scale_bits = 8;
-  const int scale_max = (1 << scale_bits);
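+  // VPX_BLEND_A256(a, v0, v1) computes ROUND_POWER_OF_TWO(a * v0 +
+  // (256 - a) * v1, 8), i.e. exactly the scale_bits arithmetic it replaces.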
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   const int size_scale = ii_size_scales[plane_bsize];
@@ -1896,11 +1843,11 @@
                                                           bsize);
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      vpx_blend_mask6(comppred, compstride,
-                      intrapred, intrastride,
-                      interpred, interstride,
-                      mask, 4 * num_4x4_blocks_wide_lookup[bsize],
-                      bh, bw, subh, subw);
+      vpx_blend_a64_mask(comppred, compstride,
+                         intrapred, intrastride,
+                         interpred, interstride,
+                         mask, 4 * num_4x4_blocks_wide_lookup[bsize],
+                         bh, bw, subh, subw);
     }
     return;
   }
@@ -1911,10 +1858,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[i * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1924,10 +1870,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[j * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1939,10 +1884,9 @@
           int scale = (ii_weights1d[i * size_scale] * 3 +
                        ii_weights1d[j * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1954,10 +1898,9 @@
           int scale = (ii_weights1d[j * size_scale] * 3 +
                        ii_weights1d[i * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1967,10 +1910,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[(i < j ? i : j) * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1981,10 +1923,9 @@
           int scale = (ii_weights1d[i * size_scale] +
                        ii_weights1d[j * size_scale]) >> 1;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1995,10 +1936,8 @@
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  interpred[i * interstride + j] +
-                  intrapred[i * intrastride + j],
-                  1);
+              VPX_BLEND_AVG(intrapred[i * intrastride + j],
+                            interpred[i * interstride + j]);
         }
       }
       break;
@@ -2014,20 +1953,18 @@
                                       BLOCK_SIZE plane_bsize,
                                       uint8_t *comppred8,
                                       int compstride,
-                                      uint8_t *interpred8,
+                                      const uint8_t *interpred8,
                                       int interstride,
-                                      uint8_t *intrapred8,
+                                      const uint8_t *intrapred8,
                                       int intrastride, int bd) {
-  const int scale_bits = 8;
-  const int scale_max = (1 << scale_bits);
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   const int size_scale = ii_size_scales[plane_bsize];
   int i, j;
 
   uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
-  uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
-  uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+  const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+  const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
@@ -2036,11 +1973,11 @@
                                                           bsize);
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
-      vpx_highbd_blend_mask6(comppred8, compstride,
-                             intrapred8, intrastride,
-                             interpred8, interstride,
-                             mask, bw,
-                             bh, bw, subh, subw, bd);
+      vpx_highbd_blend_a64_mask(comppred8, compstride,
+                                intrapred8, intrastride,
+                                interpred8, interstride,
+                                mask, bw,
+                                bh, bw, subh, subw, bd);
     }
     return;
   }
@@ -2051,10 +1988,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[i * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2064,10 +2000,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[j * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2079,10 +2014,9 @@
           int scale = (ii_weights1d[i * size_scale] * 3 +
                        ii_weights1d[j * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2094,10 +2028,9 @@
           int scale = (ii_weights1d[j * size_scale] * 3 +
                        ii_weights1d[i * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2107,10 +2040,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[(i < j ? i : j) * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2121,10 +2053,9 @@
           int scale = (ii_weights1d[i * size_scale] +
                        ii_weights1d[j * size_scale]) >> 1;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2135,10 +2066,8 @@
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  interpred[i * interstride + j] +
-                  intrapred[i * intrastride + j],
-                  1);
+              VPX_BLEND_AVG(interpred[i * interstride + j],
+                            intrapred[i * intrastride + j]);
         }
       }
       break;
@@ -2239,8 +2168,8 @@
 
 void vp10_combine_interintra(MACROBLOCKD *xd,
                              BLOCK_SIZE bsize, int plane,
-                             uint8_t *inter_pred, int inter_stride,
-                             uint8_t *intra_pred, int intra_stride) {
+                             const uint8_t *inter_pred, int inter_stride,
+                             const uint8_t *intra_pred, int intra_stride) {
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
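
The reconinter.c hunks above replace the open-coded `ROUND_POWER_OF_TWO((scale_max - scale) * inter + scale * intra, scale_bits)` blends with the shared macros from vpx_dsp/blend.h, which is why the `scale_bits`/`scale_max` locals disappear. A minimal standalone sketch of the equivalence, with the macro bodies copied on the assumption that blend.h defines them this way:

    #include <assert.h>
    #include <stdio.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

    /* assumed to match vpx_dsp/blend.h */
    #define VPX_BLEND_A256_MAX_ALPHA 256
    #define VPX_BLEND_A256(a, v0, v1) \
      ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), 8)
    #define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)

    int main(void) {
      const int scale_bits = 8;
      const int scale_max = 1 << scale_bits;  /* the locals the patch removes */
      int scale, intra, inter;
      for (scale = 0; scale <= scale_max; ++scale)
        for (intra = 0; intra <= 255; ++intra)
          for (inter = 0; inter <= 255; inter += 51) {
            const int old_form = ROUND_POWER_OF_TWO(
                (scale_max - scale) * inter + scale * intra, scale_bits);
            assert(old_form == VPX_BLEND_A256(scale, intra, inter));
          }
      /* VPX_BLEND_AVG is symmetric, so the swapped argument order between
       * the 8-bit and high-bit-depth IIP_PRED_DC hunks is harmless */
      assert(VPX_BLEND_AVG(3, 8) == VPX_BLEND_AVG(8, 3));
      printf("blend forms agree\n");
      return 0;
    }
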
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 4ede3e9..ac4a004 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -396,7 +396,8 @@
 struct macroblockd_plane;
 void vp10_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
-    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
     int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
     BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
     PARTITION_TYPE partition, int plane);
@@ -631,8 +632,8 @@
 void vp10_combine_interintra(
     MACROBLOCKD *xd,
     BLOCK_SIZE bsize, int plane,
-    uint8_t *inter_pred, int inter_stride,
-    uint8_t *intra_pred, int intra_stride);
+    const uint8_t *inter_pred, int inter_stride,
+    const uint8_t *intra_pred, int intra_stride);
 void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
                                            uint8_t *upred,
                                            uint8_t *vpred,
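
The header hunks mirror the definitions: prediction inputs that are only read are now `const`, so callers can pass read-only buffers without casts and accidental writes become compile errors. A toy illustration of what that buys (hypothetical helper, not project code):

    #include <stdint.h>

    static int sum_pred(const uint8_t *pred, int stride, int w, int h) {
      int r, c, s = 0;
      for (r = 0; r < h; ++r)
        for (c = 0; c < w; ++c)
          s += pred[r * stride + c];
      /* pred[0] = 0;  <- would now be rejected at compile time */
      return s;
    }

    int main(void) {
      uint8_t buf[8 * 8] = { 1 };
      return sum_pred(buf, 8, 8, 8) == 1 ? 0 : 1;  /* non-const arg still OK */
    }
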
diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c
index 2026df1..5b1d921 100644
--- a/vp10/common/vp10_convolve.c
+++ b/vp10/common/vp10_convolve.c
@@ -2,6 +2,7 @@
 #include <string.h>
 
 #include "./vp10_rtcd.h"
+#include "vp10/common/vp10_convolve.h"
 #include "vp10/common/filter.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
@@ -182,7 +183,7 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
+void vp10_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
                                   uint16_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams filter_params,
                                   const int subpel_x_q4, int x_step_q4, int avg,
@@ -213,7 +214,7 @@
   }
 }
 
-static void highbd_convolve_vert(const uint16_t *src, int src_stride,
+void vp10_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
                                  uint16_t *dst, int dst_stride, int w, int h,
                                  const InterpFilterParams filter_params,
                                  const int subpel_y_q4, int y_step_q4, int avg,
@@ -300,8 +301,9 @@
     InterpFilterParams filter_params =
         vp10_get_interp_filter_params(interp_filter);
 #endif
-    highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
-                          subpel_x_q4, x_step_q4, ref_idx, bd);
+    vp10_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
+                               filter_params, subpel_x_q4, x_step_q4, ref_idx,
+                               bd);
   } else if (ignore_horiz) {
 #if CONFIG_DUAL_FILTER
     InterpFilterParams filter_params =
@@ -310,8 +312,9 @@
     InterpFilterParams filter_params =
         vp10_get_interp_filter_params(interp_filter);
 #endif
-    highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_y_q4, y_step_q4, ref_idx, bd);
+    vp10_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
+                              filter_params, subpel_y_q4, y_step_q4, ref_idx,
+                              bd);
   } else {
     // temp's size is set to (maximum possible intermediate_height) *
     // MAX_BLOCK_WIDTH
@@ -336,9 +339,10 @@
     int intermediate_height =
         (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
 
-    highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
-                          temp, temp_stride, w, intermediate_height,
-                          filter_params, subpel_x_q4, x_step_q4, 0, bd);
+    vp10_highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1),
+                               src_stride, temp, temp_stride, w,
+                               intermediate_height, filter_params, subpel_x_q4,
+                               x_step_q4, 0, bd);
 
 #if CONFIG_DUAL_FILTER
     filter_params = filter_params_y;
@@ -346,9 +350,9 @@
     filter_size = filter_params.taps;
     assert(filter_params.taps <= MAX_FILTER_TAP);
 
-    highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
-                         temp_stride, dst, dst_stride, w, h, filter_params,
-                         subpel_y_q4, y_step_q4, ref_idx, bd);
+    vp10_highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
+                              temp_stride, dst, dst_stride, w, h, filter_params,
+                              subpel_y_q4, y_step_q4, ref_idx, bd);
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
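
Renaming the static helpers to vp10_highbd_convolve_horiz_c/vp10_highbd_convolve_vert_c and calling through vp10_highbd_convolve_horiz/vert lets the RTCD layer substitute the sse4_1 versions declared in vp10_rtcd_defs.pl below. A rough sketch of the function-pointer indirection involved; the real table is generated, so the names here are illustrative only:

    #include <stdint.h>
    #include <stdio.h>

    typedef void (*highbd_convolve_fn)(const uint16_t *src, int src_stride,
                                       uint16_t *dst, int dst_stride,
                                       int w, int h);

    /* plays the role of vp10_highbd_convolve_horiz_c: always available */
    static void convolve_horiz_c(const uint16_t *src, int src_stride,
                                 uint16_t *dst, int dst_stride, int w, int h) {
      int r, c;
      for (r = 0; r < h; ++r)
        for (c = 0; c < w; ++c)
          dst[r * dst_stride + c] = src[r * src_stride + c];  /* stub kernel */
    }

    /* plays the role of the generated vp10_highbd_convolve_horiz symbol */
    static highbd_convolve_fn convolve_horiz_ptr = convolve_horiz_c;

    static void rtcd_init(int have_sse4_1, highbd_convolve_fn simd) {
      /* the generated setup code does this once, at startup */
      if (have_sse4_1 && simd) convolve_horiz_ptr = simd;
    }

    int main(void) {
      uint16_t src[4] = { 1, 2, 3, 4 }, dst[4] = { 0 };
      rtcd_init(0, NULL);  /* no SIMD detected: stays on the C fallback */
      convolve_horiz_ptr(src, 4, dst, 4, 4, 1);
      printf("%d %d %d %d\n", dst[0], dst[1], dst[2], dst[3]);
      return 0;
    }
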
diff --git a/vp10/common/vp10_fwd_txfm.c b/vp10/common/vp10_fwd_txfm.c
index 3211cd0..17935c5 100644
--- a/vp10/common/vp10_fwd_txfm.c
+++ b/vp10/common/vp10_fwd_txfm.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/vp10_fwd_txfm.h"
 
 void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
index 91a5357..cddd7dc 100644
--- a/vp10/common/vp10_fwd_txfm2d.c
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -10,6 +10,7 @@
 
 #include <assert.h>
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/enums.h"
 #include "vp10/common/vp10_fwd_txfm1d.h"
 #include "vp10/common/vp10_fwd_txfm2d_cfg.h"
diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c
index 403b209..0ebac42 100644
--- a/vp10/common/vp10_inv_txfm.c
+++ b/vp10/common/vp10_inv_txfm.c
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/vp10_inv_txfm.h"
 
 void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c
index ccf4614..85a33ba 100644
--- a/vp10/common/vp10_inv_txfm2d.c
+++ b/vp10/common/vp10_inv_txfm2d.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/enums.h"
 #include "vp10/common/vp10_txfm.h"
 #include "vp10/common/vp10_inv_txfm1d.h"
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 5a41511..1b501e2 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -93,6 +93,13 @@
 add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
 specialize qw/vp10_convolve_vert ssse3/;
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp10_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+  specialize qw/vp10_highbd_convolve_horiz sse4_1/;
+  add_proto qw/void vp10_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+  specialize qw/vp10_highbd_convolve_vert sse4_1/;
+}
+
 #
 # dct
 #
@@ -260,13 +267,19 @@
     specialize qw/vp10_fdct32x32_1/;
   } else {
     add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp10_iht4x4_16_add sse2 neon dspr2 msa/;
+    specialize qw/vp10_iht4x4_16_add sse2 neon dspr2/;
 
     add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp10_iht8x8_64_add sse2 neon dspr2 msa/;
+    specialize qw/vp10_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/vp10_iht16x16_256_add sse2 dspr2 msa/;
+    specialize qw/vp10_iht16x16_256_add sse2 dspr2/;
+
+    if (vpx_config("CONFIG_EXT_TX") ne "yes") {
+      specialize qw/vp10_iht4x4_16_add msa/;
+      specialize qw/vp10_iht8x8_64_add msa/;
+      specialize qw/vp10_iht16x16_256_add msa/;
+    }
 
     add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/vp10_fdct4x4 sse2/;
@@ -437,13 +450,19 @@
   specialize qw/vp10_fwht4x4/, "$mmx_x86inc";
 } else {
   add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_fht4x4 sse2 msa/;
+  specialize qw/vp10_fht4x4 sse2/;
 
   add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_fht8x8 sse2 msa/;
+  specialize qw/vp10_fht8x8 sse2/;
 
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_fht16x16 sse2 msa/;
+  specialize qw/vp10_fht16x16 sse2/;
+
+  if (vpx_config("CONFIG_EXT_TX") ne "yes") {
+    specialize qw/vp10_fht4x4 msa/;
+    specialize qw/vp10_fht8x8 msa/;
+    specialize qw/vp10_fht16x16 msa/;
+  }
 
   add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht32x32/;
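
The perl hunks stop advertising the MSA assembly for the hybrid transforms when CONFIG_EXT_TX is enabled, presumably because those kernels only implement the four legacy tx types; with the extended set, every tx_type must take the C path. A hypothetical C view of the dispatch this produces (not the generated code):

    #include <stdio.h>

    typedef void (*iht_fn)(const short *in, unsigned char *dst, int stride,
                           int tx_type);

    static void iht4x4_c(const short *in, unsigned char *dst, int stride,
                         int tx_type) {
      (void)in; (void)stride;
      dst[0] = (unsigned char)tx_type;  /* stub */
    }

    /* with CONFIG_EXT_TX, tx_type can exceed what the MSA code handles,
     * so rtcd keeps only the C version; otherwise MSA may be substituted */
    static iht_fn pick_iht(int config_ext_tx, iht_fn msa_version) {
      if (config_ext_tx || msa_version == NULL) return iht4x4_c;
      return msa_version;
    }

    int main(void) {
      iht_fn f = pick_iht(1, NULL);  /* EXT_TX build: always the C kernel */
      unsigned char dst[1];
      f(NULL, dst, 0, 3);
      printf("%d\n", dst[0]);
      return f == iht4x4_c ? 0 : 1;
    }
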
diff --git a/vp10/common/warped_motion.c b/vp10/common/warped_motion.c
index 4990bb3..3b924ea 100644
--- a/vp10/common/warped_motion.c
+++ b/vp10/common/warped_motion.c
@@ -85,19 +85,19 @@
   for (i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((x << (WARPEDMODEL_PREC_BITS + 1)) + mat[0]),
           WARPEDPIXEL_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((x << WARPEDMODEL_PREC_BITS)) + mat[0],
           WARPEDPIXEL_PREC_BITS);
     if (subsampling_y)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((y << (WARPEDMODEL_PREC_BITS + 1)) + mat[1]),
           WARPEDPIXEL_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((y << WARPEDMODEL_PREC_BITS)) + mat[1],
           WARPEDPIXEL_PREC_BITS);
     points += stride_points - 2;
@@ -115,21 +115,21 @@
   for (i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           mat[0] * 2 * x + mat[1] * 2 * y + mat[2] +
           (mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
+                                            WARPEDDIFF_PREC_BITS);
     if (subsampling_y)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           -mat[1] * 2 * x + mat[0] * 2 * y + mat[3] +
           (-mat[1] + mat[0] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
+                                            WARPEDDIFF_PREC_BITS);
     points += stride_points - 2;
     proj += stride_proj - 2;
   }
@@ -145,21 +145,21 @@
   for (i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           mat[0] * 2 * x + mat[1] * 2 * y + mat[4] +
           (mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
+                                            WARPEDDIFF_PREC_BITS);
     if (subsampling_y)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           mat[2] * 2 * x + mat[3] * 2 * y + mat[5] +
           (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
+                                            WARPEDDIFF_PREC_BITS);
     points += stride_points - 2;
     proj += stride_proj - 2;
   }
@@ -357,7 +357,7 @@
     const int64_t v2 = x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
     const int64_t v3 = x * (p[1] - p[-1]);
     const int64_t v4 = 2 * p[0];
-    return (int32_t)ROUNDZ_POWER_OF_TWO_SIGNED(
+    return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
         (v4 << (3 * WARPEDPIXEL_PREC_BITS)) +
         (v3 << (2 * WARPEDPIXEL_PREC_BITS)) +
         (v2 << WARPEDPIXEL_PREC_BITS) + v1,
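
warped_motion.c switches every call site from ROUNDZ_POWER_OF_TWO_SIGNED to ROUND_POWER_OF_TWO_SIGNED. A small check of what the surviving macro computes, assuming the usual libvpx definition (round half away from zero, symmetric for negative inputs; the plain-shift results below assume the common arithmetic-right-shift behavior):

    #include <assert.h>
    #include <stdio.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))
    /* assumed definition of the signed variant */
    #define ROUND_POWER_OF_TWO_SIGNED(value, n)           \
      (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                     : ROUND_POWER_OF_TWO((value), (n)))

    int main(void) {
      assert(ROUND_POWER_OF_TWO_SIGNED(5, 1) == 3);   /*  2.5 ->  3 */
      assert(ROUND_POWER_OF_TWO_SIGNED(-5, 1) == -3); /* -2.5 -> -3 */
      /* the unsigned form applied to a negative value rounds differently: */
      assert(ROUND_POWER_OF_TWO_SIGNED(-6, 2) == -2); /* -1.5 -> -2 */
      assert(ROUND_POWER_OF_TWO(-6, 2) == -1);        /* -1.5 -> -1 */
      printf("signed rounding checks pass\n");
      return 0;
    }
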
diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c
index d8b6d95..da60764 100644
--- a/vp10/common/x86/idct_intrin_sse2.c
+++ b/vp10/common/x86/idct_intrin_sse2.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp10_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
diff --git a/vp10/common/x86/vp10_convolve_filters_ssse3.c b/vp10/common/x86/vp10_convolve_filters_ssse3.c
index 410da89..2f7b3c7 100644
--- a/vp10/common/x86/vp10_convolve_filters_ssse3.c
+++ b/vp10/common/x86/vp10_convolve_filters_ssse3.c
@@ -7,626 +7,936 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-
 #include "./vpx_config.h"
 #include "vp10/common/filter.h"
 
-// Note:
-//  Filter coefficients are from "filter.c".  We use,
-//  sub_pel_filters_temporalfilter_12[],
-//  sub_pel_filters_12sharp[],
-//  sub_pel_filters_10sharp[].
-
-// (2-1) Parallel filtering along the intended signal direction
-
-// 12-tap filter padding:
-// {filter_coefficients, 0, 0, 0, 0},
-// {0, 0, filter_coefficients, 0, 0),
-#if USE_TEMPORALFILTER_12TAP
+#if CONFIG_EXT_INTERP
 DECLARE_ALIGNED(16, const int8_t,
-                sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]) = {
+                sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
   {
-    {0,   1,  -1,   3,  -7, 127,   8,  -4,   2,  -1,   0, 0, 0, 0, 0, 0},
-    {0, 0, 0,   1,  -1,   3,  -7, 127,   8,  -4,   2,  -1,   0, 0, 0, 0},
+    {  0,   0,  -1,   3,  -6, 127,   8,  -4,
+       2,  -1,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -1,   3,  -6, 127,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
   },
   {
-    {0,   1,  -3,   5, -12, 124,  18,  -8,   4,  -2,   1, 0, 0, 0, 0, 0},
-    {0, 0, 0,   1,  -3,   5, -12, 124,  18,  -8,   4,  -2,   1, 0, 0, 0},
+    {  0,   1,  -2,   5, -12, 124,  18,  -7,
+       3,  -2,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   5, -12, 124,
+      18,  -7,   3,  -2,   0,   0,   0,   0, },
   },
   {
-    {-1,   2,  -4,   8, -17, 120,  28, -11,   6,  -3,   1, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -4,   8, -17, 120,  28, -11,   6,  -3,   1, -1, 0, 0},
+    {  0,   1,  -3,   7, -17, 119,  28, -11,
+       5,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -3,   7, -17, 119,
+      28, -11,   5,  -2,   1,   0,   0,   0, },
   },
   {
-    {-1,   2,  -4,  10, -21, 114,  38, -15,   8,  -4,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -4,  10, -21, 114,  38, -15,   8,  -4,   2, -1, 0, 0},
+    {  0,   1,  -4,   8, -20, 114,  38, -14,
+       7,  -3,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -4,   8, -20, 114,
+      38, -14,   7,  -3,   1,   0,   0,   0, },
   },
   {
-    {-1,   3,  -5,  11, -23, 107,  49, -18,   9,  -5,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -5,  11, -23, 107,  49, -18,   9,  -5,   2, -1, 0, 0},
+    {  0,   1,  -4,   9, -22, 107,  49, -17,
+       8,  -4,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -4,   9, -22, 107,
+      49, -17,   8,  -4,   1,   0,   0,   0, },
   },
   {
-    {-1,   3,  -6,  12, -25,  99,  60, -21,  11,  -6,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -6,  12, -25,  99,  60, -21,  11,  -6,   3, -1, 0, 0},
+    {  0,   2,  -5,  10, -24,  99,  59, -20,
+       9,  -4,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -24,  99,
+      59, -20,   9,  -4,   2,   0,   0,   0, },
   },
   {
-    {-1,   3,  -6,  12, -25,  90,  70, -23,  12,  -6,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -6,  12, -25,  90,  70, -23,  12,  -6,   3, -1, 0, 0},
+    {  0,   2,  -5,  10, -24,  90,  70, -22,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -24,  90,
+      70, -22,  10,  -5,   2,   0,   0,   0, },
   },
   {
-    {-1,   3,  -6,  12, -24,  80,  80, -24,  12,  -6,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -6,  12, -24,  80,  80, -24,  12,  -6,   3, -1, 0, 0},
+    {  0,   2,  -5,  10, -23,  80,  80, -23,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -23,  80,
+      80, -23,  10,  -5,   2,   0,   0,   0, },
   },
   {
-    {-1,   3,  -6,  12, -23,  70,  90, -25,  12,  -6,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -6,  12, -23,  70,  90, -25,  12,  -6,   3, -1, 0, 0},
+    {  0,   2,  -5,  10, -22,  70,  90, -24,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -22,  70,
+      90, -24,  10,  -5,   2,   0,   0,   0, },
   },
   {
-    {-1,   3,  -6,  11, -21,  60,  99, -25,  12,  -6,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -6,  11, -21,  60,  99, -25,  12,  -6,   3, -1, 0, 0},
+    {  0,   2,  -4,   9, -20,  59,  99, -24,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -4,   9, -20,  59,
+      99, -24,  10,  -5,   2,   0,   0,   0, },
   },
   {
-    {-1,   2,  -5,   9, -18,  49, 107, -23,  11,  -5,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -5,   9, -18,  49, 107, -23,  11,  -5,   3, -1, 0, 0},
+    {  0,   1,  -4,   8, -17,  49, 107, -22,
+       9,  -4,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -4,   8, -17,  49,
+     107, -22,   9,  -4,   1,   0,   0,   0, },
   },
   {
-    {-1,   2,  -4,   8, -15,  38, 114, -21,  10,  -4,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -4,   8, -15,  38, 114, -21,  10,  -4,   2, -1, 0, 0},
+    {  0,   1,  -3,   7, -14,  38, 114, -20,
+       8,  -4,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -3,   7, -14,  38,
+     114, -20,   8,  -4,   1,   0,   0,   0, },
   },
   {
-    {-1,   1,  -3,   6, -11,  28, 120, -17,   8,  -4,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   1,  -3,   6, -11,  28, 120, -17,   8,  -4,   2, -1, 0, 0},
+    {  0,   1,  -2,   5, -11,  28, 119, -17,
+       7,  -3,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   5, -11,  28,
+     119, -17,   7,  -3,   1,   0,   0,   0, },
   },
   {
-    {0,   1,  -2,   4,  -8,  18, 124, -12,   5,  -3,   1, 0, 0, 0, 0, 0},
-    {0, 0, 0,   1,  -2,   4,  -8,  18, 124, -12,   5,  -3,   1, 0, 0, 0},
+    {  0,   0,  -2,   3,  -7,  18, 124, -12,
+       5,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -2,   3,  -7,  18,
+     124, -12,   5,  -2,   1,   0,   0,   0, },
   },
   {
-    {0,   0,  -1,   2,  -4,   8, 127,  -7,   3,  -1,   1, 0, 0, 0, 0, 0},
-    {0, 0, 0,   0,  -1,   2,  -4,   8, 127,  -7,   3,  -1,   1, 0, 0, 0},
+    {  0,   0,  -1,   2,  -4,   8, 127,  -6,
+       3,  -1,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -1,   2,  -4,   8,
+     127,  -6,   3,  -1,   0,   0,   0,   0, },
   },
 };
-#endif  // USE_TEMPORALFILTER_12TAP
-
+#endif
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_10sharp_ver_signal_dir[15][6][16]) = {
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6, 127,  -6, 127,  -6, 127,  -6, 127,
+      -6, 127,  -6, 127,  -6, 127,  -6, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5,
+      -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124,
+     -12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -7,  18,  -7,  18,  -7,  18,  -7,
+      18,  -7,  18,  -7,  18,  -7,  18,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2,
+       3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7,
+      -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-17, 119, -17, 119, -17, 119, -17, 119,
+     -17, 119, -17, 119, -17, 119, -17, 119, },
+    { 28, -11,  28, -11,  28, -11,  28, -11,
+      28, -11,  28, -11,  28, -11,  28, -11, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2,
+       5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-20, 114, -20, 114, -20, 114, -20, 114,
+     -20, 114, -20, 114, -20, 114, -20, 114, },
+    { 38, -14,  38, -14,  38, -14,  38, -14,
+      38, -14,  38, -14,  38, -14,  38, -14, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3,
+       7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9,
+      -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-22, 107, -22, 107, -22, 107, -22, 107,
+     -22, 107, -22, 107, -22, 107, -22, 107, },
+    { 49, -17,  49, -17,  49, -17,  49, -17,
+      49, -17,  49, -17,  49, -17,  49, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  99, -24,  99, -24,  99, -24,  99,
+     -24,  99, -24,  99, -24,  99, -24,  99, },
+    { 59, -20,  59, -20,  59, -20,  59, -20,
+      59, -20,  59, -20,  59, -20,  59, -20, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4,
+       9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  90, -24,  90, -24,  90, -24,  90,
+     -24,  90, -24,  90, -24,  90, -24,  90, },
+    { 70, -22,  70, -22,  70, -22,  70, -22,
+      70, -22,  70, -22,  70, -22,  70, -22, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-23,  80, -23,  80, -23,  80, -23,  80,
+     -23,  80, -23,  80, -23,  80, -23,  80, },
+    { 80, -23,  80, -23,  80, -23,  80, -23,
+      80, -23,  80, -23,  80, -23,  80, -23, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-22,  70, -22,  70, -22,  70, -22,  70,
+     -22,  70, -22,  70, -22,  70, -22,  70, },
+    { 90, -24,  90, -24,  90, -24,  90, -24,
+      90, -24,  90, -24,  90, -24,  90, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9,
+      -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-20,  59, -20,  59, -20,  59, -20,  59,
+     -20,  59, -20,  59, -20,  59, -20,  59, },
+    { 99, -24,  99, -24,  99, -24,  99, -24,
+      99, -24,  99, -24,  99, -24,  99, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17,  49, -17,  49, -17,  49, -17,  49,
+     -17,  49, -17,  49, -17,  49, -17,  49, },
+    {107, -22, 107, -22, 107, -22, 107, -22,
+     107, -22, 107, -22, 107, -22, 107, -22, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4,
+       9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7,
+      -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-14,  38, -14,  38, -14,  38, -14,  38,
+     -14,  38, -14,  38, -14,  38, -14,  38, },
+    {114, -20, 114, -20, 114, -20, 114, -20,
+     114, -20, 114, -20, 114, -20, 114, -20, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5,
+      -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-11,  28, -11,  28, -11,  28, -11,  28,
+     -11,  28, -11,  28, -11,  28, -11,  28, },
+    {119, -17, 119, -17, 119, -17, 119, -17,
+     119, -17, 119, -17, 119, -17, 119, -17, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3,
+       7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3,
+      -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7,  18,  -7,  18,  -7,  18,  -7,  18,
+      -7,  18,  -7,  18,  -7,  18,  -7,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12,
+     124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2,
+       5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -6, 127,  -6, 127,  -6, 127,  -6,
+     127,  -6, 127,  -6, 127,  -6, 127,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+};
+#endif
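
The rewritten tables store the same coefficients in two SIMD-friendly shapes: the [15][2][16] "signal direction" form holds the filter twice per entry, the second copy shifted so two neighbouring output pixels can share one aligned 16-byte load, while the [15][6][16] "ver" form replicates each pair of adjacent taps eight times per row so a pairwise multiply-add (pmaddubsw-style) covers eight pixels at once. A sketch deriving a "ver" entry from the padded 10-tap filter above (hypothetical helper, not project code):

    #include <stdint.h>
    #include <stdio.h>

    /* taps (2k, 2k+1) repeated eight times fill row k of a [6][16] entry */
    static void build_ver_rows(const int8_t taps[12], int8_t rows[6][16]) {
      int k, i;
      for (k = 0; k < 6; ++k)
        for (i = 0; i < 8; ++i) {
          rows[k][2 * i] = taps[2 * k];
          rows[k][2 * i + 1] = taps[2 * k + 1];
        }
    }

    int main(void) {
      /* first sub_pel_filters_10sharp entry, padded to 12 taps as above */
      const int8_t taps[12] = { 0, 0, -1, 3, -6, 127, 8, -4, 2, -1, 0, 0 };
      int8_t rows[6][16];
      int k, i;
      build_ver_rows(taps, rows);
      /* prints the first sub_pel_filters_10sharp_ver_signal_dir entry */
      for (k = 0; k < 6; ++k) {
        for (i = 0; i < 16; ++i) printf("%4d,", rows[k][i]);
        printf("\n");
      }
      return 0;
    }
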
 #if CONFIG_EXT_INTERP
 DECLARE_ALIGNED(16, const int8_t,
                 sub_pel_filters_12sharp_signal_dir[15][2][16]) = {
   {
-    {0,   1,  -2,   3,  -7, 127,   8,  -4,   2,  -1,   1, 0, 0, 0, 0, 0},
-    {0, 0, 0,   1,  -2,   3,  -7, 127,   8,  -4,   2,  -1,   1, 0, 0, 0},
+    {  0,   1,  -2,   3,  -7, 127,   8,  -4,
+       2,  -1,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   3,  -7, 127,
+       8,  -4,   2,  -1,   1,   0,   0,   0, },
   },
   {
-    {-1,   2,  -3,   6, -13, 124,  18,  -8,   4,  -2,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -3,   6, -13, 124,  18,  -8,   4,  -2,   2, -1, 0, 0},
+    { -1,   2,  -3,   6, -13, 124,  18,  -8,
+       4,  -2,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -3,   6, -13, 124,
+      18,  -8,   4,  -2,   2,  -1,   0,   0, },
   },
   {
-    {-1,   3,  -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1, 0, 0},
+    { -1,   3,  -4,   8, -18, 120,  28, -12,
+       7,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -4,   8, -18, 120,
+      28, -12,   7,  -4,   2,  -1,   0,   0, },
   },
   {
-    {-1,   3,  -6,  10, -21, 115,  38, -15,   8,  -5,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -6,  10, -21, 115,  38, -15,   8,  -5,   3, -1, 0, 0},
+    { -1,   3,  -6,  10, -21, 115,  38, -15,
+       8,  -5,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  10, -21, 115,
+      38, -15,   8,  -5,   3,  -1,   0,   0, },
   },
   {
-    {-2,   4,  -6,  12, -24, 108,  49, -18,  10,  -6,   3, -2, 0, 0, 0, 0},
-    {0, 0, -2,   4,  -6,  12, -24, 108,  49, -18,  10,  -6,   3, -2, 0, 0},
+    { -2,   4,  -6,  12, -24, 108,  49, -18,
+      10,  -6,   3,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -6,  12, -24, 108,
+      49, -18,  10,  -6,   3,  -2,   0,   0, },
   },
   {
-    {-2,   4,  -7,  13, -25, 100,  60, -21,  11,  -7,   4, -2, 0, 0, 0, 0},
-    {0, 0, -2,   4,  -7,  13, -25, 100,  60, -21,  11,  -7,   4, -2, 0, 0},
+    { -2,   4,  -7,  13, -25, 100,  60, -21,
+      11,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -25, 100,
+      60, -21,  11,  -7,   4,  -2,   0,   0, },
   },
   {
-    {-2,   4,  -7,  13, -26,  91,  71, -24,  13,  -7,   4, -2, 0, 0, 0, 0},
-    {0, 0, -2,   4,  -7,  13, -26,  91,  71, -24,  13,  -7,   4, -2, 0, 0},
+    { -2,   4,  -7,  13, -26,  91,  71, -24,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -26,  91,
+      71, -24,  13,  -7,   4,  -2,   0,   0, },
   },
   {
-    {-2,   4,  -7,  13, -25,  81,  81, -25,  13,  -7,   4, -2, 0, 0, 0, 0},
-    {0, 0, -2,   4,  -7,  13, -25,  81,  81, -25,  13,  -7,   4, -2, 0, 0},
+    { -2,   4,  -7,  13, -25,  81,  81, -25,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -25,  81,
+      81, -25,  13,  -7,   4,  -2,   0,   0, },
   },
   {
-    {-2,   4,  -7,  13, -24,  71,  91, -26,  13,  -7,   4, -2, 0, 0, 0, 0},
-    {0, 0, -2,   4,  -7,  13, -24,  71,  91, -26,  13,  -7,   4, -2, 0, 0},
+    { -2,   4,  -7,  13, -24,  71,  91, -26,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -24,  71,
+      91, -26,  13,  -7,   4,  -2,   0,   0, },
   },
   {
-    {-2,   4,  -7,  11, -21,  60, 100, -25,  13,  -7,   4, -2, 0, 0, 0, 0},
-    {0, 0, -2,   4,  -7,  11, -21,  60, 100, -25,  13,  -7,   4, -2, 0, 0},
+    { -2,   4,  -7,  11, -21,  60, 100, -25,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  11, -21,  60,
+     100, -25,  13,  -7,   4,  -2,   0,   0, },
   },
   {
-    {-2,   3,  -6,  10, -18,  49, 108, -24,  12,  -6,   4, -2, 0, 0, 0, 0},
-    {0, 0, -2,   3,  -6,  10, -18,  49, 108, -24,  12,  -6,   4, -2, 0, 0},
+    { -2,   3,  -6,  10, -18,  49, 108, -24,
+      12,  -6,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   3,  -6,  10, -18,  49,
+     108, -24,  12,  -6,   4,  -2,   0,   0, },
   },
   {
-    {-1,   3,  -5,   8, -15,  38, 115, -21,  10,  -6,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   3,  -5,   8, -15,  38, 115, -21,  10,  -6,   3, -1, 0, 0},
+    { -1,   3,  -5,   8, -15,  38, 115, -21,
+      10,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -5,   8, -15,  38,
+     115, -21,  10,  -6,   3,  -1,   0,   0, },
   },
   {
-    {-1,   2,  -4,   7, -12,  28, 120, -18,   8,  -4,   3, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -4,   7, -12,  28, 120, -18,   8,  -4,   3, -1, 0, 0},
+    { -1,   2,  -4,   7, -12,  28, 120, -18,
+       8,  -4,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,   7, -12,  28,
+     120, -18,   8,  -4,   3,  -1,   0,   0, },
   },
   {
-    {-1,   2,  -2,   4,  -8,  18, 124, -13,   6,  -3,   2, -1, 0, 0, 0, 0},
-    {0, 0, -1,   2,  -2,   4,  -8,  18, 124, -13,   6,  -3,   2, -1, 0, 0},
+    { -1,   2,  -2,   4,  -8,  18, 124, -13,
+       6,  -3,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -2,   4,  -8,  18,
+     124, -13,   6,  -3,   2,  -1,   0,   0, },
   },
   {
-    {0,   1,  -1,   2,  -4,   8, 127,  -7,   3,  -2,   1, 0, 0, 0, 0, 0},
-    {0, 0, 0,   1,  -1,   2,  -4,   8, 127,  -7,   3,  -2,   1, 0, 0, 0},
+    {  0,   1,  -1,   2,  -4,   8, 127,  -7,
+       3,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -1,   2,  -4,   8,
+     127,  -7,   3,  -2,   1,   0,   0,   0, },
   },
 };
-
-// 10-tap filter padding:
-// {0, filter_coefficients, 0, 0, 0, 0, 0},
-// {0, 0, 0, filter_coefficients, 0, 0, 0),
-DECLARE_ALIGNED(16, const int8_t,
-                sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
-  {
-    {0, 0, -1, 3,  -6, 127,  8,  -4,  2,  -1, 0,   0, 0, 0, 0, 0},
-    {0, 0,  0, 0,  -1,   3, -6, 127,  8,  -4, 2,  -1, 0, 0, 0, 0},
-  },
-  {
-    {0, 1, -2, 5, -12, 124,  18,  -7,   3, -2, 0,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -2,   5, -12, 124,  18, -7, 3, -2, 0, 0, 0, 0},
-  },
-  {
-    {0, 1, -3, 7, -17, 119,  28, -11,  5,  -2, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -3,   7, -17, 119, 28, -11, 5, -2, 1, 0, 0, 0},
-  },
-  {
-    {0, 1, -4, 8, -20, 114,  38, -14,  7,  -3, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -4,   8, -20, 114, 38, -14, 7, -3, 1, 0, 0, 0},
-  },
-  {
-    {0, 1, -4, 9, -22, 107,  49, -17,  8,  -4, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -4,   9, -22, 107, 49, -17, 8, -4, 1, 0, 0, 0},
-  },
-  {
-    {0, 2, -5, 10, -24, 99,  59, -20,  9,  -4, 2,  0, 0, 0, 0, 0},
-    {0, 0,  0,  2,  -5, 10, -24,  99, 59, -20, 9, -4, 2, 0, 0, 0},
-  },
-  {
-    {0, 2, -5, 10, -24, 90,  70, -22, 10,  -5,  2,  0, 0, 0, 0, 0},
-    {0, 0,  0,  2,  -5, 10, -24,  90, 70, -22, 10, -5, 2, 0, 0, 0},
-  },
-  {
-    {0, 2, -5, 10, -23, 80,  80, -23, 10,  -5,  2,  0, 0, 0, 0, 0},
-    {0, 0,  0,  2,  -5, 10, -23,  80, 80, -23, 10, -5, 2, 0, 0, 0},
-  },
-  {
-    {0, 2, -5, 10, -22, 70,  90, -24, 10,  -5,  2,  0, 0, 0, 0, 0},
-    {0, 0,  0,  2,  -5, 10, -22,  70, 90, -24, 10, -5, 2, 0, 0, 0},
-  },
-  {
-    {0, 2, -4, 9, -20, 59,  99, -24,  10,  -5,  2,  0, 0, 0, 0, 0},
-    {0, 0,  0, 2,  -4,  9, -20,  59,  99, -24, 10, -5, 2, 0, 0, 0},
-  },
-  {
-    {0, 1, -4, 8, -17,  49, 107, -22,   9,  -4, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -4,   8, -17,  49, 107, -22, 9, -4, 1, 0, 0, 0},
-  },
-  {
-    {0, 1, -3, 7, -14, 38, 114, -20,   8,  -4, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -3,  7, -14,  38, 114, -20, 8, -4, 1, 0, 0, 0},
-  },
-  {
-    {0, 1, -2, 5, -11, 28, 119, -17,   7,  -3, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 1,  -2,  5, -11,  28, 119, -17, 7, -3, 1, 0, 0, 0},
-  },
-  {
-    {0, 0, -2, 3,  -7, 18, 124, -12,   5,  -2, 1,  0, 0, 0, 0, 0},
-    {0, 0,  0, 0,  -2,  3,  -7,  18, 124, -12, 5, -2, 1, 0, 0, 0},
-  },
-  {
-    {0, 0, -1, 2,  -4,  8, 127, -6,   3,   -1, 0,  0, 0, 0, 0, 0},
-    {0, 0,  0, 0,  -1,  2,  -4,  8, 127,   -6, 3, -1, 0, 0, 0, 0},
-  },
-};
-#endif  // CONFIG_EXT_INTERP
-
-// (2-2) Parallel filtering vertically to signal direction
-#if USE_TEMPORALFILTER_12TAP
-DECLARE_ALIGNED(16, const int8_t,
-                sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]) = {
-  {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
-     -7, 127, -7, 127},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-  },
-  {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5},
-    {-12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
-     -12, 124, -12, 124},
-    {18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
-  },
-  {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {-17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120,
-     -17, 120, -17, 120},
-    {28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11},
-    {6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3},
-    {1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1},
-  },
-  {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10},
-    {-21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114,
-     -21, 114, -21, 114},
-    {38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
-     38, -15, 38, -15},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-  },
-  {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11},
-    {-23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107,
-     -23, 107, -23, 107},
-    {49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18},
-    {9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-  },
-  {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
-    {-25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99},
-    {60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21},
-    {11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-  },
-  {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
-    {-25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90},
-    {70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23},
-    {12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-  },
-  {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
-    {-24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80},
-    {80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24, 80, -24},
-    {12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-  },
-  {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
-    {-23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70, -23, 70},
-    {90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25, 90, -25},
-    {12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-  },
-  {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11, -6, 11},
-    {-21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60},
-    {99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25, 99, -25},
-    {12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-  },
-  {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9, -5, 9},
-    {-18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49},
-    {107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107, -23, 107,
-     -23, 107, -23},
-    {11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5, 11, -5},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-  },
-  {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {-15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38},
-    {114, -21, 114, -21, 114, -21, 114, -21, 114, -21, 114, -21,
-     114, -21, 114, -21},
-    {10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4, 10, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-  },
-  {
-    {-1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1, -1, 1},
-    {-3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6},
-    {-11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28},
-    {120, -17, 120, -17, 120, -17, 120, -17, 120, -17, 120, -17,
-     120, -17, 120, -17},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-  },
-  {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18},
-    {124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
-     124, -12, 124, -12},
-    {5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3, 5, -3},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
-  },
-  {
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
-     127, -7, 127, -7},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
-  },
-};
-#endif  // USE_TEMPORALFILTER_12TAP
-
+#endif
 #if CONFIG_EXT_INTERP
 DECLARE_ALIGNED(16, const int8_t,
                 sub_pel_filters_12sharp_ver_signal_dir[15][6][16]) = {
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3},
-    {-7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127,
-     -7, 127, -7, 127},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3,
+      -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127,
+      -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
   },
   {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6},
-    {-13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124,
-     -13, 124, -13, 124},
-    {18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6,
+      -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-13, 124, -13, 124, -13, 124, -13, 124,
+     -13, 124, -13, 124, -13, 124, -13, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8,
+      18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
   },
   {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {-18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120,
-     -18, 120, -18, 120},
-    {28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12},
-    {7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-18, 120, -18, 120, -18, 120, -18, 120,
+     -18, 120, -18, 120, -18, 120, -18, 120, },
+    { 28, -12,  28, -12,  28, -12,  28, -12,
+      28, -12,  28, -12,  28, -12,  28, -12, },
+    {  7,  -4,   7,  -4,   7,  -4,   7,  -4,
+       7,  -4,   7,  -4,   7,  -4,   7,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
   },
   {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10},
-    {-21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115,
-     -21, 115, -21, 115},
-    {38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15,
-     38, -15, 38, -15},
-    {8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10,
+      -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-21, 115, -21, 115, -21, 115, -21, 115,
+     -21, 115, -21, 115, -21, 115, -21, 115, },
+    { 38, -15,  38, -15,  38, -15,  38, -15,
+      38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -5,   8,  -5,   8,  -5,   8,  -5,
+       8,  -5,   8,  -5,   8,  -5,   8,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
   },
   {
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12},
-    {-24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108,
-     -24, 108, -24, 108},
-    {49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18},
-    {10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6},
-    {3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2},
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24, 108, -24, 108, -24, 108, -24, 108,
+     -24, 108, -24, 108, -24, 108, -24, 108, },
+    { 49, -18,  49, -18,  49, -18,  49, -18,
+      49, -18,  49, -18,  49, -18,  49, -18, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6,
+      10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2,
+       3,  -2,   3,  -2,   3,  -2,   3,  -2, },
   },
   {
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
-    {-25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100,
-     -25, 100, -25, 100},
-    {60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21},
-    {11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25, 100, -25, 100, -25, 100, -25, 100,
+     -25, 100, -25, 100, -25, 100, -25, 100, },
+    { 60, -21,  60, -21,  60, -21,  60, -21,
+      60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -7,  11,  -7,  11,  -7,  11,  -7,
+      11,  -7,  11,  -7,  11,  -7,  11,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
   },
   {
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
-    {-26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91},
-    {71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24},
-    {13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-26,  91, -26,  91, -26,  91, -26,  91,
+     -26,  91, -26,  91, -26,  91, -26,  91, },
+    { 71, -24,  71, -24,  71, -24,  71, -24,
+      71, -24,  71, -24,  71, -24,  71, -24, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
   },
   {
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
-    {-25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81},
-    {81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25, 81, -25},
-    {13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25,  81, -25,  81, -25,  81, -25,  81,
+     -25,  81, -25,  81, -25,  81, -25,  81, },
+    { 81, -25,  81, -25,  81, -25,  81, -25,
+      81, -25,  81, -25,  81, -25,  81, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
   },
   {
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13},
-    {-24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71, -24, 71},
-    {91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26, 91, -26},
-    {13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-24,  71, -24,  71, -24,  71, -24,  71,
+     -24,  71, -24,  71, -24,  71, -24,  71, },
+    { 91, -26,  91, -26,  91, -26,  91, -26,
+      91, -26,  91, -26,  91, -26,  91, -26, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
   },
   {
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11, -7, 11},
-    {-21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60, -21, 60},
-    {100, -25, 100, -25, 100, -25, 100, -25, 100, -25, 100, -25,
-     100, -25, 100, -25},
-    {13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7, 13, -7},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  11,  -7,  11,  -7,  11,  -7,  11,
+      -7,  11,  -7,  11,  -7,  11,  -7,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60,
+     -21,  60, -21,  60, -21,  60, -21,  60, },
+    {100, -25, 100, -25, 100, -25, 100, -25,
+     100, -25, 100, -25, 100, -25, 100, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
   },
   {
-    {-2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3},
-    {-6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10},
-    {-18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49, -18, 49},
-    {108, -24, 108, -24, 108, -24, 108, -24, 108, -24, 108, -24,
-     108, -24, 108, -24},
-    {12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6, 12, -6},
-    {4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2},
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3,
+      -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10,
+      -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-18,  49, -18,  49, -18,  49, -18,  49,
+     -18,  49, -18,  49, -18,  49, -18,  49, },
+    {108, -24, 108, -24, 108, -24, 108, -24,
+     108, -24, 108, -24, 108, -24, 108, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
   },
   {
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8, -5, 8},
-    {-15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38, -15, 38},
-    {115, -21, 115, -21, 115, -21, 115, -21, 115, -21, 115, -21,
-     115, -21, 115, -21},
-    {10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6, 10, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,   8,  -5,   8,  -5,   8,  -5,   8,
+      -5,   8,  -5,   8,  -5,   8,  -5,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38,
+     -15,  38, -15,  38, -15,  38, -15,  38, },
+    {115, -21, 115, -21, 115, -21, 115, -21,
+     115, -21, 115, -21, 115, -21, 115, -21, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6,
+      10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
   },
   {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7, -4, 7},
-    {-12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28, -12, 28},
-    {120, -18, 120, -18, 120, -18, 120, -18, 120, -18, 120, -18,
-     120, -18, 120, -18},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   7,  -4,   7,  -4,   7,  -4,   7,
+      -4,   7,  -4,   7,  -4,   7,  -4,   7, },
+    {-12,  28, -12,  28, -12,  28, -12,  28,
+     -12,  28, -12,  28, -12,  28, -12,  28, },
+    {120, -18, 120, -18, 120, -18, 120, -18,
+     120, -18, 120, -18, 120, -18, 120, -18, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
   },
   {
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4, -2, 4},
-    {-8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18, -8, 18},
-    {124, -13, 124, -13, 124, -13, 124, -13, 124, -13, 124, -13,
-     124, -13, 124, -13},
-    {6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3, 6, -3},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18,
+      -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -13, 124, -13, 124, -13, 124, -13,
+     124, -13, 124, -13, 124, -13, 124, -13, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3,
+       6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {127, -7, 127, -7, 127, -7, 127, -7, 127, -7, 127, -7,
-     127, -7, 127, -7},
-    {3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7,
+     127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2,
+       3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
   },
 };
-
+#endif  // CONFIG_EXT_INTERP
+#if USE_TEMPORALFILTER_12TAP
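+// Layout note (inferred from the table below): each of the 15 sub-pel
+// phases stores the 12-tap temporal filter kernel twice, zero-padded to
+// 16 entries, once starting at entry 0 and once shifted right by two
+// taps, presumably so the horizontal path can pick either alignment with
+// a single aligned 16-byte load.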
 DECLARE_ALIGNED(16, const int8_t,
-                sub_pel_filters_10sharp_ver_signal_dir[15][6][16]) = {
+                sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]) = {
   {
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3},
-    {-6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {  0,   1,  -1,   3,  -7, 127,   8,  -4,
+       2,  -1,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -1,   3,  -7, 127,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5},
-    {-12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124,
-     -12, 124, -12, 124},
-    {18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7},
-    {3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {  0,   1,  -3,   5, -12, 124,  18,  -8,
+       4,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -3,   5, -12, 124,
+      18,  -8,   4,  -2,   1,   0,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7},
-    {-17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119,
-     -17, 119, -17, 119},
-    {28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11},
-    {5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    { -1,   2,  -4,   8, -17, 120,  28, -11,
+       6,  -3,   1,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,   8, -17, 120,
+      28, -11,   6,  -3,   1,  -1,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {-20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114,
-     -20, 114, -20, 114},
-    {38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14},
-    {7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    { -1,   2,  -4,  10, -21, 114,  38, -15,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,  10, -21, 114,
+      38, -15,   8,  -4,   2,  -1,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9},
-    {-22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107,
-     -22, 107, -22, 107},
-    {49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    { -1,   3,  -5,  11, -23, 107,  49, -18,
+       9,  -5,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -5,  11, -23, 107,
+      49, -18,   9,  -5,   2,  -1,   0,   0, },
   },
   {
-    {0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
-    {-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
-    {-24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99},
-    {59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20},
-    {9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4},
-    {2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
+    { -1,   3,  -6,  12, -25,  99,  60, -21,
+      11,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -25,  99,
+      60, -21,  11,  -6,   3,  -1,   0,   0, },
   },
   {
-    {0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
-    {-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
-    {-24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90},
-    {70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22},
-    {10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
-    {2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
+    { -1,   3,  -6,  12, -25,  90,  70, -23,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -25,  90,
+      70, -23,  12,  -6,   3,  -1,   0,   0, },
   },
   {
-    {0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
-    {-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
-    {-23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80},
-    {80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23, 80, -23},
-    {10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
-    {2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
+    { -1,   3,  -6,  12, -24,  80,  80, -24,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -24,  80,
+      80, -24,  12,  -6,   3,  -1,   0,   0, },
   },
   {
-    {0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
-    {-5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10},
-    {-22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70, -22, 70},
-    {90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24, 90, -24},
-    {10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
-    {2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
+    { -1,   3,  -6,  12, -23,  70,  90, -25,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -23,  70,
+      90, -25,  12,  -6,   3,  -1,   0,   0, },
   },
   {
-    {0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2},
-    {-4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9},
-    {-20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59, -20, 59},
-    {99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24, 99, -24},
-    {10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5, 10, -5},
-    {2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0, 2, 0},
+    { -1,   3,  -6,  11, -21,  60,  99, -25,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  11, -21,  60,
+      99, -25,  12,  -6,   3,  -1,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {-17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49, -17, 49},
-    {107, -22, 107, -22, 107, -22, 107, -22, 107, -22, 107, -22,
-     107, -22, 107, -22},
-    {9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4, 9, -4},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    { -1,   2,  -5,   9, -18,  49, 107, -23,
+      11,  -5,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -5,   9, -18,  49,
+     107, -23,  11,  -5,   3,  -1,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7},
-    {-14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38, -14, 38},
-    {114, -20, 114, -20, 114, -20, 114, -20, 114, -20, 114, -20,
-     114, -20, 114, -20},
-    {8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    { -1,   2,  -4,   8, -15,  38, 114, -21,
+      10,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,   8, -15,  38,
+     114, -21,  10,  -4,   2,  -1,   0,   0, },
   },
   {
-    {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1},
-    {-2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5},
-    {-11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28, -11, 28},
-    {119, -17, 119, -17, 119, -17, 119, -17, 119, -17, 119, -17,
-     119, -17, 119, -17},
-    {7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3, 7, -3},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    { -1,   1,  -3,   6, -11,  28, 120, -17,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   1,  -3,   6, -11,  28,
+     120, -17,   8,  -4,   2,  -1,   0,   0, },
   },
   {
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3, -2, 3},
-    {-7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18, -7, 18},
-    {124, -12, 124, -12, 124, -12, 124, -12, 124, -12, 124, -12,
-     124, -12, 124, -12},
-    {5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2, 5, -2},
-    {1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0},
+    {  0,   1,  -2,   4,  -8,  18, 124, -12,
+       5,  -3,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   4,  -8,  18,
+     124, -12,   5,  -3,   1,   0,   0,   0, },
   },
   {
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
-    {-1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2, -1, 2},
-    {-4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8, -4, 8},
-    {127, -6, 127, -6, 127, -6, 127, -6, 127, -6, 127, -6,
-     127, -6, 127, -6},
-    {3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1, 3, -1},
-    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+    {  0,   0,  -1,   2,  -4,   8, 127,  -7,
+       3,  -1,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -1,   2,  -4,   8,
+     127,  -7,   3,  -1,   1,   0,   0,   0, },
   },
 };
-#endif  // CONFIG_EXT_INTERP
+#endif  // USE_TEMPORALFILTER_12TAP
+#if USE_TEMPORALFILTER_12TAP
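+// Layout note (inferred from the table below): for each of the 15 sub-pel
+// phases the 12 taps are split into six adjacent coefficient pairs, and
+// each pair is replicated eight times to fill a 16-byte vector, presumably
+// for use with _mm_maddubs_epi16 against interleaved source rows.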
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127,
+      -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   5,  -3,   5,  -3,   5,  -3,   5,
+      -3,   5,  -3,   5,  -3,   5,  -3,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124,
+     -12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8,
+      18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17, 120, -17, 120, -17, 120, -17, 120,
+     -17, 120, -17, 120, -17, 120, -17, 120, },
+    { 28, -11,  28, -11,  28, -11,  28, -11,
+      28, -11,  28, -11,  28, -11,  28, -11, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3,
+       6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  1,  -1,   1,  -1,   1,  -1,   1,  -1,
+       1,  -1,   1,  -1,   1,  -1,   1,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,  10,  -4,  10,  -4,  10,  -4,  10,
+      -4,  10,  -4,  10,  -4,  10,  -4,  10, },
+    {-21, 114, -21, 114, -21, 114, -21, 114,
+     -21, 114, -21, 114, -21, 114, -21, 114, },
+    { 38, -15,  38, -15,  38, -15,  38, -15,
+      38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,  11,  -5,  11,  -5,  11,  -5,  11,
+      -5,  11,  -5,  11,  -5,  11,  -5,  11, },
+    {-23, 107, -23, 107, -23, 107, -23, 107,
+     -23, 107, -23, 107, -23, 107, -23, 107, },
+    { 49, -18,  49, -18,  49, -18,  49, -18,
+      49, -18,  49, -18,  49, -18,  49, -18, },
+    {  9,  -5,   9,  -5,   9,  -5,   9,  -5,
+       9,  -5,   9,  -5,   9,  -5,   9,  -5, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  99, -25,  99, -25,  99, -25,  99,
+     -25,  99, -25,  99, -25,  99, -25,  99, },
+    { 60, -21,  60, -21,  60, -21,  60, -21,
+      60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -6,  11,  -6,  11,  -6,  11,  -6,
+      11,  -6,  11,  -6,  11,  -6,  11,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  90, -25,  90, -25,  90, -25,  90,
+     -25,  90, -25,  90, -25,  90, -25,  90, },
+    { 70, -23,  70, -23,  70, -23,  70, -23,
+      70, -23,  70, -23,  70, -23,  70, -23, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24,  80, -24,  80, -24,  80, -24,  80,
+     -24,  80, -24,  80, -24,  80, -24,  80, },
+    { 80, -24,  80, -24,  80, -24,  80, -24,
+      80, -24,  80, -24,  80, -24,  80, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-23,  70, -23,  70, -23,  70, -23,  70,
+     -23,  70, -23,  70, -23,  70, -23,  70, },
+    { 90, -25,  90, -25,  90, -25,  90, -25,
+      90, -25,  90, -25,  90, -25,  90, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  11,  -6,  11,  -6,  11,  -6,  11,
+      -6,  11,  -6,  11,  -6,  11,  -6,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60,
+     -21,  60, -21,  60, -21,  60, -21,  60, },
+    { 99, -25,  99, -25,  99, -25,  99, -25,
+      99, -25,  99, -25,  99, -25,  99, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -5,   9,  -5,   9,  -5,   9,  -5,   9,
+      -5,   9,  -5,   9,  -5,   9,  -5,   9, },
+    {-18,  49, -18,  49, -18,  49, -18,  49,
+     -18,  49, -18,  49, -18,  49, -18,  49, },
+    {107, -23, 107, -23, 107, -23, 107, -23,
+     107, -23, 107, -23, 107, -23, 107, -23, },
+    { 11,  -5,  11,  -5,  11,  -5,  11,  -5,
+      11,  -5,  11,  -5,  11,  -5,  11,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38,
+     -15,  38, -15,  38, -15,  38, -15,  38, },
+    {114, -21, 114, -21, 114, -21, 114, -21,
+     114, -21, 114, -21, 114, -21, 114, -21, },
+    { 10,  -4,  10,  -4,  10,  -4,  10,  -4,
+      10,  -4,  10,  -4,  10,  -4,  10,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   1,  -1,   1,  -1,   1,  -1,   1,
+      -1,   1,  -1,   1,  -1,   1,  -1,   1, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6,
+      -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-11,  28, -11,  28, -11,  28, -11,  28,
+     -11,  28, -11,  28, -11,  28, -11,  28, },
+    {120, -17, 120, -17, 120, -17, 120, -17,
+     120, -17, 120, -17, 120, -17, 120, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18,
+      -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12,
+     124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -3,   5,  -3,   5,  -3,   5,  -3,
+       5,  -3,   5,  -3,   5,  -3,   5,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7,
+     127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif  // USE_TEMPORALFILTER_12TAP
diff --git a/vp10/common/x86/vp10_convolve_ssse3.c b/vp10/common/x86/vp10_convolve_ssse3.c
index 75520c9..07dc11d 100644
--- a/vp10/common/x86/vp10_convolve_ssse3.c
+++ b/vp10/common/x86/vp10_convolve_ssse3.c
@@ -81,8 +81,9 @@
 static store_pixel_t store4pixelTab[2] = {
   store_4_pixel_only, accumulate_store_4_pixel};
 
-void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
-                    int tapsNum, store_pixel_t store_func, uint8_t *dst) {
+static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
+                           int tapsNum, store_pixel_t store_func,
+                           uint8_t *dst) {
   __m128i sumPairRow[4];
   __m128i sumPairCol[8];
   __m128i pixel;
@@ -122,40 +123,40 @@
   store_func(&sumPairRow[1], dst);
 }
 
-void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                    store_pixel_t store, uint8_t *buf) {
+static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                           store_pixel_t store, uint8_t *buf) {
   horiz_w4_ssse3(src, f, tapsNum, store, buf);
   src += 4;
   buf += 4;
   horiz_w4_ssse3(src, f, tapsNum, store, buf);
 }
 
-void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                     store_pixel_t store, uint8_t *buf) {
+static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                            store_pixel_t store, uint8_t *buf) {
   horiz_w8_ssse3(src, f, tapsNum, store, buf);
   src += 8;
   buf += 8;
   horiz_w8_ssse3(src, f, tapsNum, store, buf);
 }
 
-void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                     store_pixel_t store, uint8_t *buf) {
+static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                            store_pixel_t store, uint8_t *buf) {
   horiz_w16_ssse3(src, f, tapsNum, store, buf);
   src += 16;
   buf += 16;
   horiz_w16_ssse3(src, f, tapsNum, store, buf);
 }
 
-void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                     store_pixel_t store, uint8_t *buf) {
+static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                            store_pixel_t store, uint8_t *buf) {
   horiz_w32_ssse3(src, f, tapsNum, store, buf);
   src += 32;
   buf += 32;
   horiz_w32_ssse3(src, f, tapsNum, store, buf);
 }
 
-void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
-                      store_pixel_t store, uint8_t *buf) {
+static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                             store_pixel_t store, uint8_t *buf) {
   horiz_w64_ssse3(src, f, tapsNum, store, buf);
   src += 64;
   buf += 64;
@@ -172,8 +173,8 @@
   horiz_w128_ssse3,
 };
 
-void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum, int width,
-                        store_pixel_t store, uint8_t *dst) {
+static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
+                               int width, store_pixel_t store, uint8_t *dst) {
   switch (width) {
     // Note:
     // For width=2 and 4, store function must be different
@@ -813,9 +814,10 @@
   store_func(&sum, dst);
 }
 
-void filter_vert_compute_small(const uint8_t *src, int src_stride, __m128i *f,
-                               int tapsNum, store_pixel_t store_func, int h,
-                               uint8_t *dst, int dst_stride) {
+static void filter_vert_compute_small(const uint8_t *src, int src_stride,
+                                      __m128i *f, int tapsNum,
+                                      store_pixel_t store_func, int h,
+                                      uint8_t *dst, int dst_stride) {
   int rowIndex = 0;
   do {
     filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
@@ -826,9 +828,10 @@
   } while (rowIndex < h);
 }
 
-void filter_vert_compute_large(const uint8_t *src, int src_stride, __m128i *f,
-                               int tapsNum, store_pixel_t store_func, int w,
-                               int h, uint8_t *dst, int dst_stride) {
+static void filter_vert_compute_large(const uint8_t *src, int src_stride,
+                                      __m128i *f, int tapsNum,
+                                      store_pixel_t store_func, int w, int h,
+                                      uint8_t *dst, int dst_stride) {
   int col;
   int rowIndex = 0;
   const uint8_t *src_ptr = src;
diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
index 1d70f14..ff04dc8 100644
--- a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
+++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/enums.h"
 #include "vp10/common/vp10_txfm.h"
 #include "vp10/common/x86/vp10_txfm1d_sse4.h"
diff --git a/vp10/common/x86/vp10_fwd_txfm_sse2.c b/vp10/common/x86/vp10_fwd_txfm_sse2.c
index 032c3cc..30bce5f 100644
--- a/vp10/common/x86/vp10_fwd_txfm_sse2.c
+++ b/vp10/common/x86/vp10_fwd_txfm_sse2.c
@@ -10,6 +10,7 @@
 
 #include <emmintrin.h>  // SSE2
 
+#include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
diff --git a/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c b/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
new file mode 100644
index 0000000..0251022
--- /dev/null
+++ b/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
@@ -0,0 +1,393 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp10/common/filter.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTERP
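+// Layout note (inferred from the table below): six coefficient pairs per
+// sub-pel phase, each pair replicated four times to fill an 8-entry int16
+// vector, matching the per-pair _mm_madd_epi16 accumulation in the
+// high-bitdepth convolve code added below.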
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6, 127,  -6, 127,  -6, 127,  -6, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -7,  18,  -7,  18,  -7,  18,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-17, 119, -17, 119, -17, 119, -17, 119, },
+    { 28, -11,  28, -11,  28, -11,  28, -11, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-20, 114, -20, 114, -20, 114, -20, 114, },
+    { 38, -14,  38, -14,  38, -14,  38, -14, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-22, 107, -22, 107, -22, 107, -22, 107, },
+    { 49, -17,  49, -17,  49, -17,  49, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  99, -24,  99, -24,  99, -24,  99, },
+    { 59, -20,  59, -20,  59, -20,  59, -20, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  90, -24,  90, -24,  90, -24,  90, },
+    { 70, -22,  70, -22,  70, -22,  70, -22, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-23,  80, -23,  80, -23,  80, -23,  80, },
+    { 80, -23,  80, -23,  80, -23,  80, -23, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-22,  70, -22,  70, -22,  70, -22,  70, },
+    { 90, -24,  90, -24,  90, -24,  90, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-20,  59, -20,  59, -20,  59, -20,  59, },
+    { 99, -24,  99, -24,  99, -24,  99, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17,  49, -17,  49, -17,  49, -17,  49, },
+    {107, -22, 107, -22, 107, -22, 107, -22, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-14,  38, -14,  38, -14,  38, -14,  38, },
+    {114, -20, 114, -20, 114, -20, 114, -20, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-11,  28, -11,  28, -11,  28, -11,  28, },
+    {119, -17, 119, -17, 119, -17, 119, -17, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7,  18,  -7,  18,  -7,  18,  -7,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -6, 127,  -6, 127,  -6, 127,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+};
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-13, 124, -13, 124, -13, 124, -13, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-18, 120, -18, 120, -18, 120, -18, 120, },
+    { 28, -12,  28, -12,  28, -12,  28, -12, },
+    {  7,  -4,   7,  -4,   7,  -4,   7,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-21, 115, -21, 115, -21, 115, -21, 115, },
+    { 38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -5,   8,  -5,   8,  -5,   8,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24, 108, -24, 108, -24, 108, -24, 108, },
+    { 49, -18,  49, -18,  49, -18,  49, -18, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25, 100, -25, 100, -25, 100, -25, 100, },
+    { 60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -7,  11,  -7,  11,  -7,  11,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-26,  91, -26,  91, -26,  91, -26,  91, },
+    { 71, -24,  71, -24,  71, -24,  71, -24, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25,  81, -25,  81, -25,  81, -25,  81, },
+    { 81, -25,  81, -25,  81, -25,  81, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-24,  71, -24,  71, -24,  71, -24,  71, },
+    { 91, -26,  91, -26,  91, -26,  91, -26, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  11,  -7,  11,  -7,  11,  -7,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60, },
+    {100, -25, 100, -25, 100, -25, 100, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-18,  49, -18,  49, -18,  49, -18,  49, },
+    {108, -24, 108, -24, 108, -24, 108, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,   8,  -5,   8,  -5,   8,  -5,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38, },
+    {115, -21, 115, -21, 115, -21, 115, -21, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   7,  -4,   7,  -4,   7,  -4,   7, },
+    {-12,  28, -12,  28, -12,  28, -12,  28, },
+    {120, -18, 120, -18, 120, -18, 120, -18, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -13, 124, -13, 124, -13, 124, -13, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   5,  -3,   5,  -3,   5,  -3,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17, 120, -17, 120, -17, 120, -17, 120, },
+    { 28, -11,  28, -11,  28, -11,  28, -11, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  1,  -1,   1,  -1,   1,  -1,   1,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,  10,  -4,  10,  -4,  10,  -4,  10, },
+    {-21, 114, -21, 114, -21, 114, -21, 114, },
+    { 38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,  11,  -5,  11,  -5,  11,  -5,  11, },
+    {-23, 107, -23, 107, -23, 107, -23, 107, },
+    { 49, -18,  49, -18,  49, -18,  49, -18, },
+    {  9,  -5,   9,  -5,   9,  -5,   9,  -5, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  99, -25,  99, -25,  99, -25,  99, },
+    { 60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -6,  11,  -6,  11,  -6,  11,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  90, -25,  90, -25,  90, -25,  90, },
+    { 70, -23,  70, -23,  70, -23,  70, -23, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24,  80, -24,  80, -24,  80, -24,  80, },
+    { 80, -24,  80, -24,  80, -24,  80, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-23,  70, -23,  70, -23,  70, -23,  70, },
+    { 90, -25,  90, -25,  90, -25,  90, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  11,  -6,  11,  -6,  11,  -6,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60, },
+    { 99, -25,  99, -25,  99, -25,  99, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -5,   9,  -5,   9,  -5,   9,  -5,   9, },
+    {-18,  49, -18,  49, -18,  49, -18,  49, },
+    {107, -23, 107, -23, 107, -23, 107, -23, },
+    { 11,  -5,  11,  -5,  11,  -5,  11,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38, },
+    {114, -21, 114, -21, 114, -21, 114, -21, },
+    { 10,  -4,  10,  -4,  10,  -4,  10,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   1,  -1,   1,  -1,   1,  -1,   1, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-11,  28, -11,  28, -11,  28, -11,  28, },
+    {120, -17, 120, -17, 120, -17, 120, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -3,   5,  -3,   5,  -3,   5,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif  // USE_TEMPORALFILTER_12TAP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/x86/vp10_highbd_convolve_sse4.c b/vp10/common/x86/vp10_highbd_convolve_sse4.c
new file mode 100644
index 0000000..e828178
--- /dev/null
+++ b/vp10/common/x86/vp10_highbd_convolve_sse4.c
@@ -0,0 +1,474 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/filter.h"
+
+typedef void (*TransposeSave)(const int width, int pixelsNum,
+                              uint32_t *src, int src_stride,
+                              uint16_t *dst, int dst_stride,
+                              int bd);
+
+// pixelsNum 0: write all 4 rows of pixels
+//           1/2/3: write only the residual 1/2/3 rows
+static void writePixel(__m128i *u, int width, int pixelsNum,
+                       uint16_t *dst, int dst_stride) {
+  if (2 == width) {
+    if (0 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+      *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
+    } else if (1 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+    } else if (2 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+    } else if (3 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+    }
+  } else {
+    if (0 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+      _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
+    } else if (1 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+    } else if (2 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+    } else if (3 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+    }
+  }
+}
+
+// Clip 16-bit pixels to the range implied by bd (10 or 12).
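+// Scalar sketch of the clamp below, with max = (1 << bd) - 1 and signed
+// 16-bit lane comparisons:
+//   pel = pel > max ? max : pel;
+//   pel = pel > 0 ? pel : 0;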
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i clamped, mask;
+  int i;
+
+  for (i = 0; i < numVecs; i++) {
+    mask = _mm_cmpgt_epi16(p[i], max);
+    clamped = _mm_andnot_si128(mask, p[i]);
+    mask = _mm_and_si128(mask, max);
+    clamped = _mm_or_si128(mask, clamped);
+    mask = _mm_cmpgt_epi16(clamped, zero);
+    p[i] = _mm_and_si128(clamped, mask);
+  }
+}
+
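+// Rounds and narrows a 4x4 block of 32-bit filter sums into clipped 16-bit
+// pixels, transposing the block back to row order in u[0]..u[3].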
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+  __m128i v0, v1;
+  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+
+  u[0] = _mm_loadu_si128((__m128i const *)src);
+  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+  u[0] = _mm_add_epi32(u[0], rnd);
+  u[1] = _mm_add_epi32(u[1], rnd);
+  u[2] = _mm_add_epi32(u[2], rnd);
+  u[3] = _mm_add_epi32(u[3], rnd);
+
+  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
+  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
+  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
+
+  u[0] = _mm_packus_epi32(u[0], u[1]);
+  u[1] = _mm_packus_epi32(u[2], u[3]);
+
+  highbd_clip(u, 2, bd);
+
+  v0 = _mm_unpacklo_epi16(u[0], u[1]);
+  v1 = _mm_unpackhi_epi16(u[0], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(v0, v1);
+  u[2] = _mm_unpackhi_epi16(v0, v1);
+
+  u[1] = _mm_srli_si128(u[0], 8);
+  u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// pixelsNum = 0     : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : residual 1/2/3 rows of pixels will be saved.
+static void trans_save_4x4(const int width, int pixelsNum,
+                           uint32_t *src, int src_stride,
+                           uint16_t *dst, int dst_stride,
+                           int bd) {
+  __m128i u[4];
+  transClipPixel(src, src_stride, u, bd);
+  writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+static void trans_accum_save_4x4(const int width, int pixelsNum,
+                                 uint32_t *src, int src_stride,
+                                 uint16_t *dst, int dst_stride,
+                                 int bd) {
+  __m128i u[4], v[4];
+  const __m128i ones = _mm_set1_epi16(1);
+
+  transClipPixel(src, src_stride, u, bd);
+
+  v[0] = _mm_loadl_epi64((__m128i const *)dst);
+  v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+  v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+  v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+  u[0] = _mm_add_epi16(u[0], v[0]);
+  u[1] = _mm_add_epi16(u[1], v[1]);
+  u[2] = _mm_add_epi16(u[2], v[2]);
+  u[3] = _mm_add_epi16(u[3], v[3]);
+
+  u[0] = _mm_add_epi16(u[0], ones);
+  u[1] = _mm_add_epi16(u[1], ones);
+  u[2] = _mm_add_epi16(u[2], ones);
+  u[3] = _mm_add_epi16(u[3], ones);
+
+  u[0] = _mm_srai_epi16(u[0], 1);
+  u[1] = _mm_srai_epi16(u[1], 1);
+  u[2] = _mm_srai_epi16(u[2], 1);
+  u[3] = _mm_srai_epi16(u[3], 1);
+
+  writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+static TransposeSave transSaveTab[2] = {
+  trans_save_4x4, trans_accum_save_4x4};
+
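+// Transposes four rows of eight 16-bit pixels into vectors of column
+// pairs; only the first six outputs are formed, matching the six
+// coefficient-pair vectors consumed by the 10/12-tap filters.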
+static INLINE void transpose_pair(__m128i *in, __m128i *out) {
+  __m128i x0, x1;
+
+  x0 = _mm_unpacklo_epi32(in[0], in[1]);
+  x1 = _mm_unpacklo_epi32(in[2], in[3]);
+
+  out[0] = _mm_unpacklo_epi64(x0, x1);
+  out[1] = _mm_unpackhi_epi64(x0, x1);
+
+  x0 = _mm_unpackhi_epi32(in[0], in[1]);
+  x1 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  out[2] = _mm_unpacklo_epi64(x0, x1);
+  out[3] = _mm_unpackhi_epi64(x0, x1);
+
+  x0 = _mm_unpacklo_epi32(in[4], in[5]);
+  x1 = _mm_unpacklo_epi32(in[6], in[7]);
+
+  out[4] = _mm_unpacklo_epi64(x0, x1);
+  out[5] = _mm_unpackhi_epi64(x0, x1);
+}
+
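+// Horizontally filters one source column position across four rows: loads
+// a 4x16 block, transposes it into six column-pair vectors, and multiply-
+// accumulates them against the six coefficient-pair vectors, leaving one
+// 32-bit sum per row in buf.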
+static void highbd_filter_horiz(const uint16_t *src, int src_stride,
+                                __m128i *f, int tapsNum, uint32_t *buf) {
+  __m128i u[8], v[6];
+
+  if (tapsNum == 10) {
+    src -= 1;
+  }
+
+  u[0] = _mm_loadu_si128((__m128i const *)src);
+  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+  u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
+  u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
+  u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
+  u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
+
+  transpose_pair(u, v);
+
+  u[0] = _mm_madd_epi16(v[0], f[0]);
+  u[1] = _mm_madd_epi16(v[1], f[1]);
+  u[2] = _mm_madd_epi16(v[2], f[2]);
+  u[3] = _mm_madd_epi16(v[3], f[3]);
+  u[4] = _mm_madd_epi16(v[4], f[4]);
+  u[5] = _mm_madd_epi16(v[5], f[5]);
+
+  u[6] = _mm_min_epi32(u[2], u[3]);
+  u[7] = _mm_max_epi32(u[2], u[3]);
+
+  u[0] = _mm_add_epi32(u[0], u[1]);
+  u[0] = _mm_add_epi32(u[0], u[5]);
+  u[0] = _mm_add_epi32(u[0], u[4]);
+  u[0] = _mm_add_epi32(u[0], u[6]);
+  u[0] = _mm_add_epi32(u[0], u[7]);
+
+  _mm_storeu_si128((__m128i *)buf, u[0]);
+}
+
+void vp10_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
+                                       uint16_t *dst, int dst_stride,
+                                       int w, int h,
+                                       const InterpFilterParams filter_params,
+                                       const int subpel_x_q4, int x_step_q4,
+                                       int avg, int bd) {
+  DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
+  __m128i verf[6];
+  HbdSubpelFilterCoeffs vCoeffs;
+  const uint16_t *srcPtr;
+  const int tapsNum = filter_params.taps;
+  int i, col, count, blkResidu, blkHeight;
+  TransposeSave transSave = transSaveTab[avg];
+
+  if (0 == subpel_x_q4 || 16 != x_step_q4) {
+    vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params, subpel_x_q4, x_step_q4, avg,
+                                 bd);
+    return;
+  }
+
+  vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_x_q4 - 1);
+  if (!vCoeffs) {
+    vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params, subpel_x_q4, x_step_q4, avg,
+                                 bd);
+    return;
+  }
+
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+  src -= (tapsNum >> 1) - 1;
+  srcPtr = src;
+
+  count = 0;
+  blkHeight = h >> 2;
+  blkResidu = h & 3;
+
+  while (blkHeight != 0) {
+    for (col = 0; col < w; col += 4) {
+      for (i = 0; i < 4; ++i) {
+        highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+        srcPtr += 1;
+      }
+      transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
+    }
+    count++;
+    srcPtr = src + count * src_stride * 4;
+    dst += dst_stride * 4;
+    blkHeight--;
+  }
+
+  for (col = 0; col < w; col += 4) {
+    for (i = 0; i < 4; ++i) {
+      highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+      srcPtr += 1;
+    }
+    transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
+  }
+}
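+
+// Row blocking: the main loop above emits 4 output rows per iteration
+// (blkHeight = h >> 2); the trailing h & 3 rows reuse the same kernel with
+// pixelsNum = blkResidu so that writePixel stores only the valid rows.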
+
+// Vertical convolution filter
+
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+static void highbdRndingPacks(__m128i *u) {
+  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  u[0] = _mm_add_epi32(u[0], rnd);
+  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+  u[0] = _mm_packus_epi32(u[0], u[0]);
+}
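+
+// Per-lane arithmetic of the helper above: each 32-bit accumulator v becomes
+//   saturate_u16((v + (1 << (FILTER_BITS - 1))) >> FILTER_BITS)
+// i.e. round-half-up by 2^FILTER_BITS, then an unsigned-saturating pack to
+// 16 bits; callers then apply highbd_clip() to stay in [0, (1 << bd) - 1].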
+
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
+}
+
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+  const __m128i ones = _mm_set1_epi16(1);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  v = _mm_add_epi16(v, u[0]);
+  v = _mm_add_epi16(v, ones);
+  v = _mm_srai_epi16(v, 1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
+}
+
+static const WritePixels write2pixelsTab[2] = {
+  write2pixelsOnly, write2pixelsAccum
+};
+
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  _mm_storel_epi64((__m128i *)dst, u[0]);
+}
+
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+  const __m128i ones = _mm_set1_epi16(1);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  v = _mm_add_epi16(v, u[0]);
+  v = _mm_add_epi16(v, ones);
+  v = _mm_srai_epi16(v, 1);
+  _mm_storel_epi64((__m128i *)dst, v);
+}
+
+static const WritePixels write4pixelsTab[2] = {
+  write4pixelsOnly, write4pixelsAccum
+};
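+
+// The avg index selects compound prediction: tab[0] stores the new pixels,
+// tab[1] averages them with what is already in dst as (dst + v + 1) >> 1,
+// e.g. dst = 100, v = 103 gives (100 + 103 + 1) >> 1 = 102.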
+
+static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
+                                       const __m128i *f, int taps,
+                                       uint16_t *dst, WritePixels saveFunc,
+                                       int bd) {
+  __m128i s[12];
+  __m128i zero = _mm_setzero_si128();
+  int i = 0;
+  int r = 0;
+
+  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
+  if (10 == taps) {
+    i += 1;
+    s[0] = zero;
+  }
+  while (i < 12) {
+    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+    i += 1;
+    r += 1;
+  }
+
+  s[0] = _mm_unpacklo_epi16(s[0], s[1]);
+  s[2] = _mm_unpacklo_epi16(s[2], s[3]);
+  s[4] = _mm_unpacklo_epi16(s[4], s[5]);
+  s[6] = _mm_unpacklo_epi16(s[6], s[7]);
+  s[8] = _mm_unpacklo_epi16(s[8], s[9]);
+  s[10] = _mm_unpacklo_epi16(s[10], s[11]);
+
+  s[0] = _mm_madd_epi16(s[0], f[0]);
+  s[2] = _mm_madd_epi16(s[2], f[1]);
+  s[4] = _mm_madd_epi16(s[4], f[2]);
+  s[6] = _mm_madd_epi16(s[6], f[3]);
+  s[8] = _mm_madd_epi16(s[8], f[4]);
+  s[10] = _mm_madd_epi16(s[10], f[5]);
+
+  s[1] = _mm_min_epi32(s[4], s[6]);
+  s[3] = _mm_max_epi32(s[4], s[6]);
+
+  s[0] = _mm_add_epi32(s[0], s[2]);
+  s[0] = _mm_add_epi32(s[0], s[10]);
+  s[0] = _mm_add_epi32(s[0], s[8]);
+  s[0] = _mm_add_epi32(s[0], s[1]);
+  s[0] = _mm_add_epi32(s[0], s[3]);
+
+  saveFunc(s, bd, dst);
+}
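+
+// Scalar equivalent of the vertical step (illustrative; round_shift_clip is
+// hypothetical shorthand for highbdRndingPacks + highbd_clip): four adjacent
+// columns of one output row are computed as
+//
+//   for (x = 0; x < 4; ++x) {
+//     int32_t sum = 0;
+//     for (k = 0; k < taps; ++k)
+//       sum += (int32_t)src[k * src_stride + x] * filter[k];
+//     dst[x] = round_shift_clip(sum, bd);
+//   }
+//
+// For 10 taps the first of the twelve rows is zeroed (s[0] = zero) so the
+// same six 2-row madd pairs serve both filter lengths.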
+
+static void highbd_filter_vert_compute_large(const uint16_t *src,
+                                             int src_stride,
+                                             const __m128i *f, int taps,
+                                             int w, int h,
+                                             uint16_t *dst, int dst_stride,
+                                             int avg, int bd) {
+  int col;
+  int rowIndex = 0;
+  const uint16_t *src_ptr = src;
+  uint16_t *dst_ptr = dst;
+  const int step = 4;
+  WritePixels write4pixels = write4pixelsTab[avg];
+
+  do {
+    for (col = 0; col < w; col += step) {
+      filter_vert_horiz_parallel(src_ptr, src_stride, f, taps,
+                                 dst_ptr, write4pixels, bd);
+      src_ptr += step;
+      dst_ptr += step;
+    }
+    rowIndex++;
+    src_ptr = src + rowIndex * src_stride;
+    dst_ptr = dst + rowIndex * dst_stride;
+  } while (rowIndex < h);
+}
+
+static void highbd_filter_vert_compute_small(const uint16_t *src,
+                                             int src_stride,
+                                             const __m128i *f, int taps,
+                                             int w, int h,
+                                             uint16_t *dst, int dst_stride,
+                                             int avg, int bd) {
+  int rowIndex = 0;
+  WritePixels write2pixels = write2pixelsTab[avg];
+  (void)w;
+
+  do {
+    filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels,
+                               bd);
+    rowIndex++;
+    src += src_stride;
+    dst += dst_stride;
+  } while (rowIndex < h);
+}
+
+void vp10_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride,
+                                      int w, int h,
+                                      const InterpFilterParams filter_params,
+                                      const int subpel_y_q4, int y_step_q4,
+                                      int avg, int bd) {
+  __m128i verf[6];
+  HbdSubpelFilterCoeffs vCoeffs;
+  const int tapsNum = filter_params.taps;
+
+  if (0 == subpel_y_q4 || 16 != y_step_q4) {
+    vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params, subpel_y_q4, y_step_q4, avg,
+                                bd);
+    return;
+  }
+
+  vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_y_q4 - 1);
+  if (!vCoeffs) {
+    vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params, subpel_y_q4, y_step_q4, avg,
+                                bd);
+    return;
+  }
+
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+  src -= src_stride * ((tapsNum >> 1) - 1);
+
+  if (w > 2) {
+    highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h,
+                                     dst, dst_stride, avg, bd);
+  } else {
+    highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h,
+                                     dst, dst_stride, avg, bd);
+  }
+}
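+
+// Call sketch (illustrative): the SSE4.1 paths run only for a non-zero
+// subpel phase at unit step (x/y_step_q4 == 16, i.e. no scaling); anything
+// else defers to the C version. E.g. quarter-pel phase 4, no averaging:
+//
+//   vp10_highbd_convolve_vert_sse4_1(src, src_stride, dst, dst_stride, w, h,
+//                                    filter_params, 4 /* subpel_y_q4 */,
+//                                    16 /* y_step_q4 */, 0 /* avg */, bd);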
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 389e40b..3999c94 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -121,7 +121,7 @@
 static struct vp10_token motvar_encodings[MOTION_VARIATIONS];
 #endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
 
-void vp10_encode_token_init() {
+void vp10_encode_token_init(void) {
 #if CONFIG_EXT_TX
   int s;
   for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
diff --git a/vp10/encoder/bitstream.h b/vp10/encoder/bitstream.h
index 0284920..cacdb43 100644
--- a/vp10/encoder/bitstream.h
+++ b/vp10/encoder/bitstream.h
@@ -20,7 +20,7 @@
 
 void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size);
 
-void vp10_encode_token_init();
+void vp10_encode_token_init(void);
 
 static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
   return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 3307393..3810be5 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -160,7 +160,7 @@
     next_shortcut = shortcut;
 
     /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
+    if (UNLIKELY(x)) {
       error0 = tokens[next][0].error;
       error1 = tokens[next][1].error;
       /* Evaluate the first possibility for this state. */
@@ -204,7 +204,7 @@
       rate1 = tokens[next][1].rate;
 
       // The threshold of 3 is empirically obtained.
-      if (abs(x) > 3) {
+      if (UNLIKELY(abs(x) > 3)) {
         shortcut = 0;
       } else {
 #if CONFIG_NEW_QUANT
@@ -233,7 +233,7 @@
         best_index[i][1] = best_index[i][0];
         next = i;
 
-        if (!(--band_left)) {
+        if (UNLIKELY(!(--band_left))) {
           --band_counts;
           band_left = *band_counts;
           --token_costs;
@@ -255,7 +255,7 @@
       }
 
       if (next_shortcut) {
-        if (next < default_eob) {
+        if (LIKELY(next < default_eob)) {
           if (t0 != EOB_TOKEN) {
             token_cache[rc] = vp10_pt_energy_class[t0];
             pt = get_coef_context(nb, token_cache, i + 1);
@@ -350,7 +350,7 @@
       /* Don't update next, because we didn't add a new node. */
     }
 
-    if (!(--band_left)) {
+    if (UNLIKELY(!(--band_left))) {
       --band_counts;
       band_left = *band_counts;
       --token_costs;
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index f82e74c..9e0a339 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -582,10 +582,10 @@
   }
 }
 
-void vp10_get_entropy_contexts_plane(BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
-                                     const struct macroblockd_plane *pd,
-                                     ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
-                                     ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+static void get_entropy_contexts_plane(
+    BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd,
+    ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+    ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const ENTROPY_CONTEXT *const above = pd->above_context;
@@ -626,7 +626,7 @@
                               ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
                               ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
-  vp10_get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
+  get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
 }
 
 void vp10_mv_pred(VP10_COMP *cpi, MACROBLOCK *x,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index e8209ba..b96e6e4 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -604,11 +604,10 @@
   (void) var[15];
 }
 
-int adst_vs_flipadst(const VP10_COMP *cpi,
-                     BLOCK_SIZE bsize,
-                     uint8_t *src, int src_stride,
-                     uint8_t *dst, int dst_stride,
-                     double *hdist, double *vdist) {
+static int adst_vs_flipadst(const VP10_COMP *cpi, BLOCK_SIZE bsize,
+                            uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            double *hdist, double *vdist) {
   int prune_bitmask = 0;
   double svm_proj_h = 0, svm_proj_v = 0;
   get_energy_distribution_fine(cpi, bsize, src, src_stride,
@@ -1217,7 +1216,7 @@
       sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        sse = ROUNDZ_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+        sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       sse = (int64_t)sse * 16;
 
@@ -3027,7 +3026,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp = ROUNDZ_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   *bsse += tmp * 16;
 
@@ -6664,7 +6663,7 @@
 
     mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
-    sse = ROUNDZ_POWER_OF_TWO(sse, bd_round);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
     rd =  RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -6726,7 +6725,7 @@
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
-    sse = ROUNDZ_POWER_OF_TWO(sse, bd_round);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
     rd =  RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -7909,6 +7908,10 @@
       *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
 
       *distortion = skip_sse_sb;
+      *psse = skip_sse_sb;
+      *rate_y = 0;
+      *rate_uv = 0;
+      *skippable = 1;
     }
 
 #if CONFIG_OBMC || CONFIG_WARPED_MOTION
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 0253b4c..e68e083 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -74,6 +74,10 @@
 VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
 VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_ssse3.c
 VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_filters_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_sse4.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_filters_sse4.c
+endif
 VP10_COMMON_SRCS-yes += common/vp10_convolve.c
 VP10_COMMON_SRCS-yes += common/vp10_convolve.h
 VP10_COMMON_SRCS-$(CONFIG_ANS) += common/ans.h
diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h
new file mode 100644
index 0000000..109183a
--- /dev/null
+++ b/vpx_dsp/blend.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_BLEND_H_
+#define VPX_DSP_BLEND_H_
+
+#include "vpx_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the vpx_blend_* functions in vpx_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A64_ROUND_BITS  6
+#define VPX_BLEND_A64_MAX_ALPHA   (1 << VPX_BLEND_A64_ROUND_BITS)   // 64
+
+#define VPX_BLEND_A64(a, v0, v1)                                              \
+  ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1),     \
+                     VPX_BLEND_A64_ROUND_BITS)
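+
+// Worked example: VPX_BLEND_A64(48, 200, 100)
+//   = ROUND_POWER_OF_TWO(48 * 200 + 16 * 100, 6)
+//   = (11200 + 32) >> 6 = 175,
+// which matches the exact blend (48 * 200 + 16 * 100) / 64 = 175.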
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A256_ROUND_BITS 8
+#define VPX_BLEND_A256_MAX_ALPHA  (1 << VPX_BLEND_A256_ROUND_BITS)  // 256
+
+#define VPX_BLEND_A256(a, v0, v1)                                             \
+  ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1),    \
+                     VPX_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define VPX_BLEND_AVG(v0, v1)   ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+#endif  // VPX_DSP_BLEND_H_
diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c
new file mode 100644
index 0000000..90f3415
--- /dev/null
+++ b/vpx_dsp/blend_a64_hmask.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_hmask_c(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
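+
+// Usage sketch (illustrative): the mask is 1-D here, mask[j] applies to the
+// whole of column j. With mask[] = {0, 21, 43, 64} on a 4x4 block, column 0
+// copies src1, column 3 copies src0 and the middle columns mix:
+//
+//   const uint8_t mask[4] = { 0, 21, 43, 64 };
+//   vpx_blend_a64_hmask_c(dst, dst_stride, src0, src0_stride,
+//                         src1, src1_stride, mask, 4, 4);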
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_c(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c
new file mode 100644
index 0000000..1649798
--- /dev/null
+++ b/vpx_dsp/blend_a64_mask.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for VPX_BLEND_A64 in vpx_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+                          const uint8_t *src0, uint32_t src0_stride,
+                          const uint8_t *src1, uint32_t src1_stride,
+                          const uint8_t *mask, uint32_t mask_stride,
+                          int h, int w, int subh, int subw) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = mask[i * mask_stride + j];
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+                                    mask[i * mask_stride + (2 * j + 1)]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+                                    mask[(2 * i + 1) * mask_stride + j]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  }
+}
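+
+// Sub-sampling note: with subw == 1 && subh == 1 the mask is supplied at 2x
+// the block resolution, so each output pixel takes the rounded average of a
+// 2x2 mask quad, e.g. {60, 62, 58, 64} -> ROUND_POWER_OF_TWO(244, 2) = 61;
+// the 1-D sub-sampled cases average just the horizontal or vertical pair.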
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                 const uint8_t *src0_8, uint32_t src0_stride,
+                                 const uint8_t *src1_8, uint32_t src1_stride,
+                                 const uint8_t *mask, uint32_t mask_stride,
+                                 int h, int w, int subh, int subw, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = mask[i * mask_stride + j];
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+                                    mask[i * mask_stride + (2 * j + 1)]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+                                    mask[(2 * i + 1) * mask_stride + j]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c
new file mode 100644
index 0000000..5d48a83
--- /dev/null
+++ b/vpx_dsp/blend_a64_vmask.c
@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_vmask_c(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    const int m = mask[i];
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_vmask_c(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    const int m = mask[i];
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c
deleted file mode 100644
index 584ee6a..0000000
--- a/vpx_dsp/blend_mask6.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-#include "./vpx_dsp_rtcd.h"
-
-#define MASK_BITS 6
-
-void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
-                       uint8_t *src0, uint32_t src0_stride,
-                       uint8_t *src1, uint32_t src1_stride,
-                       const uint8_t *mask, uint32_t mask_stride,
-                       int h, int w, int subh, int subw) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 = mask[i * mask_stride + j];
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-                               2);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
-                               mask[i * mask_stride + (2 * j + 1)], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
-                               mask[(2 * i + 1) * mask_stride + j], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
-                              uint8_t *src0_8, uint32_t src0_stride,
-                              uint8_t *src1_8, uint32_t src1_stride,
-                              const uint8_t *mask, uint32_t mask_stride,
-                              int h, int w, int subh, int subw, int bd) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
-  uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 = mask[i * mask_stride + j];
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-                               2);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
-                               mask[i * mask_stride + (2 * j + 1)], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
-                               mask[(2 * i + 1) * mask_stride + j], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index bb1daf8..e64dae3 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -452,23 +452,23 @@
 #endif  // CONFIG_VP10 && CONFIG_EXT_INTER
 
 #if CONFIG_VP10 && CONFIG_OBMC
-// a: pred
-// b: target weighted prediction (has been *4096 to keep precision)
-// m: 2d weights (scaled by 4096)
-static INLINE unsigned int obmc_sad(const uint8_t *a, int a_stride,
-                                    const int32_t *b,
-                                    const int32_t *m,
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (has been *4096 to keep precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+                                    const int32_t *wsrc,
+                                    const int32_t *mask,
                                     int width, int height) {
   int y, x;
   unsigned int sad = 0;
 
   for (y = 0; y < height; y++) {
     for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12);
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
 
-    a += a_stride;
-    b += width;
-    m += width;
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
   }
 
   return sad;
@@ -477,8 +477,8 @@
 #define OBMCSADMxN(m, n)                                                      \
 unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride,    \
                                        const int32_t *wsrc,                   \
-                                       const int32_t *msk) {                  \
-  return obmc_sad(ref, ref_stride, wsrc, msk, m, n);                          \
+                                       const int32_t *mask) {                 \
+  return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                         \
 }
 
 #if CONFIG_EXT_PARTITION
@@ -501,21 +501,21 @@
 OBMCSADMxN(4, 4)
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE unsigned int highbd_obmc_sad(const uint8_t *a8, int a_stride,
-                                           const int32_t *b,
-                                           const int32_t *m,
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
                                            int width, int height) {
   int y, x;
   unsigned int sad = 0;
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
 
   for (y = 0; y < height; y++) {
     for (x = 0; x < width; x++)
-      sad += ROUND_POWER_OF_TWO(abs(b[x] - a[x] * m[x]), 12);
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
 
-    a += a_stride;
-    b += width;
-    m += width;
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
   }
 
   return sad;
@@ -525,8 +525,8 @@
 unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref,             \
                                               int ref_stride,                 \
                                               const int32_t *wsrc,            \
-                                              const int32_t *msk) {           \
-  return highbd_obmc_sad(ref, ref_stride, wsrc, msk, m, n);                   \
+                                              const int32_t *mask) {          \
+  return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n);                  \
 }
 
 #if CONFIG_EXT_PARTITION
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 3c519b6..3eb7a9f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -70,10 +70,14 @@
 # inter predictions
 
 ifeq ($(CONFIG_VP10),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
-DSP_SRCS-yes            += blend_mask6.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
-endif  #CONFIG_EXT_INTER
+DSP_SRCS-yes            += blend.h
+DSP_SRCS-yes            += blend_a64_mask.c
+DSP_SRCS-yes            += blend_a64_hmask.c
+DSP_SRCS-yes            += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
 endif  #CONFIG_VP10
 
 # interpolation filters
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e630994..d8055e9 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -959,6 +959,27 @@
   }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
+if (vpx_config("CONFIG_VP10") eq "yes") {
+  #
+  # Alpha blending with mask
+  #
+  add_proto qw/void vpx_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+  add_proto qw/void vpx_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+  add_proto qw/void vpx_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+  specialize "vpx_blend_a64_mask", qw/sse4_1/;
+  specialize "vpx_blend_a64_hmask", qw/sse4_1/;
+  specialize "vpx_blend_a64_vmask", qw/sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+    add_proto qw/void vpx_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+    add_proto qw/void vpx_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+    specialize "vpx_highbd_blend_a64_mask", qw/sse4_1/;
+    specialize "vpx_highbd_blend_a64_hmask", qw/sse4_1/;
+    specialize "vpx_highbd_blend_a64_vmask", qw/sse4_1/;
+  }
+}  # CONFIG_VP10
+
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
 # Block subtraction
@@ -1103,14 +1124,14 @@
 if (vpx_config("CONFIG_OBMC") eq "yes") {
   foreach (@block_sizes) {
     ($w, $h) = @$_;
-    add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask";
+    add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
     specialize "vpx_obmc_sad${w}x${h}", qw/sse4_1/;
   }
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     foreach (@block_sizes) {
       ($w, $h) = @$_;
-      add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *ref_ptr, int ref_stride, const int32_t *wsrc_ptr, const int32_t *mask";
+      add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
       specialize "vpx_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
     }
   }
@@ -1384,14 +1405,6 @@
       }
     }
   }
-
-  add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-  specialize "vpx_blend_mask6", qw/sse4_1/;
-
-  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
-    specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
-  }
 }
 
 #
diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000..a10e077
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx/vpx_integer.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void vpx_blend_a64_hmask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  vpx_blend_a64_mask_sse4_1(dst, dst_stride,
+                            src0, src0_stride,
+                            src1, src1_stride,
+                            mask, 0, h, w, 0, 0);
+}
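+
+// Passing mask_stride == 0 makes the 2-D kernel re-read the same mask row on
+// every output line, which is exactly the 1-D horizontal-mask semantics.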
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w,
+    int bd) {
+  vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride,
+                                   src0_8, src0_stride,
+                                   src1_8, src1_stride,
+                                   mask, 0, h, w, 0, 0, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c
similarity index 61%
rename from vpx_dsp/x86/blend_mask6_sse4.c
rename to vpx_dsp/x86/blend_a64_mask_sse4.c
index 28693a4..cdb40c2 100644
--- a/vpx_dsp/x86/blend_mask6_sse4.c
+++ b/vpx_dsp/x86/blend_a64_mask_sse4.c
@@ -15,62 +15,24 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
 
 #include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
 
 #include "./vpx_dsp_rtcd.h"
 
-#define MASK_BITS 6
-
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_32(src0);
-  const __m128i v_s1_b = xx_loadl_32(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_64(src0);
-  const __m128i v_s1_b = xx_loadl_64(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_w4_sse4_1(
+static void blend_a64_mask_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -92,13 +54,13 @@
   } while (--h);
 }
 
-static void blend_mask6_w8_sse4_1(
+static void blend_a64_mask_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -120,13 +82,13 @@
   } while (--h);
 }
 
-static void blend_mask6_w16n_sse4_1(
+static void blend_a64_mask_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -158,15 +120,15 @@
 // Horizontal sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_sx_w4_sse4_1(
+static void blend_a64_mask_sx_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -190,15 +152,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_w8_sse4_1(
+static void blend_a64_mask_sx_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -222,15 +184,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_w16n_sse4_1(
+static void blend_a64_mask_sx_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -265,13 +227,13 @@
 // Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_sy_w4_sse4_1(
+static void blend_a64_mask_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -296,13 +258,13 @@
   } while (--h);
 }
 
-static void blend_mask6_sy_w8_sse4_1(
+static void blend_a64_mask_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -327,14 +289,14 @@
   } while (--h);
 }
 
-static void blend_mask6_sy_w16n_sse4_1(
+static void blend_a64_mask_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zero = _mm_setzero_si128();
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -368,15 +330,15 @@
 // Horizontal and Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_sx_sy_w4_sse4_1(
+static void blend_a64_mask_sx_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -405,15 +367,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_sy_w8_sse4_1(
+static void blend_a64_mask_sx_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -442,15 +404,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_sy_w16n_sse4_1(
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -495,146 +457,67 @@
 // Dispatch
 //////////////////////////////////////////////////////////////////////////////
 
-void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                            uint8_t *src0, uint32_t src0_stride,
-                            uint8_t *src1, uint32_t src1_stride,
-                            const uint8_t *mask, uint32_t mask_stride,
-                            int h, int w, int suby, int subx) {
+void vpx_blend_a64_mask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, int suby, int subx) {
   typedef  void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
-                            uint8_t *src0, uint32_t src0_stride,
-                            uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *src0, uint32_t src0_stride,
+                            const uint8_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride,
                             int h, int w);
 
-  static blend_fn blend[3][2][2] = {  // width_index X subx X suby
+  // Dimensions are: width_index X subx X suby
+  static const blend_fn blend[3][2][2] = {
     {     // w % 16 == 0
-      {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
-      {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+      {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1},
+      {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1}
     }, {  // w == 4
-      {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
-      {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+      {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1},
+      {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1}
     }, {  // w == 8
-      {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
-      {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+      {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1},
+      {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1}
     }
   };
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
-  assert(h >= 4);
-  assert(w >= 4);
+  assert(h >= 1);
+  assert(w >= 1);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
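+  // Since w and h are powers of two, (h | w) & 3 is non-zero exactly when
+  // either dimension is 1 or 2, i.e. below the 4-pixel granularity of the
+  // SIMD kernels, so those sizes fall back to the C implementation.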
 
-  blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
-                                            src0, src0_stride,
-                                            src1, src1_stride,
-                                            mask, mask_stride,
-                                            h, w);
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_blend_a64_mask_c(dst, dst_stride,
+                         src0, src0_stride,
+                         src1, src1_stride,
+                         mask, mask_stride,
+                         h, w, suby, subx);
+  } else {
+    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+                                              src0, src0_stride,
+                                              src1, src1_stride,
+                                              mask, mask_stride,
+                                              h, w);
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1,
-                                 const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  // Interleave
-  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
-  // Scale
-  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  // Interleave
-  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
-  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
-  // Scale
-  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
-  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-//////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_m0_b = xx_loadl_32(mask);
@@ -652,37 +535,37 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_w4_sse4_1(
+static void blend_a64_mask_b10_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                           src1_stride, mask, mask_stride, h,
-                           blend_4_b10);
+  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
 }
 
-static void blend_mask6_b12_w4_sse4_1(
+static void blend_a64_mask_b12_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                           src1_stride, mask, mask_stride, h,
-                           blend_4_b12);
+  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
 }
 
-static inline void blend_mask6_bn_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -702,41 +585,41 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_w8n_sse4_1(
+static void blend_a64_mask_b10_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, mask_stride, h, w,
-                            blend_8_b10);
+  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
 }
 
-static void blend_mask6_b12_w8n_sse4_1(
+static void blend_a64_mask_b12_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, mask_stride, h, w,
-                            blend_8_b12);
+  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Horizontal sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_r_b = xx_loadl_64(mask);
@@ -756,39 +639,39 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_w4_sse4_1(
+static void blend_a64_mask_b10_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b10);
+  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
 }
 
-static void blend_mask6_b12_sx_w4_sse4_1(
+static void blend_a64_mask_b12_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0,  src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b12);
+  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
 }
 
-static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -810,39 +693,39 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b10);
+  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
 }
 
-static void blend_mask6_b12_sx_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b12);
+  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_ra_b = xx_loadl_32(mask);
@@ -863,37 +746,37 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b10);
+  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
 }
 
-static void blend_mask6_b12_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b12);
+  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
 }
 
-static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -916,41 +799,41 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b10);
+  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
 }
 
-static void blend_mask6_b12_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b12);
+  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Horizontal and Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_ra_b = xx_loadl_64(mask);
@@ -975,39 +858,39 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b10);
+  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h,
+                                    blend_4_b10);
 }
 
-static void blend_mask6_b12_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b12);
+  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h,
+                                    blend_4_b12);
 }
 
-static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -1034,82 +917,91 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
-                                  blend_8_b10);
+  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, h, w,
+                                     blend_8_b10);
 }
 
-static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
-                                  blend_8_b12);
+  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, h, w,
+                                     blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Dispatch
 //////////////////////////////////////////////////////////////////////////////
 
-void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
-                                   uint8_t *src0_8, uint32_t src0_stride,
-                                   uint8_t *src1_8, uint32_t src1_stride,
-                                   const uint8_t *mask, uint32_t mask_stride,
-                                   int h, int w, int suby, int subx, int bd) {
-  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
-  uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
+void vpx_highbd_blend_a64_mask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, int suby, int subx, int bd) {
   typedef  void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
-                            uint16_t *src0, uint32_t src0_stride,
-                            uint16_t *src1, uint32_t src1_stride,
+                            const uint16_t *src0, uint32_t src0_stride,
+                            const uint16_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride,
                             int h, int w);
 
-  static blend_fn blend[2][2][2][2] = {  // bd_index X width_index X subx X suby
+  // Dimensions are: bd_index X width_index X subx X suby
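+  // The width index (w >> 2) & 1 is 1 for w == 4 and 0 for any multiple
+  // of 8.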
+  static const blend_fn blend[2][2][2][2] = {
     {   // bd == 8 or 10
       {     // w % 8 == 0
-        {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
-        {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+        {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1},
+        {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1}
       }, {  // w == 4
-        {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
-        {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+        {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1},
+        {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1}
       }
     },
     {   // bd == 12
       {     // w % 8 == 0
-        {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
-        {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+        {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1},
+        {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1}
       }, {  // w == 4
-        {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
-        {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+        {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1},
+        {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1}
       }
     }
   };
 
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
 
-  assert(h >= 4);
-  assert(w >= 4);
+  assert(h >= 1);
+  assert(w >= 1);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
   assert(bd == 8 || bd == 10 || bd == 12);
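+  // As in the low bit-depth dispatch, dimensions of 1 or 2 (detected by
+  // (h | w) & 3 for power-of-two sizes) fall back to the C version.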
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_highbd_blend_a64_mask_c(dst_8, dst_stride,
+                                src0_8, src0_stride,
+                                src1_8, src1_stride,
+                                mask, mask_stride,
+                                h, w, suby, subx, bd);
+  } else {
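+    // The C fallback above takes the original uint8_t pointers, so the
+    // CONVERT_TO_SHORTPTR widening is done only on the SIMD path.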
+    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
 
-  blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
-                                                      src0, src0_stride,
-                                                      src1, src1_stride,
-                                                      mask, mask_stride,
-                                                      h, w);
+    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+                                                        src0, src0_stride,
+                                                        src1, src1_stride,
+                                                        mask, mask_stride,
+                                                        h, w);
+  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000..4b0f38d
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,293 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
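+// A vertical mask holds one alpha value per row: each iteration broadcasts
+// *mask across the whole vector and advances the mask pointer by one.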
+static void blend_a64_vmask_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0_w, v_m1_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0_w, v_m1_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_a64_vmask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  typedef  void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+                            const uint8_t *src0, uint32_t src0_stride,
+                            const uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, int h, int w);
+
+  // Dimension: width_index
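+  // For power-of-two widths, w & 0xf is 1, 2, 4 or 8 for narrow blocks and
+  // 0 for any multiple of 16; widths 1 and 2 route to the C version.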
+  static const blend_fn blend[9] = {
+    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
+    vpx_blend_a64_vmask_c,        // w == 1
+    vpx_blend_a64_vmask_c,        // w == 2
+    NULL,                         // INVALID
+    blend_a64_vmask_w4_sse4_1,    // w == 4
+    NULL,                         // INVALID
+    NULL,                         // INVALID
+    NULL,                         // INVALID
+    blend_a64_vmask_w8_sse4_1,    // w == 8
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  blend[w & 0xf](dst, dst_stride,
+                 src0, src0_stride,
+                 src1, src1_stride,
+                 mask, h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  (void)w;
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, h,
+                               blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  (void)w;
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, h,
+                               blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, h, w,
+                                blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, h, w,
+                                blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_a64_vmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  typedef  void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+                            const uint16_t *src0, uint32_t src0_stride,
+                            const uint16_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, int h, int w);
+
+  // Dimensions are: bd_index X width_index
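+  // For power-of-two widths, (w >> 2) & 1 selects the w == 4 kernels;
+  // multiples of 8 map to 0.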
+  static const blend_fn blend[2][2] = {
+    {     // bd == 8 or 10
+      blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
+      blend_a64_vmask_b10_w4_sse4_1,   // w == 4
+    }, {  // bd == 12
+      blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
+      blend_a64_vmask_b12_w4_sse4_1,   // w == 4
+    }
+  };
+
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride,
+                                 src0_8, src0_stride,
+                                 src1_8, src1_stride,
+                                 mask, h, w, bd);
+  } else {
+    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+    blend[bd == 12][(w >> 2) & 1](dst, dst_stride,
+                                  src0, src0_stride,
+                                  src1, src1_stride,
+                                  mask, h, w);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000..9b74f90
--- /dev/null
+++ b/vpx_dsp/x86/blend_sse4.h
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_BLEND_SSE4_H_
+#define VPX_DSP_X86_BLEND_SSE4_H_
+
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/x86/synonyms.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
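+// Each kernel returns (src0 * m0 + src1 * m1) / VPX_BLEND_A64_MAX_ALPHA,
+// rounded, for masks satisfying m0 + m1 == VPX_BLEND_A64_MAX_ALPHA
+// (== 1 << VPX_BLEND_A64_ROUND_BITS).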
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_32(src0);
+  const __m128i v_s1_b = xx_loadl_32(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_64(src0);
+  const __m128i v_s1_b = xx_loadl_64(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+                                 const __m128i v_m0_w, const __m128i v_m1_w);
+
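+// At bit depths up to 10, sample * alpha products fit in 16 bits
+// (1023 * 64 == 65472), so the b10 kernels use plain 16-bit multiplies.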
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadl_64(src0);
+  const __m128i v_s1_w = xx_loadl_64(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadu_128(src0);
+  const __m128i v_s1_w = xx_loadu_128(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
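+// At 12 bits the products overflow 16 bits (4095 * 64 > 65535), so the b12
+// kernels accumulate in 32 bits with pmaddwd, scale down by
+// VPX_BLEND_A64_ROUND_BITS - 1 to keep one rounding bit while fitting the
+// signed range of packs, and let xx_round_epu16 apply the final rounding
+// shift.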
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadl_64(src0);
+  const __m128i v_s1_w = xx_loadl_64(src1);
+
+  // Interleave
+  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+  // Scale
+  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d,
+                                          VPX_BLEND_A64_ROUND_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+  // Round
+  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadu_128(src0);
+  const __m128i v_s1_w = xx_loadu_128(src1);
+
+  // Interleave
+  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+  // Scale
+  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d,
+                                           VPX_BLEND_A64_ROUND_BITS - 1);
+  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d,
+                                           VPX_BLEND_A64_ROUND_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+  // Round
+  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_DSP_X86_BLEND_SSE4_H_
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index fd46bef..164ffcf 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -10,6 +10,7 @@
 
 #include <emmintrin.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
diff --git a/vpx_dsp/x86/obmc_sad_sse4.c b/vpx_dsp/x86/obmc_sad_sse4.c
index 57e1428..de12e1d 100644
--- a/vpx_dsp/x86/obmc_sad_sse4.c
+++ b/vpx_dsp/x86/obmc_sad_sse4.c
@@ -21,26 +21,28 @@
 // 8 bit
 ////////////////////////////////////////////////////////////////////////////////
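+// pre is the predictor block, wsrc the already-weighted source and mask the
+// per-pixel OBMC weights; each SAD accumulates the rounded absolute
+// difference |wsrc - pre * mask|.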
 
-static INLINE unsigned int obmc_sad_w4(const uint8_t *a, const int a_stride,
-                                       const int32_t *b, const int32_t *m,
+static INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                       const int pre_stride,
+                                       const int32_t *wsrc,
+                                       const int32_t *mask,
                                        const int height) {
-  const int a_step = a_stride - 4;
+  const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
 
   do {
-    const __m128i v_a_b = xx_loadl_32(a + n);
-    const __m128i v_m_d = xx_load_128(m + n);
-    const __m128i v_b_d = xx_load_128(b + n);
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
 
-    const __m128i v_a_d = _mm_cvtepu8_epi32(v_a_b);
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
 
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d);
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
 
-    const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d);
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
 
     // Rounded absolute difference
@@ -51,39 +53,42 @@
     n += 4;
 
     if (n % 4 == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < 4 * height);
 
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int obmc_sad_w8n(const uint8_t *a, const int a_stride,
-                                        const int32_t *b, const int32_t *m,
-                                        const int width, const int height) {
-  const int a_step = a_stride - width;
+static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
+                                        const int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask,
+                                        const int width,
+                                        const int height) {
+  const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
   assert(width >= 8 && (width & (width - 1)) == 0);
 
   do {
-    const __m128i v_a1_b = xx_loadl_32(a + n + 4);
-    const __m128i v_m1_d = xx_load_128(m + n + 4);
-    const __m128i v_b1_d = xx_load_128(b + n + 4);
-    const __m128i v_a0_b = xx_loadl_32(a + n);
-    const __m128i v_m0_d = xx_load_128(m + n);
-    const __m128i v_b0_d = xx_load_128(b + n);
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
 
-    const __m128i v_a0_d = _mm_cvtepu8_epi32(v_a0_b);
-    const __m128i v_a1_d = _mm_cvtepu8_epi32(v_a1_b);
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
 
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d);
-    const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d);
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
 
-    const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d);
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
 
@@ -97,21 +102,21 @@
     n += 8;
 
     if (n % width == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < width * height);
 
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
 #define OBMCSADWXH(w, h)                                                      \
-unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref,               \
-                                            int ref_stride,                   \
+unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre,               \
+                                            int pre_stride,                   \
                                             const int32_t *wsrc,              \
                                             const int32_t *msk) {             \
   if (w == 4)                                                                 \
-    return obmc_sad_w4(ref, ref_stride, wsrc, msk, h);                        \
+    return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);                        \
   else                                                                        \
-    return obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h);                    \
+    return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);                    \
 }
 
 #if CONFIG_EXT_PARTITION
@@ -138,28 +143,29 @@
 ////////////////////////////////////////////////////////////////////////////////
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *a8,
-                                           const int a_stride,
-                                           const int32_t *b, const int32_t *m,
+static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                           const int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
                                            const int height) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const int a_step = a_stride - 4;
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - 4;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
 
   do {
-    const __m128i v_a_w = xx_loadl_64(a + n);
-    const __m128i v_m_d = xx_load_128(m + n);
-    const __m128i v_b_d = xx_load_128(b + n);
+    const __m128i v_p_w = xx_loadl_64(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
 
-    const __m128i v_a_d = _mm_cvtepu16_epi32(v_a_w);
+    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
 
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am_d = _mm_madd_epi16(v_a_d, v_m_d);
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
 
-    const __m128i v_diff_d = _mm_sub_epi32(v_b_d, v_am_d);
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
     const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
 
     // Rounded absolute difference
@@ -170,41 +176,43 @@
     n += 4;
 
     if (n % 4 == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < 4 * height);
 
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
-static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *a8,
-                                            const int a_stride,
-                                            const int32_t *b, const int32_t *m,
-                                            const int width, const int height) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const int a_step = a_stride - width;
+static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int width,
+                                            const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - width;
   int n = 0;
   __m128i v_sad_d = _mm_setzero_si128();
   assert(width >= 8 && (width & (width - 1)) == 0);
 
   do {
-    const __m128i v_a1_w = xx_loadl_64(a + n + 4);
-    const __m128i v_m1_d = xx_load_128(m + n + 4);
-    const __m128i v_b1_d = xx_load_128(b + n + 4);
-    const __m128i v_a0_w = xx_loadl_64(a + n);
-    const __m128i v_m0_d = xx_load_128(m + n);
-    const __m128i v_b0_d = xx_load_128(b + n);
+    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_w = xx_loadl_64(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
 
-    const __m128i v_a0_d = _mm_cvtepu16_epi32(v_a0_w);
-    const __m128i v_a1_d = _mm_cvtepu16_epi32(v_a1_w);
+    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
 
-    // Values in both a and m fit in 15 bits, and are packed at 32 bit
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
     // boundaries. We use pmaddwd, as it has lower latency on Haswell
     // than pmulld but produces the same result with these inputs.
-    const __m128i v_am0_d = _mm_madd_epi16(v_a0_d, v_m0_d);
-    const __m128i v_am1_d = _mm_madd_epi16(v_a1_d, v_m1_d);
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
 
-    const __m128i v_diff0_d = _mm_sub_epi32(v_b0_d, v_am0_d);
-    const __m128i v_diff1_d = _mm_sub_epi32(v_b1_d, v_am1_d);
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
     const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
     const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
 
@@ -218,21 +226,21 @@
     n += 8;
 
     if (n % width == 0)
-      a += a_step;
+      pre += pre_step;
   } while (n < width * height);
 
   return xx_hsum_epi32_si32(v_sad_d);
 }
 
 #define HBD_OBMCSADWXH(w, h)                                                  \
-unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *ref,        \
-                                                   int ref_stride,            \
+unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre,        \
+                                                   int pre_stride,            \
                                                    const int32_t *wsrc,       \
-                                                   const int32_t *msk) {      \
+                                                   const int32_t *mask) {     \
   if (w == 4)                                                                 \
-    return hbd_obmc_sad_w4(ref, ref_stride, wsrc, msk, h);                    \
+    return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);                   \
   else                                                                        \
-    return hbd_obmc_sad_w8n(ref, ref_stride, wsrc, msk, w, h);                \
+    return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h);               \
 }
 
 #if CONFIG_EXT_PARTITION
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index 1dca1a8..48549ce 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -38,24 +38,15 @@
 #define __builtin_prefetch(x)
 #endif
 
-/* Shift down with rounding for use when n > 0 */
+/* Shift down with rounding for use when n >= 0, value >= 0 */
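+/* ((1 << (n)) >> 1) evaluates to 0 when n == 0, making the macro well
+ * defined for n == 0, where 1 << ((n) - 1) would not be. */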
 #define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
+    (((value) + (((1 << (n)) >> 1))) >> (n))
 
-/* Shift down with rounding for use when n >= 0 */
-#define ROUNDZ_POWER_OF_TWO(value, n) \
-    ((n) ? (((value) + (1 << ((n) - 1))) >> (n)) : (value))
-
-/* Shift down with rounding for signed integers, for use when n > 0 */
+/* Shift down with rounding for signed integers, for use when n >= 0 */
 #define ROUND_POWER_OF_TWO_SIGNED(value, n) \
     (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                    : ROUND_POWER_OF_TWO((value), (n)))
 
-/* Shift down with rounding for signed integers, for use when n >= 0 */
-#define ROUNDZ_POWER_OF_TWO_SIGNED(value, n) \
-    (((value) < 0) ? -ROUNDZ_POWER_OF_TWO(-(value), (n)) \
-     : ROUNDZ_POWER_OF_TWO((value), (n)))
-
 #define ALIGN_POWER_OF_TWO(value, n) \
     (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))