Merge "Cosmetics for vp10/common/vp10_rtcd_defs.pl" into nextgenv2

diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 0000000..03e9b7d
--- /dev/null
+++ b/test/blend_a64_mask_1d_test.cc

@@ -0,0 +1,374 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>  // F: function type, T: buffer element type
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth;
+
+  BlendA64Mask1DTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+  virtual ~BlendA64Mask1DTest() {}
+  // Calls ref_func_ and tst_func_ on the given sources (see subclasses).
+  virtual void Execute(T *p_src0, T *p_src1) = 0;
+  // Picks a random block geometry, runs Execute() and compares outputs.
+  void Common() {
+    w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+
+    dst_offset_ = rng_(33);
+    dst_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = rng_(33);
+    src0_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = rng_(33);
+    src1_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+    // Sources are either separate or one of them aliases dst_tst_.
+    T *p_src0;
+    T *p_src1;
+
+    switch (rng_(3)) {
+      case 0:   // Separate sources
+        p_src0 = src0_;
+        p_src1 = src1_;
+        break;
+      case 1:   // src0 == dst
+        p_src0 = dst_tst_;
+        src0_stride_ = dst_stride_;
+        src0_offset_ = dst_offset_;
+        p_src1 = src1_;
+        break;
+      case 2:   // src1 == dst
+        p_src0 = src0_;
+        p_src1 = dst_tst_;
+        src1_stride_ = dst_stride_;
+        src1_offset_ = dst_offset_;
+        break;
+      default:
+        FAIL();
+    }
+
+    Execute(p_src0, p_src1);
+    // Only the w_ x h_ block starting at dst_offset_ is compared.
+    for (int r = 0 ; r < h_ ; ++r) {
+      for (int c = 0 ; c < w_ ; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+      }
+    }
+  }
+
+  ACMRandom rng_;
+
+  T dst_ref_[kBufSize];
+  T dst_tst_[kBufSize];
+  size_t dst_stride_;
+  size_t dst_offset_;
+
+  T src0_[kBufSize];
+  size_t src0_stride_;
+  size_t src0_offset_;
+
+  T src1_[kBufSize];
+  size_t src1_stride_;
+  size_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];  // 1D mask shared by ref and tested calls
+
+  int w_;  // block width chosen by Common()
+  int h_;  // block height chosen by Common()
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, int h, int w);
+
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+    ref_func_(dst_ref_ + dst_offset_, dst_stride_,
+              p_src0 + src0_offset_, src0_stride_,
+              p_src1 + src1_offset_, src1_stride_,
+              mask_, h_, w_);
+    // Tested call runs under the register state check, like the HBD fixture.
+    ASM_REGISTER_STATE_CHECK(tst_func_(dst_tst_ + dst_offset_, dst_stride_,
+                                       p_src0 + src0_offset_, src0_stride_,
+                                       p_src1 + src1_offset_, src1_stride_,
+                                       mask_, h_, w_));
+  }
+};
+
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+    // Mask values: rng_(n) presumably yields [0, n), i.e. 0..MAX_ALPHA.
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(2) + 254;
+      dst_tst_[i] = rng_(2) + 254;
+      src0_[i] = rng_(2) + 254;
+      src1_[i] = rng_(2) + 254;
+    }
+    // Pixels are 254/255 and mask MAX_ALPHA-1/MAX_ALPHA if rng_(2) is 0/1.
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+static void blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+  // Reference: copy mask[] into every row, then run the 2D C blend on it.
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[col];
+
+  vpx_blend_a64_mask_c(dst, dst_stride,
+                       src0, src0_stride,
+                       src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+                       h, w, 0, 0);
+}
+
+static void blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+  // Reference: copy mask[] down every column, then run the 2D C blend.
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[row];
+
+  vpx_blend_a64_mask_c(dst, dst_stride,
+                       src0, src0_stride,
+                       src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+                       h, w, 0, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(
+  C, BlendA64Mask1DTest8B,
+  ::testing::Values(
+    make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_c),
+    make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1, BlendA64Mask1DTest8B,
+  ::testing::Values(
+    make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_sse4_1),
+    make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, int h, int w, int bd);
+
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+  void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+    ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+              CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+              CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+              mask_, h_, w_, bit_depth_);
+    // Only the tested function runs under the register state check.
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+                CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                mask_, h_, w_, bit_depth_));
+  }
+
+  int bit_depth_;  // 8, 10 or 12; set by each test before Common()
+};
+
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+    case 0:
+      bit_depth_ = 8;
+      break;
+    case 1:
+      bit_depth_ = 10;
+      break;
+    default:
+      bit_depth_ = 12;
+      break;
+    }
+    // hi is one past the largest sample value for the chosen bit depth.
+    const int hi = 1 << bit_depth_;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi);
+      dst_tst_[i] = rng_(hi);
+      src0_[i] = rng_(hi);
+      src1_[i] = rng_(hi);
+    }
+    // Mask values: rng_(n) presumably yields [0, n), i.e. 0..MAX_ALPHA.
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+  for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+    case 0:
+      bit_depth_ = 8;
+      break;
+    case 1:
+      bit_depth_ = 10;
+      break;
+    default:
+      bit_depth_ = 12;
+      break;
+    }
+
+    const int hi = 1 << bit_depth_;
+    const int lo = hi - 2;
+    // rng_(hi - lo) + lo is hi - 2 or hi - 1 if rng_(2) yields 0 or 1.
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi - lo) + lo;
+      dst_tst_[i] = rng_(hi - lo) + lo;
+      src0_[i] = rng_(hi - lo) + lo;
+      src1_[i] = rng_(hi - lo) + lo;
+    }
+    // Mask is MAX_ALPHA - 1 or MAX_ALPHA under the same assumption.
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+static void highbd_blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+  // Reference: copy mask[] into every row, then run the 2D highbd C blend.
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[col];
+
+  vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+                              src0, src0_stride,
+                              src1, src1_stride,
+                              &mask2d[0][0],
+                              BlendA64Mask1DTestHBD::kMaxMaskSize,
+                              h, w, 0, 0, bd);
+}
+
+static void highbd_blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+  // Reference: copy mask[] down every column, then run the 2D highbd C blend.
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[row];
+
+  vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+                              src0, src0_stride,
+                              src1, src1_stride,
+                              &mask2d[0][0],
+                              BlendA64Mask1DTestHBD::kMaxMaskSize,
+                              h, w, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_CASE_P(
+  C, BlendA64Mask1DTestHBD,
+  ::testing::Values(
+    make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_c),
+    make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1, BlendA64Mask1DTestHBD,
+  ::testing::Values(
+    make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_sse4_1),
+    make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace

diff --git a/test/blend_mask6_test.cc b/test/blend_a64_mask_test.cc
similarity index 77%
rename from test/blend_mask6_test.cc
rename to test/blend_a64_mask_test.cc
index 6afaad7..08ee91d 100644
--- a/test/blend_mask6_test.cc
+++ b/test/blend_a64_mask_test.cc

@@ -26,6 +26,8 @@
 #include "test/acm_random.h"
 #include "vp10/common/enums.h"
 
+#include "vpx_dsp/blend.h"
+
 using libvpx_test::ACMRandom;
 using libvpx_test::FunctionEquivalenceTest;
 using std::tr1::make_tuple;
@@ -33,7 +35,7 @@
 namespace {
 
 template<typename F, typename T>
-class BlendMask6Test : public FunctionEquivalenceTest<F> {
+class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
  protected:
   static const int kIterations = 10000;
   static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
@@ -42,15 +44,15 @@
   static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
   static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
 
-  BlendMask6Test() : rng_(ACMRandom::DeterministicSeed()) {}
+  BlendA64MaskTest() : rng_(ACMRandom::DeterministicSeed()) {}
 
-  virtual ~BlendMask6Test() {}
+  virtual ~BlendA64MaskTest() {}
 
-  virtual void Execute(T *p_src0, T *p_src1) = 0;
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
 
   void Common() {
-    w_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
-    h_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
+    w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
 
     subx_ = rng_(2);
     suby_ = rng_(2);
@@ -131,14 +133,14 @@
 //////////////////////////////////////////////////////////////////////////////
 
 typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
-                      uint8_t *src0, uint32_t src0_stride,
-                      uint8_t *src1, uint32_t src1_stride,
-                      const uint8_t *mask, uint32_t mask_stride,
-                      int h, int w, int suby, int subx);
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, uint32_t mask_stride,
+                    int h, int w, int suby, int subx);
 
-class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
  protected:
-  void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
     ref_func_(dst_ref_ + dst_offset_, dst_stride_,
               p_src0 + src0_offset_, src0_stride_,
               p_src1 + src1_offset_, src1_stride_,
@@ -153,7 +155,7 @@
   }
 };
 
-TEST_P(BlendMask6Test8B, RandomValues) {
+TEST_P(BlendA64MaskTest8B, RandomValues) {
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
     for (int i = 0 ; i < kBufSize ; ++i) {
       dst_ref_[i] = rng_.Rand8();
@@ -164,13 +166,13 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(65);
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
 
     Common();
   }
 }
 
-TEST_P(BlendMask6Test8B, ExtremeValues) {
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
     for (int i = 0 ; i < kBufSize ; ++i) {
       dst_ref_[i] = rng_(2) + 254;
@@ -180,7 +182,7 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(2) + 63;
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
 
     Common();
   }
@@ -188,8 +190,9 @@
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
-  SSE4_1_C_COMPARE, BlendMask6Test8B,
-  ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+  SSE4_1_C_COMPARE, BlendA64MaskTest8B,
+  ::testing::Values(make_tuple(vpx_blend_a64_mask_c,
+                               vpx_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -198,14 +201,14 @@
 //////////////////////////////////////////////////////////////////////////////
 
 typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
-                       uint8_t *src0, uint32_t src0_stride,
-                       uint8_t *src1, uint32_t src1_stride,
-                       const uint8_t *mask, uint32_t mask_stride,
-                       int h, int w, int suby, int subx, int bd);
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, uint32_t mask_stride,
+                     int h, int w, int suby, int subx, int bd);
 
-class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
  protected:
-  void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
     ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
               CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
               CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
@@ -223,7 +226,7 @@
   int bit_depth_;
 };
 
-TEST_P(BlendMask6TestHBD, RandomValues) {
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
   for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
     switch (rng_(3)) {
     case 0:
@@ -247,13 +250,13 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(65);
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
 
     Common();
   }
 }
 
-TEST_P(BlendMask6TestHBD, ExtremeValues) {
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
   for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
     switch (rng_(3)) {
     case 0:
@@ -278,7 +281,7 @@
     }
 
     for (int i = 0 ; i < kMaxMaskSize ; ++i)
-      mask_[i] = rng_(65);
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
 
     Common();
   }
@@ -286,9 +289,9 @@
 
 #if HAVE_SSE4_1
 INSTANTIATE_TEST_CASE_P(
-  SSE4_1_C_COMPARE, BlendMask6TestHBD,
-  ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
-                               &vpx_highbd_blend_mask6_sse4_1)));
+  SSE4_1_C_COMPARE, BlendA64MaskTestHBD,
+  ::testing::Values(make_tuple(vpx_highbd_blend_a64_mask_c,
+                               vpx_highbd_blend_a64_mask_sse4_1)));
 #endif  // HAVE_SSE4_1
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace

diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index cd0b136..753a7e4 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc

@@ -188,8 +188,8 @@
 
   const unsigned int w_y = img1->d_w;
   const unsigned int h_y = img1->d_h;
-  const unsigned int w_uv = ROUNDZ_POWER_OF_TWO(w_y, img1->x_chroma_shift);
-  const unsigned int h_uv = ROUNDZ_POWER_OF_TWO(h_y, img1->y_chroma_shift);
+  const unsigned int w_uv = ROUND_POWER_OF_TWO(w_y, img1->x_chroma_shift);
+  const unsigned int h_uv = ROUND_POWER_OF_TWO(h_y, img1->y_chroma_shift);
 
   if (img1->fmt != img2->fmt
       || img1->cs != img2->cs

diff --git a/test/test.mk b/test/test.mk
index fcd565c..67fe705 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -178,11 +178,12 @@
 
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_1d_test.cc
 
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
 endif
 

diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
index 7999087..66f267f 100644
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc

@@ -197,6 +197,7 @@
 
 using std::tr1::make_tuple;
 
+#if HAVE_SSSE3 && CONFIG_EXT_INTERP
 const BlockDimension kBlockDim[] = {
   make_tuple(2, 2),
   make_tuple(2, 4),
@@ -218,7 +219,6 @@
   make_tuple(128, 128),
 };
 
-#if HAVE_SSSE3 && CONFIG_EXT_INTERP
 // 10/12-tap filters
 const INTERP_FILTER kFilter[] = {6, 4, 2};
 

diff --git a/test/vp10_wedge_utils_test.cc b/test/vp10_wedge_utils_test.cc
index 930a598..7a541b2 100644
--- a/test/vp10_wedge_utils_test.cc
+++ b/test/vp10_wedge_utils_test.cc

@@ -104,7 +104,7 @@
       p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
     }
 
-    vpx_blend_mask6(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+    vpx_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
 
     vpx_subtract_block(h, w, r0, w, s, w, p0, w);
     vpx_subtract_block(h, w, r1, w, s, w, p1, w);

diff --git a/vp10/common/quant_common.c b/vp10/common/quant_common.c
index 3093ab7..a1ce23e 100644
--- a/vp10/common/quant_common.c
+++ b/vp10/common/quant_common.c

@@ -42,55 +42,55 @@
 static const qprofile_type nuq[QUANT_PROFILES][QUANT_RANGES][COEF_BANDS] = {
   {
     {
-      {{91, 133, 139}, 11},  // dc, band 0
-      {{78, 122, 134}, 12},  // band 1
-      {{83, 127, 139}, 22},  // band 2
-      {{84, 117, 128}, 18},  // band 3
-      {{88, 117, 129}, 20},  // band 4
-      {{93, 122, 134}, 21}   // band 5
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
     }, {
-      {{91, 133, 139}, 11},  // dc, band 0
-      {{78, 122, 134}, 12},  // band 1
-      {{83, 127, 139}, 22},  // band 2
-      {{84, 117, 128}, 18},  // band 3
-      {{88, 117, 129}, 20},  // band 4
-      {{93, 122, 134}, 21}   // band 5
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
     }
   },
 #if QUANT_PROFILES > 1
   {
     {
-      {{86, 122, 134},  6},  // dc, band 0
-      {{78, 122, 134}, 15},  // band 1
-      {{78, 122, 134}, 17},  // band 2
-      {{84, 122, 134}, 22},  // band 3
-      {{88, 122, 134}, 23},  // band 4
-      {{88, 122, 134}, 23}   // band 5
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
     }, {
-      {{86, 122, 134},  6},  // dc, band 0
-      {{78, 122, 134}, 15},  // band 1
-      {{78, 122, 134}, 17},  // band 2
-      {{84, 122, 134}, 22},  // band 3
-      {{88, 122, 134}, 23},  // band 4
-      {{88, 122, 134}, 23}   // band 5
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
     }
   },
 #if QUANT_PROFILES > 2
   {
     {
-      {{86, 122, 134},  6},  // dc, band 0
-      {{78, 122, 135}, 14},  // band 1
-      {{78, 122, 134}, 16},  // band 2
-      {{84, 122, 133}, 22},  // band 3
-      {{88, 122, 134}, 23},  // band 4
-      {{88, 122, 134}, 27},  // band 5
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0},  // band 5
     }, {
-      {{86, 122, 134},  6},  // dc, band 0
-      {{78, 122, 135}, 14},  // band 1
-      {{78, 122, 134}, 16},  // band 2
-      {{84, 122, 133}, 22},  // band 3
-      {{88, 122, 134}, 23},  // band 4
-      {{88, 122, 134}, 27},  // band 5
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0},  // band 5
     }
   }
 #endif  // QUANT_PROFILES > 2

diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 70cf5e7..53fd1a6 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c

@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/blend.h"
 
 #include "vp10/common/blockd.h"
 #include "vp10/common/reconinter.h"
@@ -448,8 +449,8 @@
 #if CONFIG_SUPERTX
 static void build_masked_compound_wedge_extend(
     uint8_t *dst, int dst_stride,
-    uint8_t *src0, int src0_stride,
-    uint8_t *src1, int src1_stride,
+    const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
     int wedge_index,
     int wedge_sign,
     BLOCK_SIZE sb_type,
@@ -459,18 +460,18 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
      wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  vpx_blend_mask6(dst, dst_stride,
-                  src0, src0_stride,
-                  src1, src1_stride,
-                  mask, MASK_MASTER_STRIDE,
-                  h, w, subh, subw);
+  vpx_blend_a64_mask(dst, dst_stride,
+                     src0, src0_stride,
+                     src1, src1_stride,
+                     mask, MASK_MASTER_STRIDE,
+                     h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void build_masked_compound_wedge_extend_highbd(
     uint8_t *dst_8, int dst_stride,
-    uint8_t *src0_8, int src0_stride,
-    uint8_t *src1_8, int src1_stride,
+    const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
     int wedge_index, int wedge_sign,
     BLOCK_SIZE sb_type,
     int wedge_offset_x, int wedge_offset_y,
@@ -479,52 +480,54 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
       wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  vpx_highbd_blend_mask6(dst_8, dst_stride,
-                         src0_8, src0_stride,
-                         src1_8, src1_stride,
-                         mask, MASK_MASTER_STRIDE,
-                         h, w, subh, subw, bd);
+  vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+                            src0_8, src0_stride,
+                            src1_8, src1_stride,
+                            mask, MASK_MASTER_STRIDE,
+                            h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_SUPERTX
 
-static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
-                                        uint8_t *src0, int src0_stride,
-                                        uint8_t *src1, int src1_stride,
-                                        int wedge_index, int wedge_sign,
-                                        BLOCK_SIZE sb_type,
-                                        int h, int w) {
+static void build_masked_compound_wedge(
+    uint8_t *dst, int dst_stride,
+    const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int h, int w) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
                                                       sb_type);
-  vpx_blend_mask6(dst, dst_stride,
-                  src0, src0_stride,
-                  src1, src1_stride,
-                  mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
-                  h, w, subh, subw);
+  vpx_blend_a64_mask(dst, dst_stride,
+                     src0, src0_stride,
+                     src1, src1_stride,
+                     mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                     h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride,
-                                               uint8_t *src0_8, int src0_stride,
-                                               uint8_t *src1_8, int src1_stride,
-                                               int wedge_index, int wedge_sign,
-                                               BLOCK_SIZE sb_type,
-                                               int h, int w, int bd) {
+static void build_masked_compound_wedge_highbd(
+    uint8_t *dst_8, int dst_stride,
+    const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int h, int w, int bd) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
                                                       sb_type);
-  vpx_highbd_blend_mask6(dst_8, dst_stride,
-                         src0_8, src0_stride,
-                         src1_8, src1_stride,
-                         mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
-                         h, w, subh, subw, bd);
+  vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+                            src0_8, src0_stride,
+                            src1_8, src1_stride,
+                            mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                            h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -1064,177 +1067,123 @@
   28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
 };
 
-static void generate_1dmask(int length, uint8_t *mask, int plane) {
+static const uint8_t* get_supertx_mask(int length, int plane) {
   switch (length) {
     case 8:
-      memcpy(mask, plane ? mask_8_uv : mask_8, length);
-      break;
+      return plane ? mask_8_uv : mask_8;
     case 16:
-      memcpy(mask, plane ? mask_16_uv : mask_16, length);
-      break;
+      return plane ? mask_16_uv : mask_16;
     case 32:
-      memcpy(mask, plane ? mask_32_uv : mask_32, length);
-      break;
+      return plane ? mask_32_uv : mask_32;
     default:
       assert(0);
   }
+  return NULL;
 }
 
 void vp10_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
-    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
     int mi_row, int mi_col,
     int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
     PARTITION_TYPE partition, int plane) {
-  int i, j;
   const struct macroblockd_plane *pd = &xd->plane[plane];
-  uint8_t mask[MAX_TX_SIZE];
-  int top_w = 4 << b_width_log2_lookup[top_bsize];
-  int top_h = 4 << b_height_log2_lookup[top_bsize];
-  int w = 4 << b_width_log2_lookup[bsize];
-  int h = 4 << b_height_log2_lookup[bsize];
-  int w_offset = (mi_col - mi_col_ori) * MI_SIZE;
-  int h_offset = (mi_row - mi_row_ori) * MI_SIZE;
+  const int ssx = pd->subsampling_x;
+  const int ssy = pd->subsampling_y;
+  const int top_w = (4 << b_width_log2_lookup[top_bsize]) >> ssx;
+  const int top_h = (4 << b_height_log2_lookup[top_bsize]) >> ssy;
+  const int w = (4 << b_width_log2_lookup[bsize]) >> ssx;
+  const int h = (4 << b_height_log2_lookup[bsize]) >> ssy;
+  const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+  const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+  int w_remain, h_remain;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
-  uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
-  int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+  const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   assert(bsize <= BLOCK_32X32);
-
-  top_w >>= pd->subsampling_x;
-  top_h >>= pd->subsampling_y;
-  w >>= pd->subsampling_x;
-  h >>= pd->subsampling_y;
-  w_offset >>= pd->subsampling_x;
-  h_offset >>= pd->subsampling_y;
+  assert(IMPLIES(plane == 0, ssx == 0));
+  assert(IMPLIES(plane == 0, ssy == 0));
 
   switch (partition) {
-    case PARTITION_HORZ:
-    {
+    case PARTITION_HORZ: {
+      const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+      w_remain = top_w;
+      h_remain = top_h - h_offset - h;
+      dst += h_offset * dst_stride;
+      pre += h_offset * pre_stride;
+
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (b_hdb) {
-        uint16_t *dst_tmp = dst16 + h_offset * dst_stride;
-        uint16_t *dst2_tmp = dst216 + h_offset * dst2_stride;
-        generate_1dmask(h, mask + h_offset,
-                        plane && xd->plane[plane].subsampling_y);
-
-        for (i = h_offset; i < h_offset + h; i++) {
-          for (j = 0; j < top_w; j++) {
-            const int m = mask[i];  assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-
-        for (; i < top_h; i ++) {
-          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint16_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-      } else {
+      if (is_hdb)
+        vpx_highbd_blend_a64_vmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, h, top_w, xd->bd);
+      else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-        uint8_t *dst_tmp = dst + h_offset * dst_stride;
-        uint8_t *dst2_tmp = dst2 + h_offset * dst2_stride;
-        generate_1dmask(h, mask + h_offset,
-                        plane && xd->plane[plane].subsampling_y);
+        vpx_blend_a64_vmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, h, top_w);
 
-        for (i = h_offset; i < h_offset + h; i++) {
-          for (j = 0; j < top_w; j++) {
-            const int m = mask[i];  assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-
-        for (; i < top_h; i ++) {
-          memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint8_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-    }
-
+      dst += h * dst_stride;
+      pre += h * pre_stride;
       break;
-    case PARTITION_VERT:
-    {
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (b_hdb) {
-        uint16_t *dst_tmp = dst16;
-        uint16_t *dst2_tmp = dst216;
-        generate_1dmask(w, mask + w_offset,
-                        plane && xd->plane[plane].subsampling_x);
-
-        for (i = 0; i < top_h; i++) {
-          for (j = w_offset; j < w_offset + w; j++) {
-            const int m = mask[j];   assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-          memcpy(dst_tmp + j, dst2_tmp + j,
-                     (top_w - w_offset - w) * sizeof(uint16_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-      } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        uint8_t *dst_tmp = dst;
-        uint8_t *dst2_tmp = dst2;
-        generate_1dmask(w, mask + w_offset,
-                        plane && xd->plane[plane].subsampling_x);
-
-        for (i = 0; i < top_h; i++) {
-          for (j = w_offset; j < w_offset + w; j++) {
-            const int m = mask[j];   assert(m >= 0 && m <= 64);
-            if (m == 64)
-              continue;
-
-            if (m == 0)
-              dst_tmp[j] = dst2_tmp[j];
-            else
-              dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
-                                              dst2_tmp[j] * (64 - m), 6);
-          }
-            memcpy(dst_tmp + j, dst2_tmp + j,
-                       (top_w - w_offset - w) * sizeof(uint8_t));
-          dst_tmp += dst_stride;
-          dst2_tmp += dst2_stride;
-        }
-#if CONFIG_VP9_HIGHBITDEPTH
-      }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
+    case PARTITION_VERT: {
+      const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+      w_remain = top_w - w_offset - w;
+      h_remain = top_h;
+      dst += w_offset;
+      pre += w_offset;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (is_hdb)
+        vpx_highbd_blend_a64_hmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, top_h, w, xd->bd);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        vpx_blend_a64_hmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, top_h, w);
+
+      dst += w;
+      pre += w;
       break;
-    default:
+    }
+    default: {
       assert(0);
+      return;
+    }
   }
-  (void) xd;
+
+  if (w_remain == 0 || h_remain == 0) {
+    return;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (is_hdb) {
+    dst = (uint8_t*)CONVERT_TO_SHORTPTR(dst);
+    pre = (const uint8_t*)CONVERT_TO_SHORTPTR(pre);
+    dst_stride *= 2;
+    pre_stride *= 2;
+    w_remain *= 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  do {
+    memcpy(dst, pre, w_remain * sizeof(uint8_t));
+    dst += dst_stride;
+    pre += pre_stride;
+  } while (--h_remain);
 }
 
 void vp10_build_inter_predictors_sb_sub8x8_extend(
@@ -1878,12 +1827,10 @@
                                BLOCK_SIZE plane_bsize,
                                uint8_t *comppred,
                                int compstride,
-                               uint8_t *interpred,
+                               const uint8_t *interpred,
                                int interstride,
-                               uint8_t *intrapred,
+                               const uint8_t *intrapred,
                                int intrastride) {
-  const int scale_bits = 8;
-  const int scale_max = (1 << scale_bits);
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   const int size_scale = ii_size_scales[plane_bsize];
@@ -1896,11 +1843,11 @@
                                                           bsize);
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      vpx_blend_mask6(comppred, compstride,
-                      intrapred, intrastride,
-                      interpred, interstride,
-                      mask, 4 * num_4x4_blocks_wide_lookup[bsize],
-                      bh, bw, subh, subw);
+      vpx_blend_a64_mask(comppred, compstride,
+                         intrapred, intrastride,
+                         interpred, interstride,
+                         mask, 4 * num_4x4_blocks_wide_lookup[bsize],
+                         bh, bw, subh, subw);
     }
     return;
   }
@@ -1911,10 +1858,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[i * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1924,10 +1870,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[j * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1939,10 +1884,9 @@
           int scale = (ii_weights1d[i * size_scale] * 3 +
                        ii_weights1d[j * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1954,10 +1898,9 @@
           int scale = (ii_weights1d[j * size_scale] * 3 +
                        ii_weights1d[i * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1967,10 +1910,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[(i < j ? i : j) * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1981,10 +1923,9 @@
           int scale = (ii_weights1d[i * size_scale] +
                        ii_weights1d[j * size_scale]) >> 1;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -1995,10 +1936,8 @@
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  interpred[i * interstride + j] +
-                  intrapred[i * intrastride + j],
-                  1);
+              VPX_BLEND_AVG(intrapred[i * intrastride + j],
+                            interpred[i * interstride + j]);
         }
       }
       break;
@@ -2014,20 +1953,18 @@
                                       BLOCK_SIZE plane_bsize,
                                       uint8_t *comppred8,
                                       int compstride,
-                                      uint8_t *interpred8,
+                                      const uint8_t *interpred8,
                                       int interstride,
-                                      uint8_t *intrapred8,
+                                      const uint8_t *intrapred8,
                                       int intrastride, int bd) {
-  const int scale_bits = 8;
-  const int scale_max = (1 << scale_bits);
   const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   const int size_scale = ii_size_scales[plane_bsize];
   int i, j;
 
   uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
-  uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
-  uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+  const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+  const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
@@ -2036,11 +1973,11 @@
                                                           bsize);
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
-      vpx_highbd_blend_mask6(comppred8, compstride,
-                             intrapred8, intrastride,
-                             interpred8, interstride,
-                             mask, bw,
-                             bh, bw, subh, subw, bd);
+      vpx_highbd_blend_a64_mask(comppred8, compstride,
+                                intrapred8, intrastride,
+                                interpred8, interstride,
+                                mask, bw,
+                                bh, bw, subh, subw, bd);
     }
     return;
   }
@@ -2051,10 +1988,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[i * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2064,10 +2000,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[j * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2079,10 +2014,9 @@
           int scale = (ii_weights1d[i * size_scale] * 3 +
                        ii_weights1d[j * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2094,10 +2028,9 @@
           int scale = (ii_weights1d[j * size_scale] * 3 +
                        ii_weights1d[i * size_scale]) >> 2;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2107,10 +2040,9 @@
         for (j = 0; j < bw; ++j) {
           int scale = ii_weights1d[(i < j ? i : j) * size_scale];
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2121,10 +2053,9 @@
           int scale = (ii_weights1d[i * size_scale] +
                        ii_weights1d[j * size_scale]) >> 1;
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  (scale_max - scale) * interpred[i * interstride + j] +
-                  scale * intrapred[i * intrastride + j],
-                  scale_bits);
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
         }
       }
       break;
@@ -2135,10 +2066,8 @@
       for (i = 0; i < bh; ++i) {
         for (j = 0; j < bw; ++j) {
           comppred[i * compstride + j] =
-              ROUND_POWER_OF_TWO(
-                  interpred[i * interstride + j] +
-                  intrapred[i * intrastride + j],
-                  1);
+              VPX_BLEND_AVG(interpred[i * interstride + j],
+                            intrapred[i * intrastride + j]);
         }
       }
       break;
@@ -2239,8 +2168,8 @@
 
 void vp10_combine_interintra(MACROBLOCKD *xd,
                              BLOCK_SIZE bsize, int plane,
-                             uint8_t *inter_pred, int inter_stride,
-                             uint8_t *intra_pred, int intra_stride) {
+                             const uint8_t *inter_pred, int inter_stride,
+                             const uint8_t *intra_pred, int intra_stride) {
   const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {

diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 4ede3e9..ac4a004 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h

@@ -396,7 +396,8 @@
 struct macroblockd_plane;
 void vp10_build_masked_inter_predictor_complex(
     MACROBLOCKD *xd,
-    uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
     int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
     BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
     PARTITION_TYPE partition, int plane);
@@ -631,8 +632,8 @@
 void vp10_combine_interintra(
     MACROBLOCKD *xd,
     BLOCK_SIZE bsize, int plane,
-    uint8_t *inter_pred, int inter_stride,
-    uint8_t *intra_pred, int intra_stride);
+    const uint8_t *inter_pred, int inter_stride,
+    const uint8_t *intra_pred, int intra_stride);
 void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
                                            uint8_t *upred,
                                            uint8_t *vpred,

diff --git a/vp10/common/warped_motion.c b/vp10/common/warped_motion.c
index 4990bb3..3b924ea 100644
--- a/vp10/common/warped_motion.c
+++ b/vp10/common/warped_motion.c

@@ -85,19 +85,19 @@
   for (i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((x << (WARPEDMODEL_PREC_BITS + 1)) + mat[0]),
           WARPEDPIXEL_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((x << WARPEDMODEL_PREC_BITS)) + mat[0],
           WARPEDPIXEL_PREC_BITS);
     if (subsampling_y)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((y << (WARPEDMODEL_PREC_BITS + 1)) + mat[1]),
           WARPEDPIXEL_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           ((y << WARPEDMODEL_PREC_BITS)) + mat[1],
           WARPEDPIXEL_PREC_BITS);
     points += stride_points - 2;
@@ -115,21 +115,21 @@
   for (i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           mat[0] * 2 * x + mat[1] * 2 * y + mat[2] +
           (mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
+                                            WARPEDDIFF_PREC_BITS);
     if (subsampling_y)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           -mat[1] * 2 * x + mat[0] * 2 * y + mat[3] +
           (-mat[1] + mat[0] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
+                                            WARPEDDIFF_PREC_BITS);
     points += stride_points - 2;
     proj += stride_proj - 2;
   }
@@ -145,21 +145,21 @@
   for (i = 0; i < n; ++i) {
     const int x = *(points++), y = *(points++);
     if (subsampling_x)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           mat[0] * 2 * x + mat[1] * 2 * y + mat[4] +
           (mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
+                                            WARPEDDIFF_PREC_BITS);
     if (subsampling_y)
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
           mat[2] * 2 * x + mat[3] * 2 * y + mat[5] +
           (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
           WARPEDDIFF_PREC_BITS + 1);
     else
-      *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
-                                             WARPEDDIFF_PREC_BITS);
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
+                                            WARPEDDIFF_PREC_BITS);
     points += stride_points - 2;
     proj += stride_proj - 2;
   }
@@ -357,7 +357,7 @@
     const int64_t v2 = x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
     const int64_t v3 = x * (p[1] - p[-1]);
     const int64_t v4 = 2 * p[0];
-    return (int32_t)ROUNDZ_POWER_OF_TWO_SIGNED(
+    return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
         (v4 << (3 * WARPEDPIXEL_PREC_BITS)) +
         (v3 << (2 * WARPEDPIXEL_PREC_BITS)) +
         (v2 << WARPEDPIXEL_PREC_BITS) + v1,

diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index cabfc40..389e40b 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c

@@ -965,7 +965,12 @@
     }
 #else
     if (!vp10_is_interp_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+      assert(mbmi->interp_filter[0] == EIGHTTAP_REGULAR);
+      assert(mbmi->interp_filter[1] == EIGHTTAP_REGULAR);
+#else
       assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
+#endif
       return;
     }
 #endif  // CONFIG_DUAL_FILTER

diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 3307393..3810be5 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c

@@ -160,7 +160,7 @@
     next_shortcut = shortcut;
 
     /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
+    if (UNLIKELY(x)) {
       error0 = tokens[next][0].error;
       error1 = tokens[next][1].error;
       /* Evaluate the first possibility for this state. */
@@ -204,7 +204,7 @@
       rate1 = tokens[next][1].rate;
 
       // The threshold of 3 is empirically obtained.
-      if (abs(x) > 3) {
+      if (UNLIKELY(abs(x) > 3)) {
         shortcut = 0;
       } else {
 #if CONFIG_NEW_QUANT
@@ -233,7 +233,7 @@
         best_index[i][1] = best_index[i][0];
         next = i;
 
-        if (!(--band_left)) {
+        if (UNLIKELY(!(--band_left))) {
           --band_counts;
           band_left = *band_counts;
           --token_costs;
@@ -255,7 +255,7 @@
       }
 
       if (next_shortcut) {
-        if (next < default_eob) {
+        if (LIKELY(next < default_eob)) {
           if (t0 != EOB_TOKEN) {
             token_cache[rc] = vp10_pt_energy_class[t0];
             pt = get_coef_context(nb, token_cache, i + 1);
@@ -350,7 +350,7 @@
       /* Don't update next, because we didn't add a new node. */
     }
 
-    if (!(--band_left)) {
+    if (UNLIKELY(!(--band_left))) {
       --band_counts;
       band_left = *band_counts;
       --token_costs;

diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 7cc65e6..c64d57e 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c

@@ -44,6 +44,24 @@
 #include "vp10/encoder/rdopt.h"
 #include "vp10/encoder/aq_variance.h"
 
+#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+static const int filter_sets[25][2] = {
+    {0, 0}, {0, 1}, {0, 2}, {0, 3}, {0, 4},
+    {1, 0}, {1, 1}, {1, 2}, {1, 3}, {1, 4},
+    {2, 0}, {2, 1}, {2, 2}, {2, 3}, {2, 4},
+    {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 4},
+    {4, 0}, {4, 1}, {4, 2}, {4, 3}, {4, 4},
+};
+#else
+static const int filter_sets[9][2] = {
+    {0, 0}, {0, 1}, {0, 2},
+    {1, 0}, {1, 1}, {1, 2},
+    {2, 0}, {2, 1}, {2, 2},
+};
+#endif
+#endif
+
 #if CONFIG_EXT_REFS
 
 #define LAST_FRAME_MODE_MASK    ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | \
@@ -403,18 +421,6 @@
   {{INTRA_FRAME,  NONE}},
 };
 
-#if CONFIG_DUAL_FILTER
-// TODO(jingning): The magic number 9 here really means the combination
-// of prediction filter types for vertical and horizontal directions.
-// It will be replaced after we integrate the dual filter experiment with
-// the ext-interp experiment.
-static int filter_sets[9][2] = {
-    {0, 0}, {0, 1}, {0, 2},
-    {1, 0}, {1, 1}, {1, 2},
-    {2, 0}, {2, 1}, {2, 2},
-};
-#endif
-
 static INLINE int write_uniform_cost(int n, int v) {
   int l = get_unsigned_bits(n), m = (1 << l) - n;
   if (l == 0)
@@ -1211,7 +1217,7 @@
       sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-        sse = ROUNDZ_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+        sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       sse = (int64_t)sse * 16;
 
@@ -3021,7 +3027,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
-    tmp = ROUNDZ_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   *bsse += tmp * 16;
 
@@ -5357,7 +5363,7 @@
             this_mode == NEWMV &&
 #endif  // CONFIG_EXT_INTER
 #if CONFIG_DUAL_FILTER
-            1) {
+            (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) {
 #else
             (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) {
 #endif
@@ -6658,7 +6664,7 @@
 
     mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
-    sse = ROUNDZ_POWER_OF_TWO(sse, bd_round);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
     rd =  RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -6720,7 +6726,7 @@
   for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
     mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
     sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
-    sse = ROUNDZ_POWER_OF_TWO(sse, bd_round);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
 
     model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
     rd =  RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -7181,7 +7187,11 @@
     int64_t tmp_dist_sum = 0;
 
 #if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+    for (i = 0; i < 25; ++i) {
+#else
     for (i = 0; i < 9; ++i) {
+#endif
 #else
     for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
 #endif
@@ -7899,6 +7909,10 @@
       *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
 
       *distortion = skip_sse_sb;
+      *psse = skip_sse_sb;
+      *rate_y = 0;
+      *rate_uv = 0;
+      *skippable = 1;
     }
 
 #if CONFIG_OBMC || CONFIG_WARPED_MOTION
@@ -10479,7 +10493,11 @@
       b_mode_info tmp_best_bmodes[16];  // Should this be 4 ?
       MB_MODE_INFO tmp_best_mbmode;
 #if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+      BEST_SEG_INFO bsi[25];
+#else
       BEST_SEG_INFO bsi[9];
+#endif
 #else
       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
 #endif
@@ -10546,7 +10564,11 @@
         } else {
 #if CONFIG_DUAL_FILTER
           for (switchable_filter_index = 0;
+#if CONFIG_EXT_INTERP
+               switchable_filter_index < 25;
+#else
                switchable_filter_index < 9;
+#endif
                ++switchable_filter_index) {
 #else
           for (switchable_filter_index = 0;
@@ -10578,7 +10600,8 @@
 #if CONFIG_EXT_INTERP
 #if CONFIG_DUAL_FILTER
             if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
-                mbmi->interp_filter[0] != EIGHTTAP_REGULAR)  // invalid config
+                (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+                 mbmi->interp_filter[1] != EIGHTTAP_REGULAR))  // invalid config
               continue;
 #else
             if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
@@ -10664,9 +10687,11 @@
 #if CONFIG_EXT_INTERP
 #if CONFIG_DUAL_FILTER
         if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
-            mbmi->interp_filter[0] != EIGHTTAP_REGULAR)
-          for (i = 0; i < 4; ++i)
-            mbmi->interp_filter[i] = EIGHTTAP_REGULAR;
+            (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+             mbmi->interp_filter[1] != EIGHTTAP_REGULAR)) {
+          mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+          mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+        }
 #else
         if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
             mbmi->interp_filter != EIGHTTAP_REGULAR)

diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h
new file mode 100644
index 0000000..109183a
--- /dev/null
+++ b/vpx_dsp/blend.h

@@ -0,0 +1,40 @@
+/*
+*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VPX_DSP_BLEND_H_
+#define VPX_DSP_BLEND_H_
+
+#include "vpx_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the vpx_blend_* functions in vpx_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A64_ROUND_BITS  6
+#define VPX_BLEND_A64_MAX_ALPHA   (1 << VPX_BLEND_A64_ROUND_BITS)   // 64
+
+#define VPX_BLEND_A64(a, v0, v1)                                              \
+  ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1),     \
+                     VPX_BLEND_A64_ROUND_BITS)
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A256_ROUND_BITS 8
+#define VPX_BLEND_A256_MAX_ALPHA  (1 << VPX_BLEND_A256_ROUND_BITS)  // 256
+
+#define VPX_BLEND_A256(a, v0, v1)                                             \
+  ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1),    \
+                     VPX_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define VPX_BLEND_AVG(v0, v1)   ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+#endif  // VPX_DSP_BLEND_H_

diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c
new file mode 100644
index 0000000..90f3415
--- /dev/null
+++ b/vpx_dsp/blend_a64_hmask.c

@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_hmask_c(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_c(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c
new file mode 100644
index 0000000..1649798
--- /dev/null
+++ b/vpx_dsp/blend_a64_mask.c

@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for VPX_BLEND_A64 in vpx_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+                          const uint8_t *src0, uint32_t src0_stride,
+                          const uint8_t *src1, uint32_t src1_stride,
+                          const uint8_t *mask, uint32_t mask_stride,
+                          int h, int w, int subh, int subw) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = mask[i * mask_stride + j];
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+                                    mask[i * mask_stride + (2 * j + 1)]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+                                    mask[(2 * i + 1) * mask_stride + j]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                 const uint8_t *src0_8, uint32_t src0_stride,
+                                 const uint8_t *src1_8, uint32_t src1_stride,
+                                 const uint8_t *mask, uint32_t mask_stride,
+                                 int h, int w, int subh, int subw, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = mask[i * mask_stride + j];
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+                                    mask[i * mask_stride + (2 * j + 1)]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+                                    mask[(2 * i + 1) * mask_stride + j]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c
new file mode 100644
index 0000000..5d48a83
--- /dev/null
+++ b/vpx_dsp/blend_a64_vmask.c

@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_vmask_c(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    const int m = mask[i];
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_vmask_c(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    const int m = mask[i];
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c
deleted file mode 100644
index 584ee6a..0000000
--- a/vpx_dsp/blend_mask6.c
+++ /dev/null

@@ -1,152 +0,0 @@
-/*
- *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-#include "./vpx_dsp_rtcd.h"
-
-#define MASK_BITS 6
-
-void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
-                       uint8_t *src0, uint32_t src0_stride,
-                       uint8_t *src1, uint32_t src1_stride,
-                       const uint8_t *mask, uint32_t mask_stride,
-                       int h, int w, int subh, int subw) {
-  int i, j;
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 = mask[i * mask_stride + j];
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-                               2);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
-                               mask[i * mask_stride + (2 * j + 1)], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
-                               mask[(2 * i + 1) * mask_stride + j], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
-                              uint8_t *src0_8, uint32_t src0_stride,
-                              uint8_t *src1_8, uint32_t src1_stride,
-                              const uint8_t *mask, uint32_t mask_stride,
-                              int h, int w, int subh, int subw, int bd) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
-  uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
-  assert(h >= 4);
-  assert(w >= 4);
-  assert(IS_POWER_OF_TWO(h));
-  assert(IS_POWER_OF_TWO(w));
-
-  assert(bd == 8 || bd == 10 || bd == 12);
-
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 = mask[i * mask_stride + j];
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
-                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
-                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
-                               2);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
-                               mask[i * mask_stride + (2 * j + 1)], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        const int m0 =
-            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
-                               mask[(2 * i + 1) * mask_stride + j], 1);
-        const int m1 = ((1 << MASK_BITS) - m0);
-        dst[i * dst_stride + j] =
-            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
-                               src1[i * src1_stride + j] * m1, MASK_BITS);
-      }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 3c519b6..3eb7a9f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk

@@ -70,10 +70,14 @@
 # inter predictions
 
 ifeq ($(CONFIG_VP10),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
-DSP_SRCS-yes            += blend_mask6.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
-endif  #CONFIG_EXT_INTER
+DSP_SRCS-yes            += blend.h
+DSP_SRCS-yes            += blend_a64_mask.c
+DSP_SRCS-yes            += blend_a64_hmask.c
+DSP_SRCS-yes            += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
 endif  #CONFIG_VP10
 
 # interpolation filters

diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e630994..02c8727 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl

@@ -959,6 +959,27 @@
   }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
+if (vpx_config("CONFIG_VP10") eq "yes") {
+  #
+  # Alpha blending with mask
+  #
+  add_proto qw/void vpx_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+  add_proto qw/void vpx_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+  add_proto qw/void vpx_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+  specialize "vpx_blend_a64_mask", qw/sse4_1/;
+  specialize "vpx_blend_a64_hmask", qw/sse4_1/;
+  specialize "vpx_blend_a64_vmask", qw/sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+    add_proto qw/void vpx_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+    add_proto qw/void vpx_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+    specialize "vpx_highbd_blend_a64_mask", qw/sse4_1/;
+    specialize "vpx_highbd_blend_a64_hmask", qw/sse4_1/;
+    specialize "vpx_highbd_blend_a64_vmask", qw/sse4_1/;
+  }
+}  # CONFIG_VP10
+
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
 # Block subtraction
@@ -1384,14 +1405,6 @@
       }
     }
   }
-
-  add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
-  specialize "vpx_blend_mask6", qw/sse4_1/;
-
-  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
-    specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
-  }
 }
 
 #

diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000..a10e077
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c

@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx/vpx_integer.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
+
+void vpx_blend_a64_hmask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  vpx_blend_a64_mask_sse4_1(dst, dst_stride,
+                            src0, src0_stride,
+                            src1, src1_stride,
+                            mask, 0, h, w, 0, 0);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w,
+    int bd) {
+  vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride,
+                                   src0_8, src0_stride,
+                                   src1_8, src1_stride,
+                                   mask, 0, h, w, 0, 0, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c
similarity index 61%
rename from vpx_dsp/x86/blend_mask6_sse4.c
rename to vpx_dsp/x86/blend_a64_mask_sse4.c
index 28693a4..cdb40c2 100644
--- a/vpx_dsp/x86/blend_mask6_sse4.c
+++ b/vpx_dsp/x86/blend_a64_mask_sse4.c

@@ -15,62 +15,24 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
 
 #include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
 
 #include "./vpx_dsp_rtcd.h"
 
-#define MASK_BITS 6
-
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_32(src0);
-  const __m128i v_s1_b = xx_loadl_32(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
-                              const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_b = xx_loadl_64(src0);
-  const __m128i v_s1_b = xx_loadl_64(src1);
-  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
-  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
 //////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_w4_sse4_1(
+static void blend_a64_mask_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -92,13 +54,13 @@
   } while (--h);
 }
 
-static void blend_mask6_w8_sse4_1(
+static void blend_a64_mask_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -120,13 +82,13 @@
   } while (--h);
 }
 
-static void blend_mask6_w16n_sse4_1(
+static void blend_a64_mask_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -158,15 +120,15 @@
 // Horizontal sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_sx_w4_sse4_1(
+static void blend_a64_mask_sx_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -190,15 +152,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_w8_sse4_1(
+static void blend_a64_mask_sx_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -222,15 +184,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_w16n_sse4_1(
+static void blend_a64_mask_sx_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -265,13 +227,13 @@
 // Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_sy_w4_sse4_1(
+static void blend_a64_mask_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -296,13 +258,13 @@
   } while (--h);
 }
 
-static void blend_mask6_sy_w8_sse4_1(
+static void blend_a64_mask_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -327,14 +289,14 @@
   } while (--h);
 }
 
-static void blend_mask6_sy_w16n_sse4_1(
+static void blend_a64_mask_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zero = _mm_setzero_si128();
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -368,15 +330,15 @@
 // Horizontal and Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static void blend_mask6_sx_sy_w4_sse4_1(
+static void blend_a64_mask_sx_sy_w4_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -405,15 +367,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_sy_w8_sse4_1(
+static void blend_a64_mask_sx_sy_w8_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   (void)w;
 
@@ -442,15 +404,15 @@
   } while (--h);
 }
 
-static void blend_mask6_sx_sy_w16n_sse4_1(
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
     uint8_t *dst, uint32_t dst_stride,
-    uint8_t *src0, uint32_t src0_stride,
-    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -495,146 +457,67 @@
 // Dispatch
 //////////////////////////////////////////////////////////////////////////////
 
-void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
-                            uint8_t *src0, uint32_t src0_stride,
-                            uint8_t *src1, uint32_t src1_stride,
-                            const uint8_t *mask, uint32_t mask_stride,
-                            int h, int w, int suby, int subx) {
+void vpx_blend_a64_mask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, int suby, int subx) {
   typedef  void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
-                            uint8_t *src0, uint32_t src0_stride,
-                            uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *src0, uint32_t src0_stride,
+                            const uint8_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride,
                             int h, int w);
 
-  static blend_fn blend[3][2][2] = {  // width_index X subx X suby
+  // Dimensions are: width_index X subx X suby
+  static const blend_fn blend[3][2][2] = {
     {     // w % 16 == 0
-      {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
-      {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+      {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1},
+      {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1}
     }, {  // w == 4
-      {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
-      {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+      {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1},
+      {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1}
     }, {  // w == 8
-      {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
-      {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+      {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1},
+      {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1}
     }
   };
 
   assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
   assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
 
-  assert(h >= 4);
-  assert(w >= 4);
+  assert(h >= 1);
+  assert(w >= 1);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
-  blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
-                                            src0, src0_stride,
-                                            src1, src1_stride,
-                                            mask, mask_stride,
-                                            h, w);
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_blend_a64_mask_c(dst, dst_stride,
+                         src0, src0_stride,
+                         src1, src1_stride,
+                         mask, mask_stride,
+                         h, w, suby, subx);
+  } else {
+    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+                                              src0, src0_stride,
+                                              src1, src1_stride,
+                                              mask, mask_stride,
+                                              h, w);
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 //////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1,
-                                 const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
-  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
-  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
-  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadl_64(src0);
-  const __m128i v_s1_w = xx_loadl_64(src1);
-
-  // Interleave
-  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
-  // Scale
-  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
-                                  const __m128i v_m0_w, const __m128i v_m1_w) {
-  const __m128i v_s0_w = xx_loadu_128(src0);
-  const __m128i v_s1_w = xx_loadu_128(src1);
-
-  // Interleave
-  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
-  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
-  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
-  // Multiply-Add
-  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
-  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
-  // Scale
-  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
-  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
-
-  // Pack
-  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
-  // Round
-  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
-  return v_res_w;
-}
-
-//////////////////////////////////////////////////////////////////////////////
 // No sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_m0_b = xx_loadl_32(mask);
@@ -652,37 +535,37 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_w4_sse4_1(
+static void blend_a64_mask_b10_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                           src1_stride, mask, mask_stride, h,
-                           blend_4_b10);
+  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
 }
 
-static void blend_mask6_b12_w4_sse4_1(
+static void blend_a64_mask_b12_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                           src1_stride, mask, mask_stride, h,
-                           blend_4_b12);
+  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
 }
 
-static inline void blend_mask6_bn_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -702,41 +585,41 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_w8n_sse4_1(
+static void blend_a64_mask_b10_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, mask_stride, h, w,
-                            blend_8_b10);
+  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
 }
 
-static void blend_mask6_b12_w8n_sse4_1(
+static void blend_a64_mask_b12_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                            src1_stride, mask, mask_stride, h, w,
-                            blend_8_b12);
+  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Horizontal sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_r_b = xx_loadl_64(mask);
@@ -756,39 +639,39 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_w4_sse4_1(
+static void blend_a64_mask_b10_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b10);
+  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
 }
 
-static void blend_mask6_b12_sx_w4_sse4_1(
+static void blend_a64_mask_b12_sx_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0,  src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b12);
+  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
 }
 
-static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -810,39 +693,39 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b10);
+  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
 }
 
-static void blend_mask6_b12_sx_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b12);
+  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_ra_b = xx_loadl_32(mask);
@@ -863,37 +746,37 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b10);
+  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
 }
 
-static void blend_mask6_b12_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                              src1_stride, mask, mask_stride, h,
-                              blend_4_b12);
+  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
 }
 
-static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -916,41 +799,41 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b10);
+  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
 }
 
-static void blend_mask6_b12_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                               src1_stride, mask, mask_stride, h, w,
-                               blend_8_b12);
+  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Horizontal and Vertical sub-sampling
 //////////////////////////////////////////////////////////////////////////////
 
-static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     const __m128i v_ra_b = xx_loadl_64(mask);
@@ -975,39 +858,39 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b10);
+  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h,
+                                    blend_4_b10);
 }
 
-static void blend_mask6_b12_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
   (void)w;
-  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                 src1_stride, mask, mask_stride, h,
-                                 blend_4_b12);
+  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h,
+                                    blend_4_b12);
 }
 
-static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w, blend_unit_fn blend) {
   const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
                                          0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
-  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
 
   do {
     int c;
@@ -1034,82 +917,91 @@
   } while (--h);
 }
 
-static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
-                                  blend_8_b10);
+  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, h, w,
+                                     blend_8_b10);
 }
 
-static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
     uint16_t *dst, uint32_t dst_stride,
-    uint16_t *src0, uint32_t src0_stride,
-    uint16_t *src1, uint32_t src1_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
     const uint8_t *mask, uint32_t mask_stride,
     int h, int w) {
-  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
-                                  src1_stride, mask, mask_stride, h, w,
-                                  blend_8_b12);
+  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, h, w,
+                                     blend_8_b12);
 }
 
 //////////////////////////////////////////////////////////////////////////////
 // Dispatch
 //////////////////////////////////////////////////////////////////////////////
 
-void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
-                                   uint8_t *src0_8, uint32_t src0_stride,
-                                   uint8_t *src1_8, uint32_t src1_stride,
-                                   const uint8_t *mask, uint32_t mask_stride,
-                                   int h, int w, int suby, int subx, int bd) {
-  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
-  uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
+void vpx_highbd_blend_a64_mask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, int suby, int subx, int bd) {
   typedef  void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
-                            uint16_t *src0, uint32_t src0_stride,
-                            uint16_t *src1, uint32_t src1_stride,
+                            const uint16_t *src0, uint32_t src0_stride,
+                            const uint16_t *src1, uint32_t src1_stride,
                             const uint8_t *mask, uint32_t mask_stride,
                             int h, int w);
 
-  static blend_fn blend[2][2][2][2] = {  // bd_index X width_index X subx X suby
+  // Dimensions are: bd_index X width_index X subx X suby
+  static const blend_fn blend[2][2][2][2] = {
     {   // bd == 8 or 10
       {     // w % 8 == 0
-        {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
-        {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+        {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1},
+        {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1}
       }, {  // w == 4
-        {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
-        {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+        {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1},
+        {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1}
       }
     },
     {   // bd == 12
       {     // w % 8 == 0
-        {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
-        {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+        {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1},
+        {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1}
       }, {  // w == 4
-        {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
-        {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+        {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1},
+        {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1}
       }
     }
   };
 
-  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
-  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
 
-  assert(h >= 4);
-  assert(w >= 4);
+  assert(h >= 1);
+  assert(w >= 1);
   assert(IS_POWER_OF_TWO(h));
   assert(IS_POWER_OF_TWO(w));
 
   assert(bd == 8 || bd == 10 || bd == 12);
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_highbd_blend_a64_mask_c(dst_8, dst_stride,
+                                src0_8, src0_stride,
+                                src1_8, src1_stride,
+                                mask, mask_stride,
+                                h, w, suby, subx, bd);
+  } else {
+    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
 
-  blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
-                                                      src0, src0_stride,
-                                                      src1, src1_stride,
-                                                      mask, mask_stride,
-                                                      h, w);
+    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+                                                        src0, src0_stride,
+                                                        src1, src1_stride,
+                                                        mask, mask_stride,
+                                                        h, w);
+  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000..4b0f38d
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c

@@ -0,0 +1,293 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+// Blends h rows of 4 8-bit pixels; each row uses a single mask[] entry.
+// The width is implied by the kernel, so w is deliberately unused.
+static void blend_a64_vmask_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+  (void)w;
+
+  do {
+    // Broadcast this row's src0 weight; src1 gets MAX_ALPHA minus it.
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+    // Narrow the 16-bit lanes to bytes with unsigned saturation.
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);  // requires h >= 1 (asserted by the dispatcher)
+}
+
+// Blends h rows of 8 8-bit pixels; each row uses a single mask[] entry.
+// The width is implied by the kernel, so w is deliberately unused.
+static void blend_a64_vmask_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+  (void)w;
+
+  do {
+    // Broadcast this row's src0 weight; src1 gets MAX_ALPHA minus it.
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+    // Narrow the 16-bit lanes to bytes with unsigned saturation.
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);  // requires h >= 1 (asserted by the dispatcher)
+}
+
+// Blends rows whose width w is a multiple of 16; one mask[] entry per row.
+static void blend_a64_vmask_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    for (c = 0; c < w; c += 16) {
+      // Blend two 8-pixel halves, then pack them into one 16-byte store.
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0_w, v_m1_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0_w, v_m1_w);
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+// 8-bit a64 vmask blend: dispatch on width to an SSE4.1 kernel or the C code.
+void vpx_blend_a64_vmask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  typedef  void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+                            const uint8_t *src0, uint32_t src0_stride,
+                            const uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, int h, int w);
+
+  // Dimension: width_index, computed below as (w & 0xf)
+  static const blend_fn blend[9] = {
+    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
+    vpx_blend_a64_vmask_c,        // w == 1
+    vpx_blend_a64_vmask_c,        // w == 2
+    NULL,                         // INVALID
+    blend_a64_vmask_w4_sse4_1,    // w == 4
+    NULL,                         // INVALID
+    NULL,                         // INVALID
+    NULL,                         // INVALID
+    blend_a64_vmask_w8_sse4_1,    // w == 8
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+  // A power-of-two w gives w & 0xf in {0, 1, 2, 4, 8}: never a NULL slot.
+  blend[w & 0xf](dst, dst_stride,
+                 src0, src0_stride,
+                 src1, src1_stride,
+                 mask, h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+// Shared 4-wide high-bitdepth row loop: one mask[] entry per row, with the
+// per-bit-depth arithmetic supplied through `blend`.
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  (void)w;  // width is implied (4) by this kernel
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, h,
+                               blend_4_b10);  // 16-bit multiply kernel
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  (void)w;  // width is implied (4) by this kernel
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, h,
+                               blend_4_b12);  // 32-bit madd kernel
+}
+
+// Shared high-bitdepth row loop for widths that are a multiple of 8; the
+// per-bit-depth arithmetic is supplied through `blend`.
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+  do {
+    int c;
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    for (c = 0; c < w; c += 8) {  // 8 x 16-bit pixels per step
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, h, w,
+                                blend_8_b10);  // 16-bit multiply kernel
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, h, w,
+                                blend_8_b12);  // 32-bit madd kernel
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+// High-bitdepth a64 vmask blend: picks an SSE4.1 kernel by bit depth and
+// width, falling back to the C code when either dimension is 1 or 2.
+void vpx_highbd_blend_a64_vmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  typedef  void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+                            const uint16_t *src0, uint32_t src0_stride,
+                            const uint16_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, int h, int w);
+
+  // Dimensions are: bd_index X width_index
+  static const blend_fn blend[2][2] = {
+    {     // bd == 8 or 10
+      blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
+      blend_a64_vmask_b10_w4_sse4_1,   // w == 4
+    }, {  // bd == 12
+      blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
+      blend_a64_vmask_b12_w4_sse4_1,   // w == 4
+    }
+  };
+
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride,
+                                 src0_8, src0_stride,
+                                 src1_8, src1_stride,
+                                 mask, h, w, bd);
+  } else {
+    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+    // (w >> 2) & 1 is 1 only for w == 4; wider blocks are multiples of 8.
+    blend[bd == 12][(w >> 2) & 1](dst, dst_stride,
+                                  src0, src0_stride,
+                                  src1, src1_stride,
+                                  mask, h, w);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH

diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000..9b74f90
--- /dev/null
+++ b/vpx_dsp/x86/blend_sse4.h

@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_BLEND_SSE4_H_
+#define VPX_DSP_X86_BLEND_SSE4_H_
+
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/x86/synonyms.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+// Returns the a64 blend of 4 8-bit pixels from src0/src1 in 16-bit lanes:
+// xx_roundn_epu16(src0 * m0 + src1 * m1, VPX_BLEND_A64_ROUND_BITS).
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_32(src0);
+  const __m128i v_s1_b = xx_loadl_32(src1);
+  // Zero-extend the bytes to 16-bit lanes for the multiplies.
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+  return v_res_w;
+}
+
+// Returns the a64 blend of 8 8-bit pixels from src0/src1 in 16-bit lanes:
+// xx_roundn_epu16(src0 * m0 + src1 * m1, VPX_BLEND_A64_ROUND_BITS).
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_64(src0);
+  const __m128i v_s1_b = xx_loadl_64(src1);
+  // Zero-extend the bytes to 16-bit lanes for the multiplies.
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+  return v_res_w;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+                                 const __m128i v_m0_w, const __m128i v_m1_w);
+
+// High-bitdepth blend of 4 16-bit pixels using 16-bit multiplies:
+// xx_roundn_epu16(src0 * m0 + src1 * m1, VPX_BLEND_A64_ROUND_BITS).
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadl_64(src0);
+  const __m128i v_s1_w = xx_loadl_64(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+// High-bitdepth blend of 8 16-bit pixels using 16-bit multiplies:
+// xx_roundn_epu16(src0 * m0 + src1 * m1, VPX_BLEND_A64_ROUND_BITS).
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadu_128(src0);
+  const __m128i v_s1_w = xx_loadu_128(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+// High-bitdepth blend of 4 16-bit pixels for the b12 path. The weighted sum
+// is formed in 32-bit lanes via madd instead of 16-bit multiplies.
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadl_64(src0);
+  const __m128i v_s1_w = xx_loadl_64(src1);
+
+  // Interleave: each 32-bit lane gets an (s0, s1) and an (m0, m1) pair
+  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add: s0 * m0 + s1 * m1 per 32-bit lane
+  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+  // Scale, one bit short of the full shift; the rounding step follows
+  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d,
+                                          VPX_BLEND_A64_ROUND_BITS - 1);
+
+  // Pack to 16-bit lanes (signed saturation)
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+  // Round: xx_round_epu16 presumably applies the last 1-bit rounding shift
+  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+  return v_res_w;
+}
+
+// High-bitdepth blend of 8 16-bit pixels for the b12 path. The weighted sum
+// is formed in 32-bit lanes via madd, as two halves of 4 pixels each.
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadu_128(src0);
+  const __m128i v_s1_w = xx_loadu_128(src1);
+
+  // Interleave: each 32-bit lane gets an (s0, s1) and an (m0, m1) pair
+  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add: s0 * m0 + s1 * m1 per 32-bit lane
+  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+  // Scale, one bit short of the full shift; the rounding step follows
+  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d,
+                                           VPX_BLEND_A64_ROUND_BITS - 1);
+  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d,
+                                           VPX_BLEND_A64_ROUND_BITS - 1);
+
+  // Pack both halves to 16-bit lanes (signed saturation)
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+  // Round: xx_round_epu16 presumably applies the last 1-bit rounding shift
+  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+  return v_res_w;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_DSP_X86_BLEND_SSE4_H_

diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index 1dca1a8..48549ce 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h

@@ -38,24 +38,15 @@
 #define __builtin_prefetch(x)
 #endif
 
-/* Shift down with rounding for use when n > 0 */
+/* Shift down with rounding for use when n >= 0, value >= 0 */
 #define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
+    (((value) + (((1 << (n)) >> 1))) >> (n))
 
-/* Shift down with rounding for use when n >= 0 */
-#define ROUNDZ_POWER_OF_TWO(value, n) \
-    ((n) ? (((value) + (1 << ((n) - 1))) >> (n)) : (value))
-
-/* Shift down with rounding for signed integers, for use when n > 0 */
+/* Shift down with rounding for signed integers, for use when n >= 0 */
 #define ROUND_POWER_OF_TWO_SIGNED(value, n) \
     (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                    : ROUND_POWER_OF_TWO((value), (n)))
 
-/* Shift down with rounding for signed integers, for use when n >= 0 */
-#define ROUNDZ_POWER_OF_TWO_SIGNED(value, n) \
-    (((value) < 0) ? -ROUNDZ_POWER_OF_TWO(-(value), (n)) \
-     : ROUNDZ_POWER_OF_TWO((value), (n)))
-
 #define ALIGN_POWER_OF_TWO(value, n) \
     (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))