Add optimized vpx_blend_mask6

This replaces vp10/common/reconinter.c:build_masked_compound.
The functionality is equivalent, but the interface is slightly more
generic: the mask stride is passed explicitly instead of being
hard-coded to MASK_MASTER_STRIDE.

Total encoder speedup with ext-inter: ~7.5%
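
As an illustrative sketch, callers now invoke the blend directly,
mirroring the updated call sites in reconinter.c:

  vpx_blend_mask6(dst, dst_stride,
                  dst, dst_stride,    // src0 may alias dst
                  dst2, dst2_stride,
                  mask, MASK_MASTER_STRIDE,
                  h, w, subh, subw);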

Change-Id: Iee18b83ae324ffc9c7f7dc16d4b2b06adb4d4305
diff --git a/test/assertion_helpers.h b/test/assertion_helpers.h
new file mode 100644
index 0000000..108c40a
--- /dev/null
+++ b/test/assertion_helpers.h
@@ -0,0 +1,278 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef TEST_ASSERTION_HELPERS_H_
+#define TEST_ASSERTION_HELPERS_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace libvpx_test {
+namespace assertion_helpers {
+
+// Arrays (1D) are element-wise equal
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEq(const E (&a)[n],
+                                    const E (&b)[n]) {
+  for (size_t i = 0; i < n; i++) {
+    const E &va = a[i];
+    const E &vb = b[i];
+    if (va != vb) {
+      return ::testing::AssertionFailure()
+        << "Arrays differ at index "
+        << "[" << i << "]"
+        << " values are: " << va << " vs " << vb;
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (1D) are element-wise equal
+// within the index interval [lo, hi)
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n],
+                                          const E (&b)[n],
+                                          const size_t lo,
+                                          const size_t hi) {
+  assert(hi > lo);
+  assert(hi <= n);
+
+  for (size_t i = lo; i < hi; i++) {
+    const E &va = a[i];
+    const E &vb = b[i];
+    if (va != vb) {
+      return ::testing::AssertionFailure()
+        << "Arrays differ at index "
+        << "[" << i << "]"
+        << " values are: " << va << " vs " << vb;
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (1D) are element-wise equal
+// outside the index interval [lo, hi)
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n],
+                                           const E (&b)[n],
+                                           const size_t lo,
+                                           const size_t hi) {
+  assert(hi > lo);
+  assert(hi <= n);
+
+  for (size_t i = 0; i < n; i++) {
+    if (lo <= i && i < hi)
+      continue;
+
+    const E &va = a[i];
+    const E &vb = b[i];
+    if (va != vb) {
+      return ::testing::AssertionFailure()
+        << "Arrays differ at index "
+        << "[" << i << "]"
+        << " values are: " << va << " vs " << vb;
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEq(const E (&a)[n][m],
+                                    const E (&b)[n][m]) {
+  for (size_t i = 0; i < n; i++) {
+    for (size_t j = 0; j < m; j++) {
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays differ at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+// within the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product)
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n][m],
+                                          const E (&b)[n][m],
+                                          const size_t lo0,
+                                          const size_t hi0,
+                                          const size_t lo1,
+                                          const size_t hi1) {
+  assert(hi0 > lo0);
+  assert(hi0 <= n);
+  assert(hi1 > lo1);
+  assert(hi1 <= m);
+
+  for (size_t i = lo0; i < hi0; i++) {
+    for (size_t j = lo1; j < hi1; j++) {
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays differ at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+// outside the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product)
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n][m],
+                                           const E (&b)[n][m],
+                                           const size_t lo0,
+                                           const size_t hi0,
+                                           const size_t lo1,
+                                           const size_t hi1) {
+  assert(hi0 > lo0);
+  assert(hi0 <= n);
+  assert(hi1 > lo1);
+  assert(hi1 <= m);
+
+  for (size_t i = 0; i < n; i++) {
+    for (size_t j = 0; j < m; j++) {
+      if (lo0 <= i && i < hi0 && lo1 <= j && j < hi1)
+        continue;
+
+      const E &va = a[i][j];
+      const E &vb = b[i][j];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Arrays differ at index "
+          << "[" << i << "][" << j << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Non-contiguous 2D array buffers are element-wise equal over the
+// rows x cols regions located by the given strides and offsets
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqWithin(const E (&a)[n][m],
+                                           const E (&b)[n][m],
+                                           const size_t stridea,
+                                           const size_t strideb,
+                                           const size_t offseta,
+                                           const size_t offsetb,
+                                           const size_t rows,
+                                           const size_t cols) {
+  assert(rows <= n);
+  assert(cols <= m);
+  assert(stridea <= m);
+  assert(strideb <= m);
+  assert(cols <= stridea);
+  assert(cols <= strideb);
+  assert(offseta < n * m);
+  assert(offsetb < n * m);
+  assert(offseta + (rows - 1) * stridea + (cols - 1) < n * m);
+  assert(offsetb + (rows - 1) * strideb + (cols - 1) < n * m);
+
+  const E *pa = &a[0][0] + offseta;
+  const E *pb = &b[0][0] + offsetb;
+
+  for (size_t r = 0; r < rows; r++) {
+    for (size_t c = 0; c < cols; c++) {
+      const E &va = pa[c];
+      const E &vb = pb[c];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Buffers differ at linear index "
+          << "[" << pa - &a[0][0]  << "] vs [" << pb - &b[0][0]  << "]"
+          << " row=" << r << " col=" << c
+          << " values are: " << va << " vs " << vb;
+      }
+    }
+    pa += stridea;
+    pb += strideb;
+  }
+
+  return ::testing::AssertionSuccess();
+}
+
+// Non-contiguous 2D array buffers are element-wise equal everywhere
+// except over the rows x cols region located by the given
+// stride and offset.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqOutside(const E (&a)[n][m],
+                                            const E (&b)[n][m],
+                                            const size_t stride,
+                                            const size_t offset,
+                                            const size_t rows,
+                                            const size_t cols) {
+  assert(rows <= n);
+  assert(cols <= m);
+  assert(stride <= m);
+  assert(cols <= stride);
+  assert(offset < n * m);
+  assert(offset + (rows - 1) * stride + (cols - 1) < n * m);
+
+  const E *const pa = &a[0][0];
+  const E *const pb = &b[0][0];
+
+  size_t idx = 0;
+  size_t r = 0;
+  size_t end = offset;  // beginning of first row
+
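+  // Walk the whole buffer linearly: for each of the first 'rows' rows of
+  // the excluded region, compare elements up to the start of that row
+  // ('end'), skip the 'cols' elements inside the region, then continue;
+  // after the last region row, compare through to the end of the buffer.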
+  while (idx < n * m) {
+    while (idx < end) {   // until beginning of row or end of buffer
+      const E &va = pa[idx];
+      const E &vb = pb[idx];
+      if (va != vb) {
+        return ::testing::AssertionFailure()
+          << "Buffers differ at index "
+          << "[" << idx / m << "][" << idx % m << "]"
+          << " values are: " << va << " vs " << vb;
+      }
+
+      idx++;
+    }
+
+    // Move past row end
+    idx += cols;
+
+    if (++r < rows) {
+      // Move to next row
+      end += stride;
+    } else {
+      // Move to end of buffer
+      end = n * m;
+    }
+  }
+
+  // Sanity check
+  assert(idx == n * m + cols);
+
+  return ::testing::AssertionSuccess();
+}
+
+}   // namespace assertion_helpers
+}   // namespace libvpx_test
+
+#endif  // TEST_ASSERTION_HELPERS_H_
diff --git a/test/blend_mask6_test.cc b/test/blend_mask6_test.cc
new file mode 100644
index 0000000..d737ddd
--- /dev/null
+++ b/test/blend_mask6_test.cc
@@ -0,0 +1,311 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/randomise.h"
+#include "test/snapshot.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/assertion_helpers.h"
+#include "vp10/common/enums.h"
+
+using libvpx_test::assertion_helpers::BuffersEqWithin;
+using libvpx_test::assertion_helpers::BuffersEqOutside;
+using libvpx_test::assertion_helpers::ArraysEq;
+using libvpx_test::FunctionEquivalenceTest;
+using libvpx_test::Snapshot;
+using libvpx_test::Randomise;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendMask6Test : public FunctionEquivalenceTest<F> {
+ protected:
+  virtual ~BlendMask6Test() {}
+
+  virtual void Execute(T *p_src0, T *p_src1) = 0;
+
+  void Common() {
+    w = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+    h = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+
+    randomise(subx);
+    randomise(suby);
+
+    randomise(dst_offset, 0, 32);
+    randomise(dst_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(src0_offset, 0, 32);
+    randomise(src0_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(src1_offset, 0, 32);
+    randomise(src1_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(mask_stride, w * (subx ? 2 : 1), 2 * MAX_SB_SIZE + 1);
+
+    T *p_src0;
+    T *p_src1;
+
+    switch (randomise.uniform<int>(3)) {
+      case 0:   // Separate sources
+        p_src0 = &src0[0][0];
+        p_src1 = &src1[0][0];
+        break;
+      case 1:   // src0 == dst
+        p_src0 = &dst_tst[0][0];
+        src0_stride = dst_stride;
+        src0_offset = dst_offset;
+        p_src1 = &src1[0][0];
+        break;
+      case 2:   // src1 == dst
+        p_src0 = &src0[0][0];
+        p_src1 = &dst_tst[0][0];
+        src1_stride = dst_stride;
+        src1_offset = dst_offset;
+        break;
+      default:
+        FAIL();
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    // Prepare
+    //////////////////////////////////////////////////////////////////////////
+
+    snapshot(dst_ref);
+    snapshot(dst_tst);
+
+    snapshot(src0);
+    snapshot(src1);
+
+    snapshot(mask);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Execute
+    //////////////////////////////////////////////////////////////////////////
+
+    Execute(p_src0, p_src1);
+
+    //////////////////////////////////////////////////////////////////////////
+    // Check
+    //////////////////////////////////////////////////////////////////////////
+
+    ASSERT_TRUE(BuffersEqWithin(dst_ref, dst_tst,
+                                dst_stride, dst_stride,
+                                dst_offset, dst_offset,
+                                h, w));
+
+    ASSERT_TRUE(ArraysEq(snapshot.get(src0), src0));
+    ASSERT_TRUE(ArraysEq(snapshot.get(src1), src1));
+    ASSERT_TRUE(ArraysEq(snapshot.get(mask), mask));
+
+    ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_ref), dst_ref,
+                                 dst_stride,
+                                 dst_offset,
+                                 h, w));
+
+    ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_tst), dst_tst,
+                                 dst_stride,
+                                 dst_offset,
+                                 h, w));
+  }
+
+  Snapshot snapshot;
+  Randomise randomise;
+
+  T dst_ref[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  T dst_tst[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  size_t dst_stride;
+  size_t dst_offset;
+
+  T src0[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  size_t src0_stride;
+  size_t src0_offset;
+
+  T src1[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+  size_t src1_stride;
+  size_t src1_offset;
+
+  uint8_t mask[2 * MAX_SB_SIZE][2 * MAX_SB_SIZE];
+  size_t mask_stride;
+
+  int w;
+  int h;
+
+  bool suby;
+  bool subx;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    uint8_t *src0, uint32_t src0_stride,
+                    uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, uint32_t mask_stride,
+                    int h, int w, int suby, int subx);
+
+class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+ protected:
+  void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+    ref_func_(&dst_ref[0][dst_offset], dst_stride,
+              p_src0 + src0_offset, src0_stride,
+              p_src1 + src1_offset, src1_stride,
+              &mask[0][0], sizeof(mask[0]),
+              h, w, suby, subx);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(&dst_tst[0][dst_offset], dst_stride,
+                p_src0 + src0_offset, src0_stride,
+                p_src1 + src1_offset, src1_stride,
+                &mask[0][0], sizeof(mask[0]),
+                h, w, suby, subx));
+  }
+};
+
+TEST_P(BlendMask6Test8B, RandomValues) {
+  for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    randomise(dst_ref);
+    randomise(dst_tst);
+
+    randomise(src0);
+    randomise(src1);
+
+    randomise(mask, 65);
+
+    Common();
+  }
+}
+
+TEST_P(BlendMask6Test8B, ExtremeValues) {
+  for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    randomise(dst_ref, 254, 256);
+    randomise(dst_tst, 254, 256);
+
+    randomise(src0, 254, 256);
+    randomise(src1, 254, 256);
+
+    randomise(mask, 63, 65);
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendMask6Test8B,
+  ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     uint8_t *src0, uint32_t src0_stride,
+                     uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, uint32_t mask_stride,
+                     int h, int w, int suby, int subx, int bd);
+
+class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+ protected:
+  void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+    ref_func_(CONVERT_TO_BYTEPTR(&dst_ref[0][dst_offset]), dst_stride,
+              CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+              CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+              &mask[0][0], sizeof(mask[0]),
+              h, w, suby, subx, bit_depth);
+
+    ASM_REGISTER_STATE_CHECK(
+      tst_func_(CONVERT_TO_BYTEPTR(&dst_tst[0][dst_offset]), dst_stride,
+                CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+                CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+                &mask[0][0], sizeof(mask[0]),
+                h, w, suby, subx, bit_depth));
+  }
+
+  int bit_depth;
+};
+
+TEST_P(BlendMask6TestHBD, RandomValues) {
+  for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+
+    randomise(dst_ref, hi);
+    randomise(dst_tst, hi);
+
+    randomise(src0, hi);
+    randomise(src1, hi);
+
+    randomise(mask, 65);
+
+    Common();
+  }
+}
+
+TEST_P(BlendMask6TestHBD, ExtremeValues) {
+  for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) {
+    //////////////////////////////////////////////////////////////////////////
+    // Randomise
+    //////////////////////////////////////////////////////////////////////////
+
+    bit_depth = randomise.choice(8, 10, 12);
+
+    const int hi = 1 << bit_depth;
+    const int lo = hi - 2;
+
+    randomise(dst_ref, lo, hi);
+    randomise(dst_tst, lo, hi);
+
+    randomise(src0, lo, hi);
+    randomise(src1, lo, hi);
+
+    randomise(mask, 63, 65);
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendMask6TestHBD,
+  ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
+                               &vpx_highbd_blend_mask6_sse4_1)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
new file mode 100644
index 0000000..50ad4c5
--- /dev/null
+++ b/test/function_equivalence_test.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+namespace libvpx_test {
+template <typename T>
+class FunctionEquivalenceTest :
+  public ::testing::TestWithParam< std::tr1::tuple< T, T > > {
+ public:
+  virtual ~FunctionEquivalenceTest() {}
+
+  virtual void SetUp() {
+    ref_func_ = std::tr1::get<0>(this->GetParam());
+    tst_func_ = std::tr1::get<1>(this->GetParam());
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  T ref_func_;
+  T tst_func_;
+};
+
+}   // namespace libvpx_test
+#endif  // TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/test/randomise.h b/test/randomise.h
new file mode 100644
index 0000000..fbf419c
--- /dev/null
+++ b/test/randomise.h
@@ -0,0 +1,207 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_RANDOMISE_H_
+#define TEST_RANDOMISE_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+
+namespace libvpx_test {
+
+// TODO(any): Replace this when built with C++11
+#define STATIC_ASSERT_INTEGER_TYPE_(T) \
+  GTEST_COMPILE_ASSERT_(std::numeric_limits<T>::is_integer, \
+    integer_type_required);
+
+/**
+ * Deterministic random number generator with various convenience methods.
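+ *
+ * Usage sketch (illustrative only):
+ *
+ *   Randomise randomise;
+ *   int r = randomise.uniform<int>(10);  // uniform in [0, 10)
+ *   uint8_t buf[16][16];
+ *   randomise(buf, 64);                  // fill with values in [0, 64)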
+ */
+class Randomise {
+ public:
+  Randomise() {
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+  }
+
+  virtual ~Randomise() { }
+
+  // Uniformly distributed random number from the range
+  // [std::numeric_limits<R>::min(), std::numeric_limits<R>::max()]
+  template<typename R>
+  R uniform() {
+    STATIC_ASSERT_INTEGER_TYPE_(R);
+  }
+
+  // Uniformly distributed random number from the range
+  // [0, hi)
+  template<typename R, typename H>
+  R uniform(H hi) {
+    assert(hi > 0);
+    R v = uniform<R>();
+    if (std::numeric_limits<R>::is_signed && v < 0)
+      return -v % hi;
+    else
+      return v % hi;
+  }
+
+  // Uniformly distributed random number from the range
+  // [lo, hi)
+  template<typename R, typename L, typename H>
+  R uniform(L lo, H hi) {
+    assert(hi > lo);
+    return uniform<R, H>(hi - lo) + lo;
+  }
+
+  // Randomly pick and return one of the arguments
+  template<typename T>
+  T choice(T v0, T v1) {
+    switch (uniform<int>(2)) {
+      case 0: return v0;
+      default: return v1;
+    }
+  }
+
+  // Randomly pick and return one of the arguments
+  template<typename T>
+  T choice(T v0, T v1, T v2) {
+    switch (uniform<int>(3)) {
+      case 0: return v0;
+      case 1: return v1;
+      default: return v2;
+    }
+  }
+
+  template<typename T>
+  void operator()(T &e) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T>();
+  }
+
+  template<typename T, typename H>
+  void operator()(T &e, H hi) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T, H>(hi);
+  }
+
+  template<typename T, typename L, typename H>
+  void operator()(T &e, L lo, H hi) {  // NOLINT
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    e = uniform<T, L, H>(lo, hi);
+  }
+
+  template<typename T, size_t n>
+  void operator()(T (&arr)[n]) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++) {
+      arr[i] = uniform<T>();
+    }
+  }
+
+  template<typename T, size_t n, typename H>
+  void operator()(T (&arr)[n], H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++) {
+      arr[i] = uniform<T, H>(hi);
+    }
+  }
+
+  template<typename T, size_t n, typename L, typename H>
+  void operator()(T (&arr)[n], L lo, H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++) {
+      arr[i] = uniform<T, L, H>(lo, hi);
+    }
+  }
+
+  template<typename T, size_t n, size_t m>
+  void operator()(T (&arr)[n][m]) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++) {
+      for (size_t j = 0; j < m ; j++) {
+        arr[i][j] = uniform<T>();
+      }
+    }
+  }
+
+  template<typename T, size_t n, size_t m, typename H>
+  void operator()(T (&arr)[n][m], H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++) {
+      for (size_t j = 0; j < m ; j++) {
+        arr[i][j] = uniform<T, H>(hi);
+      }
+    }
+  }
+
+  template<typename T, size_t n, size_t m, typename L, typename H>
+  void operator()(T (&arr)[n][m], L lo, H hi) {
+    STATIC_ASSERT_INTEGER_TYPE_(T);
+    for (size_t i = 0; i < n ; i++) {
+      for (size_t j = 0; j < m ; j++) {
+        arr[i][j] = uniform<T, L, H>(lo, hi);
+      }
+    }
+  }
+
+ private:
+  libvpx_test::ACMRandom rnd_;
+};
+
+// Add further specialisations as necessary
+
+template<>
+bool Randomise::uniform<bool>() {
+  return rnd_.Rand8() & 1 ? true : false;
+}
+
+template<>
+uint8_t Randomise::uniform<uint8_t>() {
+  return rnd_.Rand8();
+}
+
+template<>
+uint16_t Randomise::uniform<uint16_t>() {
+  return rnd_.Rand16();
+}
+
+template<>
+uint32_t Randomise::uniform<uint32_t>() {
+  const uint32_t l = uniform<uint16_t>();
+  const uint32_t h = uniform<uint16_t>();
+  return h << 16 | l;
+}
+
+template<>
+uint64_t Randomise::uniform<uint64_t>() {
+  const uint64_t l = uniform<uint32_t>();
+  const uint64_t h = uniform<uint32_t>();
+  return h << 32 | l;
+}
+
+template<>
+int8_t Randomise::uniform<int8_t>() { return uniform<uint8_t>(); }
+
+template<>
+int16_t Randomise::uniform<int16_t>() { return uniform<uint16_t>(); }
+
+template<>
+int32_t Randomise::uniform<int32_t>() { return uniform<uint32_t>(); }
+
+template<>
+int64_t Randomise::uniform<int64_t>() { return uniform<uint64_t>(); }
+
+}  // namespace libvpx_test
+
+#endif  // TEST_RANDOMISE_H_
diff --git a/test/snapshot.h b/test/snapshot.h
new file mode 100644
index 0000000..b67edde
--- /dev/null
+++ b/test/snapshot.h
@@ -0,0 +1,104 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_SNAPSHOT_H_
+#define TEST_SNAPSHOT_H_
+
+#include <assert.h>
+#include <string.h>
+
+#include <map>
+
+namespace libvpx_test {
+
+/**
+ * Allows capturing and retrieving snapshots of arbitrary blobs of memory.
+ * The blob size is derived from compile-time type information.
+ *
+ * Usage:
+ * void example() {
+ *   Snapshot snapshot;
+ *
+ *   int foo = 4;
+ *
+ *   snapshot(foo);
+ *
+ *   foo = 10;
+ *
+ *   assert(snapshot.get(foo) == 4);     // Pass
+ *   assert(snapshot.get(foo) == foo);   // Fail (4 != 10)
+ *
+ *   char bar[10][10];
+ *   memset(bar, 3, sizeof(bar));
+ *
+ *   snapshot(bar);
+ *
+ *   memset(bar, 8, sizeof(bar));
+ *
+ *   assert(sum(bar) == 800);                 // Pass
+ *   assert(sum(snapshot.get(bar)) == 300);   // Pass
+ * }
+ */
+class Snapshot {
+ public:
+  virtual ~Snapshot() {
+    for (snapshot_map_t::iterator it = snapshots_.begin();
+         it != snapshots_.end(); it++) {
+      delete[] it->second;
+    }
+  }
+
+  /**
+   * Take new snapshot for object
+   */
+  template<typename E>
+  void take(const E &e) {
+    const void *const key = reinterpret_cast<const void*>(&e);
+
+    snapshot_map_t::iterator it = snapshots_.find(key);
+
+    if (it != snapshots_.end())
+      delete[] it->second;
+
+    char *const buf = new char[sizeof(E)];
+
+    memcpy(buf, &e, sizeof(E));
+
+    snapshots_[key] = buf;
+  }
+
+  /**
+   * Same as 'take'
+   */
+  template<typename E>
+  void operator()(const E &e) {
+    take(e);
+  }
+
+  /**
+   * Retrieve last snapshot for object
+   */
+  template<typename E>
+  const E& get(const E &e) const {
+    const void *const key = reinterpret_cast<const void*>(&e);
+
+    snapshot_map_t::const_iterator it = snapshots_.find(key);
+
+    assert(it != snapshots_.end());
+
+    return *reinterpret_cast<const E*>(it->second);
+  }
+
+ private:
+  typedef std::map<const void*, const char*> snapshot_map_t;
+
+  snapshot_map_t snapshots_;
+};
+
+}   // namespace libvpx_test
+
+#endif  // TEST_SNAPSHOT_H_
diff --git a/test/test.mk b/test/test.mk
index 77b00a5..339e274 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -178,6 +178,7 @@
 ifeq ($(CONFIG_EXT_INTER),yes)
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
 LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
 endif
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index d6ac4bb..825fff3 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -11,6 +11,7 @@
 #include <assert.h>
 
 #include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
@@ -410,119 +411,6 @@
   return mask;
 }
 
-static void build_masked_compound(uint8_t *dst, int dst_stride,
-                                  uint8_t *dst1, int dst1_stride,
-                                  uint8_t *dst2, int dst2_stride,
-                                  const uint8_t *mask,
-                                  int h, int w, int subh, int subw) {
-  int i, j;
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
-                                         uint8_t *dst1_8, int dst1_stride,
-                                         uint8_t *dst2_8, int dst2_stride,
-                                         const uint8_t *mask,
-                                         int h, int w, int subh, int subw) {
-  int i, j;
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
-  uint16_t *dst1 = CONVERT_TO_SHORTPTR(dst1_8);
-  uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
-  if (subw == 0 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = mask[i * MASK_MASTER_STRIDE + j];
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 1) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else if (subw == 1 && subh == 0) {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
-                 mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  } else {
-    for (i = 0; i < h; ++i)
-      for (j = 0; j < w; ++j) {
-        int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
-                 mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
-        dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
-                                   dst2[i * dst2_stride + j] *
-                                   ((1 << WEDGE_WEIGHT_BITS) - m) +
-                                   (1 << (WEDGE_WEIGHT_BITS - 1))) >>
-            WEDGE_WEIGHT_BITS;
-      }
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_SUPERTX
 static void build_masked_compound_wedge_extend(
@@ -537,9 +425,11 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
      wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  build_masked_compound(dst, dst_stride,
-                        dst, dst_stride, dst2, dst2_stride, mask,
-                        h, w, subh, subw);
+  vpx_blend_mask6(dst, dst_stride,
+                  dst, dst_stride,
+                  dst2, dst2_stride,
+                  mask, MASK_MASTER_STRIDE,
+                  h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -549,14 +439,16 @@
     int wedge_index, int wedge_sign,
     BLOCK_SIZE sb_type,
     int wedge_offset_x, int wedge_offset_y,
-    int h, int w) {
+    int h, int w, int bd) {
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(
       wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
-  build_masked_compound_highbd(dst_8, dst_stride,
-                               dst_8, dst_stride, dst2_8, dst2_stride, mask,
-                               h, w, subh, subw);
+  vpx_highbd_blend_mask6(dst_8, dst_stride,
+                         dst_8, dst_stride,
+                         dst2_8, dst2_stride,
+                         mask, MASK_MASTER_STRIDE,
+                         h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -573,9 +465,11 @@
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
                                            sb_type, 0, 0);
-  build_masked_compound(dst, dst_stride,
-                        dst, dst_stride, dst2, dst2_stride, mask,
-                        h, w, subh, subw);
+  vpx_blend_mask6(dst, dst_stride,
+                  dst, dst_stride,
+                  dst2, dst2_stride,
+                  mask, MASK_MASTER_STRIDE,
+                  h, w, subh, subw);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -583,16 +477,18 @@
                                                uint8_t *dst2_8, int dst2_stride,
                                                int wedge_index, int wedge_sign,
                                                BLOCK_SIZE sb_type,
-                                               int h, int w) {
+                                               int h, int w, int bd) {
   // Derive subsampling from h and w passed in. May be refactored to
   // pass in subsampling factors directly.
   const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
   const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
   const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
                                            sb_type, 0, 0);
-  build_masked_compound_highbd(dst_8, dst_stride,
-                               dst_8, dst_stride, dst2_8, dst2_stride, mask,
-                               h, w, subh, subw);
+  vpx_highbd_blend_mask6(dst_8, dst_stride,
+                         dst_8, dst_stride,
+                         dst2_8, dst2_stride,
+                         mask, MASK_MASTER_STRIDE,
+                         h, w, subh, subw, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // CONFIG_SUPERTX
@@ -641,7 +537,7 @@
         mi->mbmi.interinter_wedge_index,
         mi->mbmi.interinter_wedge_sign,
         mi->mbmi.sb_type,
-        wedge_offset_x, wedge_offset_y, h, w);
+        wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
   else
     build_masked_compound_wedge_extend(
         dst, dst_stride, tmp_dst, MAX_SB_SIZE,
@@ -655,7 +551,7 @@
         dst, dst_stride, tmp_dst, MAX_SB_SIZE,
         mi->mbmi.interinter_wedge_index,
         mi->mbmi.interinter_wedge_sign,
-        mi->mbmi.sb_type, h, w);
+        mi->mbmi.sb_type, h, w, xd->cur_buf->bit_depth);
   else
     build_masked_compound_wedge(
         dst, dst_stride, tmp_dst, MAX_SB_SIZE,
@@ -1872,10 +1768,11 @@
                                                bsize, 0, 0);
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
-      build_masked_compound(comppred, compstride,
-                            intrapred, intrastride,
-                            interpred, interstride, mask,
-                            bh, bw, subh, subw);
+      vpx_blend_mask6(comppred, compstride,
+                      intrapred, intrastride,
+                      interpred, interstride,
+                      mask, MASK_MASTER_STRIDE,
+                      bh, bw, subh, subw);
     }
     return;
   }
@@ -1995,7 +1892,6 @@
   uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
   uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
   uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
-  (void) bd;
 
   if (use_wedge_interintra) {
     if (is_interintra_wedge_used(bsize)) {
@@ -2003,10 +1899,11 @@
                                                bsize, 0, 0);
       const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
       const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
-      build_masked_compound_highbd(comppred8, compstride,
-                                   intrapred8, intrastride,
-                                   interpred8, interstride, mask,
-                                   bh, bw, subh, subw);
+      vpx_highbd_blend_mask6(comppred8, compstride,
+                             intrapred8, intrastride,
+                             interpred8, interstride,
+                             mask, MASK_MASTER_STRIDE,
+                             bh, bw, subh, subw, bd);
     }
     return;
   }
@@ -2460,7 +2357,7 @@
             mi->mbmi.interinter_wedge_index,
             mi->mbmi.interinter_wedge_sign,
             mi->mbmi.sb_type,
-            wedge_offset_x, wedge_offset_y, h, w);
+            wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
       } else {
         build_masked_compound_wedge_extend(
             dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
@@ -2484,7 +2381,8 @@
                                            MAX_SB_SIZE,
                                            mi->mbmi.interinter_wedge_index,
                                            mi->mbmi.interinter_wedge_sign,
-                                           mi->mbmi.sb_type, h, w);
+                                           mi->mbmi.sb_type, h, w,
+                                           xd->cur_buf->bit_depth);
       else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
         build_masked_compound_wedge(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c
new file mode 100644
index 0000000..584ee6a
--- /dev/null
+++ b/vpx_dsp/blend_mask6.c
@@ -0,0 +1,152 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#define MASK_BITS 6
+
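+// For each destination pixel, with m taken from 'mask' (optionally 2x
+// sub-sampled horizontally/vertically as selected by subw/subh):
+//   dst = ROUND_POWER_OF_TWO(src0 * m + src1 * ((1 << MASK_BITS) - m),
+//                            MASK_BITS)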
+void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
+                       uint8_t *src0, uint32_t src0_stride,
+                       uint8_t *src1, uint32_t src1_stride,
+                       const uint8_t *mask, uint32_t mask_stride,
+                       int h, int w, int subh, int subw) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 = mask[i * mask_stride + j];
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+                               mask[i * mask_stride + (2 * j + 1)], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+                               mask[(2 * i + 1) * mask_stride + j], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
+                              uint8_t *src0_8, uint32_t src0_stride,
+                              uint8_t *src1_8, uint32_t src1_stride,
+                              const uint8_t *mask, uint32_t mask_stride,
+                              int h, int w, int subh, int subw, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 = mask[i * mask_stride + j];
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+                               mask[i * mask_stride + (2 * j + 1)], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  } else {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        const int m0 =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+                               mask[(2 * i + 1) * mask_stride + j], 1);
+        const int m1 = ((1 << MASK_BITS) - m0);
+        dst[i * dst_stride + j] =
+            ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+                               src1[i * src1_stride + j] * m1, MASK_BITS);
+      }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 46ef5fc..430cae1 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -65,6 +65,15 @@
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
 
+# inter predictions
+
+ifeq ($(CONFIG_VP10),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-yes            += blend_mask6.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
+endif  #CONFIG_EXT_INTER
+endif  #CONFIG_VP10
+
 # interpolation filters
 DSP_SRCS-yes += vpx_convolve.c
 DSP_SRCS-yes += vpx_convolve.h
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 3571eea..7aaa89f 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -32,6 +32,8 @@
 
 #define IMPLIES(a, b)  (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
 
+#define IS_POWER_OF_TWO(x)  (((x) & ((x) - 1)) == 0)
+
 // These can be used to give a hint about branch outcomes.
 // This can have an effect, even if your target processor has a
 // good branch predictor, as these hints can affect basic block
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ad524a2..7bae037 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1358,10 +1358,10 @@
   }
 }  # CONFIG_VP9_HIGHBITDEPTH
 
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
 #
 # Masked Variance / Masked Subpixel Variance
 #
-if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
   foreach (@block_sizes) {
     ($w, $h) = @$_;
     add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
@@ -1381,6 +1381,14 @@
       }
     }
   }
+
+  add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+  specialize "vpx_blend_mask6", qw/sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride,  uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+    specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
+  }
 }
 
 #
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_mask6_sse4.c
new file mode 100644
index 0000000..5de3e23
--- /dev/null
+++ b/vpx_dsp/x86/blend_mask6_sse4.c
@@ -0,0 +1,1146 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#define MASK_BITS 6
+
+static INLINE __m128i mm_loadl_32(const void *a) {
+  return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i mm_loadl_64(const void *a) {
+  return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i mm_loadu_128(const void *a) {
+  return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void mm_storel_32(void *const a, const __m128i v) {
+  *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void mm_storel_64(void *const a, const __m128i v) {
+  _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void mm_storeu_128(void *const a, const __m128i v) {
+  _mm_storeu_si128((__m128i*)a, v);
+}
+
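+// Round-to-nearest helpers: _mm_avg_epu16(v, 0) computes (v + 1) >> 1, so
+// pre-shifting by (bits - 1) and averaging with zero yields
+// (v + (1 << (bits - 1))) >> bits, i.e. ROUND_POWER_OF_TWO on 16-bit lanes.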
+static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
+  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
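+// blend_4/blend_8 blend 4 (resp. 8) pixels from each source: the pixels are
+// widened to 16 bits, weighted by m0 and m1 (m0 + m1 == 1 << MASK_BITS),
+// summed, and rounding-shifted right by MASK_BITS.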
+static INLINE __m128i blend_4(uint8_t *src0, uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = mm_loadl_32(src0);
+  const __m128i v_s1_b = mm_loadl_32(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8(uint8_t *src0, uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = mm_loadl_64(src0);
+  const __m128i v_s1_b = mm_loadl_64(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_b = mm_loadl_32(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_b = mm_loadl_64(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_m0l_b = mm_loadl_64(mask + c);
+      const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
+      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
+      const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
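+// With horizontal sub-sampling the mask is 2x wider than the destination:
+// adjacent pairs of mask bytes are averaged with _mm_avg_epu8 and the odd
+// bytes are zeroed (v_zmask_b), leaving one 16-bit mask value per pixel.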
+static void blend_mask6_sx_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_r_b = mm_loadl_64(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_r_b = mm_loadu_128(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
+      const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
+      const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+
+      const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
+      const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
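+//
+// The mask covers two rows per output row: the rows are combined with a
+// rounded byte-wise average (_mm_avg_epu8) and the mask pointer advances by
+// 2 * mask_stride per iteration.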
+
+static void blend_mask6_sy_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadl_32(mask);
+    const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sy_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadl_64(mask);
+    const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sy_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zero = _mm_setzero_si128();
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_ra_b = mm_loadu_128(mask + c);
+      const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
+      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
+      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
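+//
+// Each output weight is derived from a 2x2 block of mask values. The two rows
+// are first summed byte-wise, which cannot overflow since mask values are at
+// most (1 << MASK_BITS) == 64. The even and odd byte lanes of the row sum are
+// then added as 16-bit words, and the total of four values is rounded to a
+// single weight with mm_roundn_epu16(.., 2).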
+
+static void blend_mask6_sx_sy_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadl_64(mask);
+    const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_sy_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = mm_loadu_128(mask);
+    const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    mm_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_sx_sy_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    uint8_t *src0, uint32_t src0_stride,
+    uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
+      const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
+      const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
+      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+      const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1),
+                                              v_zmask_b);
+      const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1),
+                                              v_zmask_b);
+      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+      const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
+      const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      mm_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
+                            uint8_t *src0, uint32_t src0_stride,
+                            uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, uint32_t mask_stride,
+                            int h, int w, int suby, int subx) {
+  typedef  void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+                            uint8_t *src0, uint32_t src0_stride,
+                            uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, uint32_t mask_stride,
+                            int h, int w);
+
+  static blend_fn blend[3][2][2] = {  // width_index X subx X suby
+    {     // w % 16 == 0
+      {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
+      {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+    }, {  // w == 4
+      {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
+      {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+    }, {  // w == 8
+      {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
+      {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+    }
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
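+  // (w >> 2) & 3 maps w == 4 -> 1, w == 8 -> 2 and any multiple of 16 -> 0,
+  // matching the layout of the table above.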
+  blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+                                            src0, src0_stride,
+                                            src1, src1_stride,
+                                            mask, mask_stride,
+                                            h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
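+//
+// Two flavours of blending unit are provided. For bit depths up to 10 the
+// weighted sum of a pixel pair fits in 16 bits (at most 1023 * 64 == 65472,
+// since the two weights sum to 1 << MASK_BITS), so the blend_{4,8}_b10 units
+// can use _mm_mullo_epi16 directly. At 12 bits the products exceed 16 bits,
+// so the blend_{4,8}_b12 units interleave samples and weights and accumulate
+// in 32 bits with _mm_madd_epi16 before scaling back down.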
+
+typedef __m128i (*blend_unit_fn)(uint16_t *src0, uint16_t *src1,
+                                 const __m128i v_m0_w, const __m128i v_m1_w);
+
+static INLINE __m128i blend_4_b10(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadl_64(src0);
+  const __m128i v_s1_w = mm_loadl_64(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadu_128(src0);
+  const __m128i v_s1_w = mm_loadu_128(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadl_64(src0);
+  const __m128i v_s1_w = mm_loadl_64(src1);
+
+  // Interleave
+  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+  // Scale
+  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+  // Round
+  const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(uint16_t *src0, uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = mm_loadu_128(src0);
+  const __m128i v_s1_w = mm_loadu_128(src1);
+
+  // Interleave
+  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+  // Scale
+  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
+  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+  // Round
+  const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
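+//
+// The row/column loops below are shared between bit depths: the *_bn_*
+// helpers take a blend_unit_fn pointer, and the thin *_b10_* / *_b12_*
+// wrappers simply bind the matching blending unit.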
+
+static INLINE void blend_mask6_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_m0_b = mm_loadl_32(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                           src1_stride, mask, mask_stride, h,
+                           blend_4_b10);
+}
+
+static void blend_mask6_b12_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                           src1_stride, mask, mask_stride, h,
+                           blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_m0_b = mm_loadl_64(mask + c);
+      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                            src1_stride, mask, mask_stride, h, w,
+                            blend_8_b10);
+}
+
+static void blend_mask6_b12_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                            src1_stride, mask, mask_stride, h, w,
+                            blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_r_b = mm_loadl_64(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_ra_b = mm_loadl_32(mask);
+    const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
+}
+
+static void blend_mask6_b12_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_ra_b = mm_loadl_64(mask + c);
+      const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
+      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
+}
+
+static void blend_mask6_b12_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    const __m128i v_ra_b = mm_loadl_64(mask);
+    const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    mm_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
+      const __m128i v_rb_b = mm_loadu_128(mask + 2 * c + mask_stride);
+      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+      const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                             v_zmask_b);
+      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+      const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      mm_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    uint16_t *src0, uint32_t src0_stride,
+    uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+                                   uint8_t *src0_8, uint32_t src0_stride,
+                                   uint8_t *src1_8, uint32_t src1_stride,
+                                   const uint8_t *mask, uint32_t mask_stride,
+                                   int h, int w, int suby, int subx, int bd) {
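+  // With CONFIG_VP9_HIGHBITDEPTH the uint8_t pointers actually refer to
+  // buffers of 16-bit samples; recover the uint16_t views before blending.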
+  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+  uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+  uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  typedef  void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+                            uint16_t *src0, uint32_t src0_stride,
+                            uint16_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, uint32_t mask_stride,
+                            int h, int w);
+
+  static blend_fn blend[2][2][2][2] = {  // bd_index X width_index X subx X suby
+    {   // bd == 8 or 10
+      {     // w % 8 == 0
+        {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
+        {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+      }, {  // w == 4
+        {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
+        {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+      }
+    },
+    {   // bd == 12
+      {     // w % 8 == 0
+        {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
+        {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+      }, {  // w == 4
+        {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
+        {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+      }
+    }
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 4);
+  assert(w >= 4);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
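+  // 8-bit and 10-bit content share the 16-bit kernels; only bd == 12 needs
+  // the 32-bit accumulation path. (w >> 2) & 1 maps w == 4 -> 1 and any
+  // multiple of 8 -> 0.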
+  blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+                                                      src0, src0_stride,
+                                                      src1, src1_stride,
+                                                      mask, mask_stride,
+                                                      h, w);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH