Add optimized vpx_blend_mask6
This replaces vp10/common/reconinter.c:build_masked_compound (and its
high bit-depth variant). Functionality is equivalent, but the interface
is slightly more generic.
Total encoder speedup with ext-inter: ~7.5%
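
For reference, the per-pixel blend computed by vpx_blend_mask6 is

  dst = ROUND_POWER_OF_TWO(src0 * m0 + src1 * ((1 << 6) - m0), 6)

where m0 is the 6-bit mask weight (the rounded average of 2 or 4 mask
values when the mask is sub-sampled relative to the destination).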
Change-Id: Iee18b83ae324ffc9c7f7dc16d4b2b06adb4d4305
diff --git a/test/assertion_helpers.h b/test/assertion_helpers.h
new file mode 100644
index 0000000..108c40a
--- /dev/null
+++ b/test/assertion_helpers.h
@@ -0,0 +1,278 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_ASSERTION_HELPERS_H_
+#define TEST_ASSERTION_HELPERS_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+namespace libvpx_test {
+namespace assertion_helpers {
+
+// Arrays (1D) are element-wise equal
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEq(const E (&a)[n],
+ const E (&b)[n]) {
+ for (size_t i = 0; i < n; i++) {
+ const E &va = a[i];
+ const E &vb = b[i];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << i << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Arrays (1D) are element-wise equal
+// within the index interval [lo, hi)
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n],
+ const E (&b)[n],
+ const size_t lo,
+ const size_t hi) {
+ assert(hi > lo);
+ assert(hi <= n);
+
+ for (size_t i = lo; i < hi; i++) {
+ const E &va = a[i];
+ const E &vb = b[i];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << i << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Arrays (1D) are element-wise equal
+// outside the index interval [lo, hi)
+template<typename E, size_t n>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n],
+ const E (&b)[n],
+ const size_t lo,
+ const size_t hi) {
+ assert(hi > lo);
+ assert(hi <= n);
+
+ for (size_t i = 0; i < n; i++) {
+ if (lo <= i && i < hi)
+ continue;
+
+ const E &va = a[i];
+ const E &vb = b[i];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << i << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEq(const E (&a)[n][m],
+ const E (&b)[n][m]) {
+ for (size_t i = 0; i < n; i++) {
+ for (size_t j = 0; j < m; j++) {
+ const E &va = a[i][j];
+ const E &vb = b[i][j];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << i << "][" << j << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+// within the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product)
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqWithin(const E (&a)[n][m],
+ const E (&b)[n][m],
+ const size_t lo0,
+ const size_t hi0,
+ const size_t lo1,
+ const size_t hi1) {
+ assert(hi0 > lo0);
+ assert(hi0 <= n);
+ assert(hi1 > lo1);
+ assert(hi1 <= m);
+
+ for (size_t i = lo0; i < hi0; i++) {
+ for (size_t j = lo1; j < hi1; j++) {
+ const E &va = a[i][j];
+ const E &vb = b[i][j];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << i << "][" << j << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Arrays (2D) are element-wise equal
+// outside the index interval [lo0, hi0) x [lo1, hi1) (Cartesian product)
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult ArraysEqOutside(const E (&a)[n][m],
+ const E (&b)[n][m],
+ const size_t lo0,
+ const size_t hi0,
+ const size_t lo1,
+ const size_t hi1) {
+ assert(hi0 > lo0);
+ assert(hi0 <= n);
+ assert(hi1 > lo1);
+ assert(hi1 <= m);
+
+ for (size_t i = 0; i < n; i++) {
+ if (lo0 <= i && i < hi0)
+ continue;
+
+ for (size_t j = 0; j < m; j++) {
+ if (lo1 <= j && j < hi1)
+ continue;
+
+ const E &va = a[i][j];
+ const E &vb = b[i][j];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << i << "][" << j << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Non-contiguous 2D array buffers are element-wise equal
+// at corresponding linear indices specified by rows/cols/stride/offset
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqWithin(const E (&a)[n][m],
+ const E (&b)[n][m],
+ const size_t stridea,
+ const size_t strideb,
+ const size_t offseta,
+ const size_t offsetb,
+ const size_t rows,
+ const size_t cols) {
+ assert(rows <= n);
+ assert(cols <= m);
+ assert(stridea <= m);
+ assert(strideb <= m);
+ assert(cols <= stridea);
+ assert(cols <= strideb);
+ assert(offseta < n * m);
+ assert(offsetb < n * m);
+ assert(offseta + (rows - 1) * stridea + (cols - 1) < n * m);
+ assert(offsetb + (rows - 1) * strideb + (cols - 1) < n * m);
+
+ const E *pa = &a[0][0] + offseta;
+ const E *pb = &b[0][0] + offsetb;
+
+ for (size_t r = 0 ; r < rows ; r++) {
+ for (size_t c = 0 ; c < cols ; c++) {
+ const E &va = pa[c];
+ const E &vb = pb[c];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at linear index "
+ << "[" << pa - &a[0][0] << "] vs [" << pb - &b[0][0] << "]"
+ << " row=" << r << " col=" << c
+ << " values are: " << va << " vs " << vb;
+ }
+ }
+ pa += stridea;
+ pb += strideb;
+ }
+
+ return ::testing::AssertionSuccess();
+}
+
+// Non-contiguous 2D array buffers are element-wise equal
+// except at corresponding linear indices specified by
+// rows/cols/stride/offset.
+template<typename E, size_t n, size_t m>
+::testing::AssertionResult BuffersEqOutside(const E (&a)[n][m],
+ const E (&b)[n][m],
+ const size_t stride,
+ const size_t offset,
+ const size_t rows,
+                                            const size_t cols) {
+ assert(rows <= n);
+ assert(cols <= m);
+ assert(stride <= m);
+ assert(cols <= stride);
+ assert(offset < n * m);
+ assert(offset + (rows - 1) * stride + (cols - 1) < n * m);
+
+ const E *const pa = &a[0][0];
+ const E *const pb = &b[0][0];
+
+ size_t idx = 0;
+ size_t r = 0;
+ size_t end = offset; // beginning of first row
+
+ while (idx < n * m) {
+ while (idx < end) { // until beginning of row or end of buffer
+ const E &va = pa[idx];
+ const E &vb = pb[idx];
+ if (va != vb) {
+ return ::testing::AssertionFailure()
+ << "Arrays do not equal at index "
+ << "[" << idx / m << "][" << idx % m << "]"
+ << " values are: " << va << " vs " << vb;
+ }
+
+ idx++;
+ }
+
+ // Move past row end
+ idx += cols;
+
+ if (++r < rows) {
+ // Move to next row
+ end += stride;
+ } else {
+ // Move to end of buffer
+ end = n * m;
+ }
+ }
+
+ // Sanity check
+ assert(idx == n * m + cols);
+
+ return ::testing::AssertionSuccess();
+}
+
+} // namespace assertion_helpers
+} // namespace libvpx_test
+
+#endif // TEST_ASSERTION_HELPERS_H_
diff --git a/test/blend_mask6_test.cc b/test/blend_mask6_test.cc
new file mode 100644
index 0000000..d737ddd
--- /dev/null
+++ b/test/blend_mask6_test.cc
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/randomise.h"
+#include "test/snapshot.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/assertion_helpers.h"
+#include "vp10/common/enums.h"
+
+using libvpx_test::assertion_helpers::BuffersEqWithin;
+using libvpx_test::assertion_helpers::BuffersEqOutside;
+using libvpx_test::assertion_helpers::ArraysEq;
+using libvpx_test::FunctionEquivalenceTest;
+using libvpx_test::Snapshot;
+using libvpx_test::Randomise;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendMask6Test : public FunctionEquivalenceTest<F> {
+ protected:
+ virtual ~BlendMask6Test() {}
+
+ virtual void Execute(T *p_src0, T *p_src1) = 0;
+
+ void Common() {
+ w = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+ h = 1 << randomise.uniform<int>(2, MAX_SB_SIZE_LOG2 + 1);
+
+ randomise(subx);
+ randomise(suby);
+
+ randomise(dst_offset, 0, 32);
+ randomise(dst_stride, w, MAX_SB_SIZE * 5 + 1);
+
+ randomise(src0_offset, 0, 32);
+ randomise(src0_stride, w, MAX_SB_SIZE * 5 + 1);
+
+ randomise(src1_offset, 0, 32);
+ randomise(src1_stride, w, MAX_SB_SIZE * 5 + 1);
+
+    randomise(mask_stride, w * (subx ? 2 : 1), 2 * MAX_SB_SIZE + 1);
+
+ T *p_src0;
+ T *p_src1;
+
+ switch (randomise.uniform<int>(3)) {
+ case 0: // Separate sources
+ p_src0 = &src0[0][0];
+ p_src1 = &src1[0][0];
+ break;
+ case 1: // src0 == dst
+ p_src0 = &dst_tst[0][0];
+ src0_stride = dst_stride;
+ src0_offset = dst_offset;
+ p_src1 = &src1[0][0];
+ break;
+ case 2: // src1 == dst
+ p_src0 = &src0[0][0];
+ p_src1 = &dst_tst[0][0];
+ src1_stride = dst_stride;
+ src1_offset = dst_offset;
+ break;
+ default:
+ FAIL();
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ // Prepare
+ //////////////////////////////////////////////////////////////////////////
+
+ snapshot(dst_ref);
+ snapshot(dst_tst);
+
+ snapshot(src0);
+ snapshot(src1);
+
+ snapshot(mask);
+
+ //////////////////////////////////////////////////////////////////////////
+ // Execute
+ //////////////////////////////////////////////////////////////////////////
+
+ Execute(p_src0, p_src1);
+
+ //////////////////////////////////////////////////////////////////////////
+ // Check
+ //////////////////////////////////////////////////////////////////////////
+
+ ASSERT_TRUE(BuffersEqWithin(dst_ref, dst_tst,
+ dst_stride, dst_stride,
+ dst_offset, dst_offset,
+ h, w));
+
+ ASSERT_TRUE(ArraysEq(snapshot.get(src0), src0));
+ ASSERT_TRUE(ArraysEq(snapshot.get(src1), src1));
+ ASSERT_TRUE(ArraysEq(snapshot.get(mask), mask));
+
+ ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_ref), dst_ref,
+ dst_stride,
+ dst_offset,
+ h, w));
+
+ ASSERT_TRUE(BuffersEqOutside(snapshot.get(dst_tst), dst_tst,
+ dst_stride,
+ dst_offset,
+ h, w));
+ }
+
+ Snapshot snapshot;
+ Randomise randomise;
+
+ T dst_ref[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ T dst_tst[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ size_t dst_stride;
+ size_t dst_offset;
+
+ T src0[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ size_t src0_stride;
+ size_t src0_offset;
+
+ T src1[MAX_SB_SIZE][MAX_SB_SIZE * 5];
+ size_t src1_stride;
+ size_t src1_offset;
+
+ uint8_t mask[2 * MAX_SB_SIZE][2 * MAX_SB_SIZE];
+ size_t mask_stride;
+
+ int w;
+ int h;
+
+ bool suby;
+ bool subx;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx);
+
+class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+ protected:
+ void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+ ref_func_(&dst_ref[0][dst_offset], dst_stride,
+ p_src0 + src0_offset, src0_stride,
+ p_src1 + src1_offset, src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx);
+
+ ASM_REGISTER_STATE_CHECK(
+ tst_func_(&dst_tst[0][dst_offset], dst_stride,
+ p_src0 + src0_offset, src0_stride,
+ p_src1 + src1_offset, src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx));
+ }
+};
+
+TEST_P(BlendMask6Test8B, RandomValues) {
+ for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ randomise(dst_ref);
+ randomise(dst_tst);
+
+ randomise(src0);
+ randomise(src1);
+
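+    // Mask weights are 6-bit, so valid values lie in [0, 64].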
+ randomise(mask, 65);
+
+ Common();
+ }
+}
+
+TEST_P(BlendMask6Test8B, ExtremeValues) {
+ for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ randomise(dst_ref, 254, 256);
+ randomise(dst_tst, 254, 256);
+
+ randomise(src0, 254, 256);
+ randomise(src1, 254, 256);
+
+ randomise(mask, 63, 65);
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1_C_COMPARE, BlendMask6Test8B,
+ ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd);
+
+class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+ protected:
+ void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+ ref_func_(CONVERT_TO_BYTEPTR(&dst_ref[0][dst_offset]), dst_stride,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx, bit_depth);
+
+ ASM_REGISTER_STATE_CHECK(
+ tst_func_(CONVERT_TO_BYTEPTR(&dst_tst[0][dst_offset]), dst_stride,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset), src0_stride,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset), src1_stride,
+ &mask[0][0], sizeof(mask[0]),
+ h, w, suby, subx, bit_depth));
+ }
+
+ int bit_depth;
+};
+
+TEST_P(BlendMask6TestHBD, RandomValues) {
+ for (int i = 0 ; i < 10000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ bit_depth = randomise.choice(8, 10, 12);
+
+ const int hi = 1 << bit_depth;
+
+ randomise(dst_ref, hi);
+ randomise(dst_tst, hi);
+
+ randomise(src0, hi);
+ randomise(src1, hi);
+
+ randomise(mask, 65);
+
+ Common();
+ }
+}
+
+TEST_P(BlendMask6TestHBD, ExtremeValues) {
+ for (int i = 0 ; i < 1000 && !HasFatalFailure(); i++) {
+ //////////////////////////////////////////////////////////////////////////
+ // Randomise
+ //////////////////////////////////////////////////////////////////////////
+
+ bit_depth = randomise.choice(8, 10, 12);
+
+ const int hi = 1 << bit_depth;
+ const int lo = hi - 2;
+
+ randomise(dst_ref, lo, hi);
+ randomise(dst_tst, lo, hi);
+
+ randomise(src0, lo, hi);
+ randomise(src1, lo, hi);
+
+ randomise(mask, 63, 65);
+
+ Common();
+ }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1_C_COMPARE, BlendMask6TestHBD,
+ ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
+ &vpx_highbd_blend_mask6_sse4_1)));
+#endif // HAVE_SSE4_1
+#endif // CONFIG_VP9_HIGHBITDEPTH
+} // namespace
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
new file mode 100644
index 0000000..50ad4c5
--- /dev/null
+++ b/test/function_equivalence_test.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+namespace libvpx_test {
+template <typename T>
+class FunctionEquivalenceTest :
+ public ::testing::TestWithParam< std::tr1::tuple< T, T > > {
+ public:
+ virtual ~FunctionEquivalenceTest() {}
+
+ virtual void SetUp() {
+ ref_func_ = std::tr1::get<0>(this->GetParam());
+ tst_func_ = std::tr1::get<1>(this->GetParam());
+ }
+
+ virtual void TearDown() {
+ libvpx_test::ClearSystemState();
+ }
+
+ protected:
+ T ref_func_;
+ T tst_func_;
+};
+
+} // namespace libvpx_test
+#endif // TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/test/randomise.h b/test/randomise.h
new file mode 100644
index 0000000..fbf419c
--- /dev/null
+++ b/test/randomise.h
@@ -0,0 +1,207 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_RANDOMISE_H_
+#define TEST_RANDOMISE_H_
+
+#include <assert.h>
+#include <stdint.h>
+
+#include <limits>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+
+namespace libvpx_test {
+
+// TODO(any): Replace this when built with C++11
+#define STATIC_ASSERT_INTEGER_TYPE_(T) \
+ GTEST_COMPILE_ASSERT_(std::numeric_limits<T>::is_integer, \
+ integer_type_required);
+
+/**
+ * Deterministic random number generator with various convenience methods.
+ */
+class Randomise {
+ public:
+ Randomise() {
+ rnd_.Reset(ACMRandom::DeterministicSeed());
+ }
+
+ virtual ~Randomise() { }
+
+ // Uniformly distributed random number from the range
+ // [std::numeric_limits<R>::min(), and std::numeric_limits<R>::max()]
+ template<typename R>
+ R uniform() {
+ STATIC_ASSERT_INTEGER_TYPE_(R);
+ }
+
+ // Uniformly distributed random number from the range
+ // [0, hi)
+ template<typename R, typename H>
+ R uniform(H hi) {
+ assert(hi > 0);
+ R v = uniform<R>();
+ if (std::numeric_limits<R>::is_signed && v < 0)
+ return -v % hi;
+ else
+ return v % hi;
+ }
+
+ // Uniformly distributed random number from the range
+ // [lo, hi)
+ template<typename R, typename L, typename H>
+ R uniform(L lo, H hi) {
+ assert(hi > lo);
+ return uniform<R, H>(hi - lo) + lo;
+ }
+
+ // Randomly pick and return one of the arguments
+ template<typename T>
+ T choice(T v0, T v1) {
+ switch (uniform<int>(2)) {
+ case 0: return v0;
+ default: return v1;
+ }
+ }
+
+ // Randomly pick and return one of the arguments
+ template<typename T>
+ T choice(T v0, T v1, T v2) {
+ switch (uniform<int>(3)) {
+ case 0: return v0;
+ case 1: return v1;
+ default: return v2;
+ }
+ }
+
+ template<typename T>
+ void operator()(T &e) { // NOLINT
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ e = uniform<T>();
+ }
+
+ template<typename T, typename H>
+ void operator()(T &e, H hi) { // NOLINT
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ e = uniform<T, H>(hi);
+ }
+
+ template<typename T, typename L, typename H>
+ void operator()(T &e, L lo, H hi) { // NOLINT
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ e = uniform<T, L, H>(lo, hi);
+ }
+
+ template<typename T, size_t n>
+ void operator()(T (&arr)[n]) {
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ for (size_t i = 0; i < n ; i++) {
+ arr[i] = uniform<T>();
+ }
+ }
+
+ template<typename T, size_t n, typename H>
+ void operator()(T (&arr)[n], H hi) {
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ for (size_t i = 0; i < n ; i++) {
+ arr[i] = uniform<T, H>(hi);
+ }
+ }
+
+ template<typename T, size_t n, typename L, typename H>
+ void operator()(T (&arr)[n], L lo, H hi) {
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ for (size_t i = 0; i < n ; i++) {
+ arr[i] = uniform<T, L, H>(lo, hi);
+ }
+ }
+
+ template<typename T, size_t n, size_t m>
+ void operator()(T (&arr)[n][m]) {
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ for (size_t i = 0; i < n ; i++) {
+ for (size_t j = 0; j < m ; j++) {
+ arr[i][j] = uniform<T>();
+ }
+ }
+ }
+
+ template<typename T, size_t n, size_t m, typename H>
+ void operator()(T (&arr)[n][m], H hi) {
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ for (size_t i = 0; i < n ; i++) {
+ for (size_t j = 0; j < m ; j++) {
+ arr[i][j] = uniform<T, H>(hi);
+ }
+ }
+ }
+
+ template<typename T, size_t n, size_t m, typename L, typename H>
+ void operator()(T (&arr)[n][m], L lo, H hi) {
+ STATIC_ASSERT_INTEGER_TYPE_(T);
+ for (size_t i = 0; i < n ; i++) {
+ for (size_t j = 0; j < m ; j++) {
+ arr[i][j] = uniform<T, L, H>(lo, hi);
+ }
+ }
+ }
+
+ private:
+ libvpx_test::ACMRandom rnd_;
+};
+
+// Add further specialisations as necessary
+
+template<>
+bool Randomise::uniform<bool>() {
+ return rnd_.Rand8() & 1 ? true : false;
+}
+
+template<>
+uint8_t Randomise::uniform<uint8_t>() {
+ return rnd_.Rand8();
+}
+
+template<>
+uint16_t Randomise::uniform<uint16_t>() {
+ return rnd_.Rand16();
+}
+
+template<>
+uint32_t Randomise::uniform<uint32_t>() {
+ const uint32_t l = uniform<uint16_t>();
+ const uint32_t h = uniform<uint16_t>();
+ return h << 16 | l;
+}
+
+template<>
+uint64_t Randomise::uniform<uint64_t>() {
+ const uint64_t l = uniform<uint32_t>();
+ const uint64_t h = uniform<uint32_t>();
+ return h << 32 | l;
+}
+
+template<>
+int8_t Randomise::uniform<int8_t>() { return uniform<uint8_t>(); }
+
+template<>
+int16_t Randomise::uniform<int16_t>() { return uniform<uint16_t>(); }
+
+template<>
+int32_t Randomise::uniform<int32_t>() { return uniform<uint32_t>(); }
+
+template<>
+int64_t Randomise::uniform<int64_t>() { return uniform<uint64_t>(); }
+
+} // namespace libvpx_test
+
+#endif // TEST_RANDOMISE_H_
diff --git a/test/snapshot.h b/test/snapshot.h
new file mode 100644
index 0000000..b67edde
--- /dev/null
+++ b/test/snapshot.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_SNAPSHOT_H_
+#define TEST_SNAPSHOT_H_
+
+#include <assert.h>
+#include <string.h>
+
+#include <map>
+
+namespace libvpx_test {
+
+/**
+ * Allows capturing and retrieving snapshots of arbitrary blobs of memory,
+ * blob size is based on compile time type information.
+ *
+ * Usage:
+ * void example() {
+ * Snapshot snapshot;
+ *
+ * int foo = 4;
+ *
+ * snapshot(foo);
+ *
+ * foo = 10;
+ *
+ * assert(snapshot.get(foo) == 4); // Pass
+ * assert(snapshot.get(foo) == foo); // Fail (4 != 10)
+ *
+ * char bar[10][10];
+ * memset(bar, 3, sizeof(bar));
+ *
+ * snapshot(bar);
+ *
+ * memset(bar, 8, sizeof(bar));
+ *
+ * assert(sum(bar) == 800); // Pass
+ * assert(sum(snapshot.get(bar)) == 300); // Pass
+ * }
+ */
+class Snapshot {
+ public:
+ virtual ~Snapshot() {
+ for (snapshot_map_t::iterator it = snapshots_.begin();
+ it != snapshots_.end(); it++) {
+ delete[] it->second;
+ }
+ }
+
+ /**
+ * Take new snapshot for object
+ */
+ template<typename E>
+ void take(const E &e) {
+ const void *const key = reinterpret_cast<const void*>(&e);
+
+ snapshot_map_t::iterator it = snapshots_.find(key);
+
+ if (it != snapshots_.end())
+ delete[] it->second;
+
+ char *const buf = new char[sizeof(E)];
+
+ memcpy(buf, &e, sizeof(E));
+
+ snapshots_[key] = buf;
+ }
+
+ /**
+ * Same as 'take'
+ */
+ template<typename E>
+ void operator()(const E &e) {
+ take(e);
+ }
+
+ /**
+ * Retrieve last snapshot for object
+ */
+ template<typename E>
+ const E& get(const E &e) const {
+ const void *const key = reinterpret_cast<const void*>(&e);
+
+ snapshot_map_t::const_iterator it = snapshots_.find(key);
+
+ assert(it != snapshots_.end());
+
+ return *reinterpret_cast<const E*>(it->second);
+ }
+
+ private:
+ typedef std::map<const void*, const char*> snapshot_map_t;
+
+ snapshot_map_t snapshots_;
+};
+
+} // namespace libvpx_test
+
+#endif // TEST_SNAPSHOT_H_
diff --git a/test/test.mk b/test/test.mk
index 77b00a5..339e274 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -178,6 +178,7 @@
ifeq ($(CONFIG_EXT_INTER),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
endif
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index d6ac4bb..825fff3 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -11,6 +11,7 @@
#include <assert.h>
#include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
@@ -410,119 +411,6 @@
return mask;
}
-static void build_masked_compound(uint8_t *dst, int dst_stride,
- uint8_t *dst1, int dst1_stride,
- uint8_t *dst2, int dst2_stride,
- const uint8_t *mask,
- int h, int w, int subh, int subw) {
- int i, j;
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = mask[i * MASK_MASTER_STRIDE + j];
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
-
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
- mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_highbd(uint8_t *dst_8, int dst_stride,
- uint8_t *dst1_8, int dst1_stride,
- uint8_t *dst2_8, int dst2_stride,
- const uint8_t *mask,
- int h, int w, int subh, int subw) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- uint16_t *dst1 = CONVERT_TO_SHORTPTR(dst1_8);
- uint16_t *dst2 = CONVERT_TO_SHORTPTR(dst2_8);
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = mask[i * MASK_MASTER_STRIDE + j];
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j)] +
- mask[(2 * i) * MASK_MASTER_STRIDE + (2 * j + 1)] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + (2 * j + 1)] + 2) >> 2;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[i * MASK_MASTER_STRIDE + (2 * j)] +
- mask[i * MASK_MASTER_STRIDE + (2 * j + 1)] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- int m = (mask[(2 * i) * MASK_MASTER_STRIDE + j] +
- mask[(2 * i + 1) * MASK_MASTER_STRIDE + j] + 1) >> 1;
- dst[i * dst_stride + j] = (dst1[i * dst1_stride + j] * m +
- dst2[i * dst2_stride + j] *
- ((1 << WEDGE_WEIGHT_BITS) - m) +
- (1 << (WEDGE_WEIGHT_BITS - 1))) >>
- WEDGE_WEIGHT_BITS;
- }
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
#if CONFIG_SUPERTX
static void build_masked_compound_wedge_extend(
@@ -537,9 +425,11 @@
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- build_masked_compound(dst, dst_stride,
- dst, dst_stride, dst2, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_blend_mask6(dst, dst_stride,
+ dst, dst_stride,
+ dst2, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -549,14 +439,16 @@
int wedge_index, int wedge_sign,
BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
- int h, int w) {
+ int h, int w, int bd) {
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- build_masked_compound_highbd(dst_8, dst_stride,
- dst_8, dst_stride, dst2_8, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_highbd_blend_mask6(dst_8, dst_stride,
+ dst_8, dst_stride,
+ dst2_8, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -573,9 +465,11 @@
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
sb_type, 0, 0);
- build_masked_compound(dst, dst_stride,
- dst, dst_stride, dst2, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_blend_mask6(dst, dst_stride,
+ dst, dst_stride,
+ dst2, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
@@ -583,16 +477,18 @@
uint8_t *dst2_8, int dst2_stride,
int wedge_index, int wedge_sign,
BLOCK_SIZE sb_type,
- int h, int w) {
+ int h, int w, int bd) {
// Derive subsampling from h and w passed in. May be refactored to
// pass in subsampling factors directly.
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(wedge_index, wedge_sign,
sb_type, 0, 0);
- build_masked_compound_highbd(dst_8, dst_stride,
- dst_8, dst_stride, dst2_8, dst2_stride, mask,
- h, w, subh, subw);
+ vpx_highbd_blend_mask6(dst_8, dst_stride,
+ dst_8, dst_stride,
+ dst2_8, dst2_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_SUPERTX
@@ -641,7 +537,7 @@
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
mi->mbmi.sb_type,
- wedge_offset_x, wedge_offset_y, h, w);
+ wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
else
build_masked_compound_wedge_extend(
dst, dst_stride, tmp_dst, MAX_SB_SIZE,
@@ -655,7 +551,7 @@
dst, dst_stride, tmp_dst, MAX_SB_SIZE,
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
- mi->mbmi.sb_type, h, w);
+ mi->mbmi.sb_type, h, w, xd->cur_buf->bit_depth);
else
build_masked_compound_wedge(
dst, dst_stride, tmp_dst, MAX_SB_SIZE,
@@ -1872,10 +1768,11 @@
bsize, 0, 0);
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
- build_masked_compound(comppred, compstride,
- intrapred, intrastride,
- interpred, interstride, mask,
- bh, bw, subh, subw);
+ vpx_blend_mask6(comppred, compstride,
+ intrapred, intrastride,
+ interpred, interstride,
+ mask, MASK_MASTER_STRIDE,
+ bh, bw, subh, subw);
}
return;
}
@@ -1995,7 +1892,6 @@
uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
- (void) bd;
if (use_wedge_interintra) {
if (is_interintra_wedge_used(bsize)) {
@@ -2003,10 +1899,11 @@
bsize, 0, 0);
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
- build_masked_compound_highbd(comppred8, compstride,
- intrapred8, intrastride,
- interpred8, interstride, mask,
- bh, bw, subh, subw);
+ vpx_highbd_blend_mask6(comppred8, compstride,
+ intrapred8, intrastride,
+ interpred8, interstride,
+ mask, MASK_MASTER_STRIDE,
+ bh, bw, subh, subw, bd);
}
return;
}
@@ -2460,7 +2357,7 @@
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
mi->mbmi.sb_type,
- wedge_offset_x, wedge_offset_y, h, w);
+ wedge_offset_x, wedge_offset_y, h, w, xd->cur_buf->bit_depth);
} else {
build_masked_compound_wedge_extend(
dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
@@ -2484,7 +2381,8 @@
MAX_SB_SIZE,
mi->mbmi.interinter_wedge_index,
mi->mbmi.interinter_wedge_sign,
- mi->mbmi.sb_type, h, w);
+ mi->mbmi.sb_type, h, w,
+ xd->cur_buf->bit_depth);
else
#endif // CONFIG_VP9_HIGHBITDEPTH
build_masked_compound_wedge(dst, dst_buf->stride, tmp_dst, MAX_SB_SIZE,
diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c
new file mode 100644
index 0000000..584ee6a
--- /dev/null
+++ b/vpx_dsp/blend_mask6.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
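+// Blend weights are 6-bit: each pair of per-pixel weights sums to
+// (1 << MASK_BITS).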
+#define MASK_BITS 6
+
+void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int subh, int subw) {
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
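+  // Four cases: mask at the same resolution as the destination, sub-sampled
+  // 2x in both directions, 2x horizontally only, or 2x vertically only.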
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 = mask[i * mask_stride + j];
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 =
+ ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+ mask[i * mask_stride + (2 * j + 1)], 1);
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ } else {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+ mask[(2 * i + 1) * mask_stride + j], 1);
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
+ uint8_t *src0_8, uint32_t src0_stride,
+ uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int subh, int subw, int bd) {
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+ uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
+ if (subw == 0 && subh == 0) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 = mask[i * mask_stride + j];
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ } else if (subw == 1 && subh == 1) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ } else if (subw == 1 && subh == 0) {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 =
+ ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
+ mask[i * mask_stride + (2 * j + 1)], 1);
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ } else {
+ for (i = 0; i < h; ++i)
+ for (j = 0; j < w; ++j) {
+ const int m0 =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
+ mask[(2 * i + 1) * mask_stride + j], 1);
+ const int m1 = ((1 << MASK_BITS) - m0);
+ dst[i * dst_stride + j] =
+ ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
+ src1[i * src1_stride + j] * m1, MASK_BITS);
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 46ef5fc..430cae1 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -65,6 +65,15 @@
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.h
DSP_SRCS-$(HAVE_DSPR2) += mips/common_dspr2.c
+# inter predictions
+
+ifeq ($(CONFIG_VP10),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-yes += blend_mask6.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
+endif #CONFIG_EXT_INTER
+endif #CONFIG_VP10
+
# interpolation filters
DSP_SRCS-yes += vpx_convolve.c
DSP_SRCS-yes += vpx_convolve.h
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 3571eea..7aaa89f 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -32,6 +32,8 @@
#define IMPLIES(a, b) (!(a) || (b)) // Logical 'a implies b' (or 'a -> b')
+#define IS_POWER_OF_TWO(x) (((x) & ((x) - 1)) == 0)
+
// These can be used to give a hint about branch outcomes.
// This can have an effect, even if your target processor has a
// good branch predictor, as these hints can affect basic block
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index ad524a2..7bae037 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -1358,10 +1358,10 @@
}
} # CONFIG_VP9_HIGHBITDEPTH
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
#
# Masked Variance / Masked Subpixel Variance
#
-if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
foreach (@block_sizes) {
($w, $h) = @$_;
add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
@@ -1381,6 +1381,14 @@
}
}
}
+
+ add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+ specialize "vpx_blend_mask6", qw/sse4_1/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+ specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
+ }
}
#
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_mask6_sse4.c
new file mode 100644
index 0000000..5de3e23
--- /dev/null
+++ b/vpx_dsp/x86/blend_mask6_sse4.c
@@ -0,0 +1,1146 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+#define MASK_BITS 6
+
+static INLINE __m128i mm_loadl_32(const void *a) {
+ return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i mm_loadl_64(const void *a) {
+ return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i mm_loadu_128(const void *a) {
+ return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void mm_storel_32(void *const a, const __m128i v) {
+ *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void mm_storel_64(void *const a, const __m128i v) {
+ _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void mm_storeu_128(void *const a, const __m128i v) {
+ _mm_storeu_si128((__m128i*)a, v);
+}
+
+static INLINE __m128i mm_round_epu16(__m128i v_val_w) {
+ return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
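+// Shift right by 'bits' with rounding to nearest, i.e. ROUND_POWER_OF_TWO()
+// on each unsigned 16-bit lane: the (bits - 1) shift followed by an average
+// against zero adds the rounding bias.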
+static INLINE __m128i mm_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+ return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
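+// Blend 4 (blend_4) or 8 (blend_8) pixels: widen the 8-bit source pixels to
+// 16 bits, weight them by the two mask vectors and shift the sum right by
+// MASK_BITS with rounding.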
+static INLINE __m128i blend_4(uint8_t *src0, uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = mm_loadl_32(src0);
+ const __m128i v_s1_b = mm_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8(uint8_t *src0, uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = mm_loadl_64(src0);
+ const __m128i v_s1_b = mm_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_mask6_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_m0_b = mm_loadl_64(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_m0l_b = mm_loadl_64(mask + c);
+ const __m128i v_m0h_b = mm_loadl_64(mask + c + 8);
+ const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
+ const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
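+// The mask is sub-sampled 2x horizontally: average each pair of adjacent
+// mask bytes, then keep only the even byte positions as 16-bit lanes
+// (v_zmask_b).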
+static void blend_mask6_sx_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_r_b = mm_loadu_128(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_rl_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_rh_b = mm_loadu_128(mask + 2 * c + 16);
+ const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
+ const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+
+ const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
+ const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
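+// The mask is sub-sampled 2x vertically: average the two mask rows that
+// cover each destination row.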
+static void blend_mask6_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = mm_loadl_32(mask);
+ const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = mm_loadl_64(mask);
+ const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zero = _mm_setzero_si128();
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ra_b = mm_loadu_128(mask + c);
+ const __m128i v_rb_b = mm_loadu_128(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
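+// The mask is sub-sampled 2x in both directions: sum each 2x2 mask
+// neighbourhood (values are at most 64, so the byte additions cannot
+// overflow) and divide by 4 with rounding.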
+static void blend_mask6_sx_sy_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = mm_loadl_64(mask);
+ const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_32(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_sy_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ (void)w;
+
+ do {
+ const __m128i v_ra_b = mm_loadu_128(mask);
+ const __m128i v_rb_b = mm_loadu_128(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+ mm_storel_64(dst, v_res_b);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_sx_sy_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 16) {
+ const __m128i v_ral_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_rah_b = mm_loadu_128(mask + 2 * c + 16);
+ const __m128i v_rbl_b = mm_loadu_128(mask + mask_stride + 2 * c);
+ const __m128i v_rbh_b = mm_loadu_128(mask + mask_stride + 2 * c + 16);
+ const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+ const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+ const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+ const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+ const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1),
+ v_zmask_b);
+ const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1),
+ v_zmask_b);
+ const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+ const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+ const __m128i v_m0l_w = mm_roundn_epu16(v_rsl_w, 2);
+ const __m128i v_m0h_w = mm_roundn_epu16(v_rsh_w, 2);
+ const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+ const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0l_w, v_m1l_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0h_w, v_m1h_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+ mm_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx) {
+ typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+ uint8_t *src0, uint32_t src0_stride,
+ uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w);
+
+ static blend_fn blend[3][2][2] = { // width_index X subx X suby
+ { // w % 16 == 0
+ {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
+ {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+ }, { // w == 4
+ {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
+ {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+ }, { // w == 8
+ {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
+ {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+ }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
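+ // Width index: (w >> 2) & 3 maps w == 4 to 1, w == 8 to 2, and any
+ // power-of-two w >= 16 to 0, matching the table layout above.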
+ blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+}
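+
+// Example (hypothetical caller, for illustration only): blending two 16x16
+// predictors using a mask stored at twice the block resolution, with both
+// sub-sampling flags set:
+//
+//   vpx_blend_mask6_sse4_1(dst, dst_stride, pred0, 16, pred1, 16,
+//                          mask, 32, 16, 16, 1, 1);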
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*blend_unit_fn)(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w);
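+
+// The helpers below are generic INLINE kernels parameterized by a
+// blend_unit_fn; the thin b10/b12 wrappers bind a concrete unit so that,
+// once the kernel is inlined, the indirect call should reduce to a direct
+// one.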
+
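+// blend_4_b10/blend_8_b10 also serve 8 bit data: with the mask bounded by
+// 1 << MASK_BITS (64) and samples bounded by 1023, the sum
+// m0 * s0 + m1 * s1 is at most 64 * 1023 = 65472, so the plain 16-bit
+// multiplies and adds cannot overflow before the rounding shift by
+// MASK_BITS.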
+static INLINE __m128i blend_4_b10(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadl_64(src0);
+ const __m128i v_s1_w = mm_loadl_64(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadu_128(src0);
+ const __m128i v_s1_w = mm_loadu_128(src1);
+
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+ const __m128i v_res_w = mm_roundn_epu16(v_sum_w, MASK_BITS);
+
+ return v_res_w;
+}
+
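+// For 12 bit samples m * s can reach 64 * 4095 = 262080, which overflows
+// 16 bits. The b12 kernels therefore interleave mask and sample words and
+// accumulate with _mm_madd_epi16 in 32 bits, pre-shift by MASK_BITS - 1 (the
+// result, at most 8190, still fits a signed 16-bit lane when packed), and
+// leave the last bit to mm_round_epu16 (defined earlier in this file) so the
+// overall result matches a rounded shift by MASK_BITS.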
+static INLINE __m128i blend_4_b12(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadl_64(src0);
+ const __m128i v_s1_w = mm_loadl_64(src1);
+
+ // Interleave
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale
+ const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round
+ const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(uint16_t *src0, uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = mm_loadu_128(src0);
+ const __m128i v_s1_w = mm_loadu_128(src1);
+
+ // Interleave
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale
+ const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
+ const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
+
+ // Pack
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round
+ const __m128i v_res_w = mm_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_mask6_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_m0_b = mm_loadl_32(mask);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_m0_b = mm_loadl_64(mask + c);
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
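+// Horizontal-only sub-sampling: each output pixel covers two horizontally
+// adjacent mask bytes, which _mm_avg_epu8 averages with rounding; v_zmask_b
+// then clears the odd bytes, leaving one 16-bit mask word per pixel.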
+static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_r_b = mm_loadl_64(mask);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_r_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+ const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
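+// Vertical-only sub-sampling: the two mask rows covering each output row are
+// averaged bytewise with _mm_avg_epu8 (rounded), then widened to 16 bits.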
+static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_ra_b = mm_loadl_32(mask);
+ const __m128i v_rb_b = mm_loadl_32(mask + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = mm_loadl_64(mask + c);
+ const __m128i v_rb_b = mm_loadl_64(mask + c + mask_stride);
+ const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+ const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
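+// 2x2 sub-sampling: the same rounded four-sample mask average as the low
+// bitdepth sx_sy kernels above (bytewise row sum, even/odd lane split,
+// rounding shift by 2), applied before the high bitdepth blend unit.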
+static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ const __m128i v_ra_b = mm_loadl_64(mask);
+ const __m128i v_rb_b = mm_loadl_64(mask + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ mm_storel_64(dst, v_res_w);
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ (void)w;
+ blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
+}
+
+static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, blend_unit_fn blend) {
+ const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+ 0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+ const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+
+ do {
+ int c;
+ for (c = 0; c < w; c += 8) {
+ const __m128i v_ra_b = mm_loadu_128(mask + 2 * c);
+ const __m128i v_rb_b = mm_loadu_128(mask + 2 * c + mask_stride);
+ const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+ const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+ const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+ v_zmask_b);
+ const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+ const __m128i v_m0_w = mm_roundn_epu16(v_rs_w, 2);
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ mm_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 2 * mask_stride;
+ } while (--h);
+}
+
+static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
+}
+
+static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w) {
+ blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
+ uint8_t *src0_8, uint32_t src0_stride,
+ uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd) {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+ uint16_t *src0, uint32_t src0_stride,
+ uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w);
+
+ static blend_fn blend[2][2][2][2] = { // bd_index X width_index X subx X suby
+ { // bd == 8 or 10
+ { // w % 8 == 0
+ {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
+ {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+ }, { // w == 4
+ {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
+ {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+ }
+ },
+ { // bd == 12
+ { // w % 8 == 0
+ {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
+ {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+ }, { // w == 4
+ {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
+ {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+ }
+ }
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 4);
+ assert(w >= 4);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+
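+ // bd == 12 selects the 32-bit madd kernels; 8 and 10 bit data share the
+ // 16-bit b10 kernels. (w >> 2) & 1 maps w == 4 to 1 and any other
+ // power-of-two width to 0.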
+ blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH