Merge "Cosmetics for vp10/common/vp10_rtcd_defs.pl" into nextgenv2
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 0000000..03e9b7d
--- /dev/null
+++ b/test/blend_a64_mask_1d_test.cc
@@ -0,0 +1,374 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+using std::tr1::make_tuple;
+
+namespace {
+
+template<typename F, typename T>
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+ static const int kIterations = 10000;
+ static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides
+ static const int kMaxHeight = MAX_SB_SIZE;
+ static const int kBufSize = kMaxWidth * kMaxHeight;
+ static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+ static const int kMaxMaskSize = kMaxMaskWidth;
+
+ BlendA64Mask1DTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+ virtual ~BlendA64Mask1DTest() {}
+
+ virtual void Execute(T *p_src0, T *p_src1) = 0;
+
+ void Common() {
+ w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+ h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+
+ dst_offset_ = rng_(33);
+ dst_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+ src0_offset_ = rng_(33);
+ src0_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+ src1_offset_ = rng_(33);
+ src1_stride_ = rng_(kMaxWidth + 1 - w_) + w_;
+
+ T *p_src0;
+ T *p_src1;
+
+ switch (rng_(3)) {
+ case 0: // Separate sources
+ p_src0 = src0_;
+ p_src1 = src1_;
+ break;
+ case 1: // src0 == dst
+ p_src0 = dst_tst_;
+ src0_stride_ = dst_stride_;
+ src0_offset_ = dst_offset_;
+ p_src1 = src1_;
+ break;
+ case 2: // src1 == dst
+ p_src0 = src0_;
+ p_src1 = dst_tst_;
+ src1_stride_ = dst_stride_;
+ src1_offset_ = dst_offset_;
+ break;
+ default:
+ FAIL();
+ }
+
+ Execute(p_src0, p_src1);
+
+ for (int r = 0 ; r < h_ ; ++r) {
+ for (int c = 0 ; c < w_ ; ++c) {
+ ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+ dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+ }
+ }
+ }
+
+ ACMRandom rng_;
+
+ T dst_ref_[kBufSize];
+ T dst_tst_[kBufSize];
+ size_t dst_stride_;
+ size_t dst_offset_;
+
+ T src0_[kBufSize];
+ size_t src0_stride_;
+ size_t src0_offset_;
+
+ T src1_[kBufSize];
+ size_t src1_stride_;
+ size_t src1_offset_;
+
+ uint8_t mask_[kMaxMaskSize];
+
+ int w_;
+ int h_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8-bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+ void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+ ref_func_(dst_ref_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_,
+ mask_, h_, w_);
+
+ tst_func_(dst_tst_ + dst_offset_, dst_stride_,
+ p_src0 + src0_offset_, src0_stride_,
+ p_src1 + src1_offset_, src1_stride_,
+ mask_, h_, w_);
+ }
+};
+
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+ for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_.Rand8();
+ dst_tst_[i] = rng_.Rand8();
+
+ src0_[i] = rng_.Rand8();
+ src1_[i] = rng_.Rand8();
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+ Common();
+ }
+}
+
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+ for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_(2) + 254;
+ dst_tst_[i] = rng_(2) + 254;
+ src0_[i] = rng_(2) + 254;
+ src1_[i] = rng_(2) + 254;
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+ Common();
+ }
+}
+
+static void blend_a64_hmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+ [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[col];
+
+ vpx_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+ h, w, 0, 0);
+}
+
+static void blend_a64_vmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+ [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[row];
+
+ vpx_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+ h, w, 0, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64Mask1DTest8B,
+ ::testing::Values(
+ make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_c),
+ make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64Mask1DTest8B,
+ ::testing::Values(
+ make_tuple(blend_a64_hmask_ref, vpx_blend_a64_hmask_sse4_1),
+ make_tuple(blend_a64_vmask_ref, vpx_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd);
+
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+ void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+ ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, h_, w_, bit_depth_);
+
+ ASM_REGISTER_STATE_CHECK(
+ tst_func_(CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+ CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+ CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+ mask_, h_, w_, bit_depth_));
+ }
+
+ int bit_depth_;
+};
+
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+ for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0:
+ bit_depth_ = 8;
+ break;
+ case 1:
+ bit_depth_ = 10;
+ break;
+ default:
+ bit_depth_ = 12;
+ break;
+ }
+
+ const int hi = 1 << bit_depth_;
+
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_(hi);
+ dst_tst_[i] = rng_(hi);
+ src0_[i] = rng_(hi);
+ src1_[i] = rng_(hi);
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+ Common();
+ }
+}
+
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+ for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+ switch (rng_(3)) {
+ case 0:
+ bit_depth_ = 8;
+ break;
+ case 1:
+ bit_depth_ = 10;
+ break;
+ default:
+ bit_depth_ = 12;
+ break;
+ }
+
+ const int hi = 1 << bit_depth_;
+ const int lo = hi - 2;
+
+ for (int i = 0 ; i < kBufSize ; ++i) {
+ dst_ref_[i] = rng_(hi - lo) + lo;
+ dst_tst_[i] = rng_(hi - lo) + lo;
+ src0_[i] = rng_(hi - lo) + lo;
+ src1_[i] = rng_(hi - lo) + lo;
+ }
+
+ for (int i = 0 ; i < kMaxMaskSize ; ++i)
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+ Common();
+ }
+}
+
+static void highbd_blend_a64_hmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+ [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[col];
+
+ vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0],
+ BlendA64Mask1DTestHBD::kMaxMaskSize,
+ h, w, 0, 0, bd);
+}
+
+static void highbd_blend_a64_vmask_ref(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+ [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+ for (int row = 0 ; row < h ; ++row)
+ for (int col = 0 ; col < w ; ++col)
+ mask2d[row][col] = mask[row];
+
+ vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ &mask2d[0][0],
+ BlendA64Mask1DTestHBD::kMaxMaskSize,
+ h, w, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_CASE_P(
+ C, BlendA64Mask1DTestHBD,
+ ::testing::Values(
+ make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_c),
+ make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, BlendA64Mask1DTestHBD,
+ ::testing::Values(
+ make_tuple(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_sse4_1),
+ make_tuple(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_sse4_1)));
+#endif // HAVE_SSE4_1
+
+#endif // CONFIG_VP9_HIGHBITDEPTH
+} // namespace
diff --git a/test/blend_mask6_test.cc b/test/blend_a64_mask_test.cc
similarity index 77%
rename from test/blend_mask6_test.cc
rename to test/blend_a64_mask_test.cc
index 6afaad7..08ee91d 100644
--- a/test/blend_mask6_test.cc
+++ b/test/blend_a64_mask_test.cc
@@ -26,6 +26,8 @@
#include "test/acm_random.h"
#include "vp10/common/enums.h"
+#include "vpx_dsp/blend.h"
+
using libvpx_test::ACMRandom;
using libvpx_test::FunctionEquivalenceTest;
using std::tr1::make_tuple;
@@ -33,7 +35,7 @@
namespace {
template<typename F, typename T>
-class BlendMask6Test : public FunctionEquivalenceTest<F> {
+class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
protected:
static const int kIterations = 10000;
static const int kMaxWidth = MAX_SB_SIZE * 5; // * 5 to cover longer strides
@@ -42,15 +44,15 @@
static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
- BlendMask6Test() : rng_(ACMRandom::DeterministicSeed()) {}
+ BlendA64MaskTest() : rng_(ACMRandom::DeterministicSeed()) {}
- virtual ~BlendMask6Test() {}
+ virtual ~BlendA64MaskTest() {}
- virtual void Execute(T *p_src0, T *p_src1) = 0;
+ virtual void Execute(const T *p_src0, const T *p_src1) = 0;
void Common() {
- w_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
- h_ = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 2) + 2);
+ w_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
+ h_ = 1 << rng_(MAX_SB_SIZE_LOG2 + 1);
subx_ = rng_(2);
suby_ = rng_(2);
@@ -131,14 +133,14 @@
//////////////////////////////////////////////////////////////////////////////
typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx);
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx);
-class BlendMask6Test8B : public BlendMask6Test<F8B, uint8_t> {
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
protected:
- void Execute(uint8_t *p_src0, uint8_t *p_src1) {
+ void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
ref_func_(dst_ref_ + dst_offset_, dst_stride_,
p_src0 + src0_offset_, src0_stride_,
p_src1 + src1_offset_, src1_stride_,
@@ -153,7 +155,7 @@
}
};
-TEST_P(BlendMask6Test8B, RandomValues) {
+TEST_P(BlendA64MaskTest8B, RandomValues) {
for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
for (int i = 0 ; i < kBufSize ; ++i) {
dst_ref_[i] = rng_.Rand8();
@@ -164,13 +166,13 @@
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(65);
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
Common();
}
}
-TEST_P(BlendMask6Test8B, ExtremeValues) {
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
for (int i = 0 ; i < kBufSize ; ++i) {
dst_ref_[i] = rng_(2) + 254;
@@ -180,7 +182,7 @@
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(2) + 63;
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
Common();
}
@@ -188,8 +190,9 @@
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(
- SSE4_1_C_COMPARE, BlendMask6Test8B,
- ::testing::Values(make_tuple(&vpx_blend_mask6_c, &vpx_blend_mask6_sse4_1)));
+ SSE4_1_C_COMPARE, BlendA64MaskTest8B,
+ ::testing::Values(make_tuple(vpx_blend_a64_mask_c,
+ vpx_blend_a64_mask_sse4_1)));
#endif // HAVE_SSE4_1
#if CONFIG_VP9_HIGHBITDEPTH
@@ -198,14 +201,14 @@
//////////////////////////////////////////////////////////////////////////////
typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx, int bd);
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd);
-class BlendMask6TestHBD : public BlendMask6Test<FHBD, uint16_t> {
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
protected:
- void Execute(uint16_t *p_src0, uint16_t *p_src1) {
+ void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
ref_func_(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
@@ -223,7 +226,7 @@
int bit_depth_;
};
-TEST_P(BlendMask6TestHBD, RandomValues) {
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
switch (rng_(3)) {
case 0:
@@ -247,13 +250,13 @@
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(65);
+ mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
Common();
}
}
-TEST_P(BlendMask6TestHBD, ExtremeValues) {
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
switch (rng_(3)) {
case 0:
@@ -278,7 +281,7 @@
}
for (int i = 0 ; i < kMaxMaskSize ; ++i)
- mask_[i] = rng_(65);
+ mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
Common();
}
@@ -286,9 +289,9 @@
#if HAVE_SSE4_1
INSTANTIATE_TEST_CASE_P(
- SSE4_1_C_COMPARE, BlendMask6TestHBD,
- ::testing::Values(make_tuple(&vpx_highbd_blend_mask6_c,
- &vpx_highbd_blend_mask6_sse4_1)));
+ SSE4_1_C_COMPARE, BlendA64MaskTestHBD,
+ ::testing::Values(make_tuple(vpx_highbd_blend_a64_mask_c,
+ vpx_highbd_blend_a64_mask_sse4_1)));
#endif // HAVE_SSE4_1
#endif // CONFIG_VP9_HIGHBITDEPTH
} // namespace
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index cd0b136..753a7e4 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -188,8 +188,8 @@
const unsigned int w_y = img1->d_w;
const unsigned int h_y = img1->d_h;
- const unsigned int w_uv = ROUNDZ_POWER_OF_TWO(w_y, img1->x_chroma_shift);
- const unsigned int h_uv = ROUNDZ_POWER_OF_TWO(h_y, img1->y_chroma_shift);
+ const unsigned int w_uv = ROUND_POWER_OF_TWO(w_y, img1->x_chroma_shift);
+ const unsigned int h_uv = ROUND_POWER_OF_TWO(h_y, img1->y_chroma_shift);
if (img1->fmt != img2->fmt
|| img1->cs != img2->cs
diff --git a/test/test.mk b/test/test.mk
index fcd565c..67fe705 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -178,11 +178,12 @@
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_1d_test.cc
ifeq ($(CONFIG_EXT_INTER),yes)
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_mask6_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
endif
diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
index 7999087..66f267f 100644
--- a/test/vp10_convolve_optimz_test.cc
+++ b/test/vp10_convolve_optimz_test.cc
@@ -197,6 +197,7 @@
using std::tr1::make_tuple;
+#if HAVE_SSSE3 && CONFIG_EXT_INTERP
const BlockDimension kBlockDim[] = {
make_tuple(2, 2),
make_tuple(2, 4),
@@ -218,7 +219,6 @@
make_tuple(128, 128),
};
-#if HAVE_SSSE3 && CONFIG_EXT_INTERP
// 10/12-tap filters
const INTERP_FILTER kFilter[] = {6, 4, 2};
diff --git a/test/vp10_wedge_utils_test.cc b/test/vp10_wedge_utils_test.cc
index 930a598..7a541b2 100644
--- a/test/vp10_wedge_utils_test.cc
+++ b/test/vp10_wedge_utils_test.cc
@@ -104,7 +104,7 @@
p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
}
- vpx_blend_mask6(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+ vpx_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
vpx_subtract_block(h, w, r0, w, s, w, p0, w);
vpx_subtract_block(h, w, r1, w, s, w, p1, w);
diff --git a/vp10/common/quant_common.c b/vp10/common/quant_common.c
index 3093ab7..a1ce23e 100644
--- a/vp10/common/quant_common.c
+++ b/vp10/common/quant_common.c
@@ -42,55 +42,55 @@
static const qprofile_type nuq[QUANT_PROFILES][QUANT_RANGES][COEF_BANDS] = {
{
{
- {{91, 133, 139}, 11}, // dc, band 0
- {{78, 122, 134}, 12}, // band 1
- {{83, 127, 139}, 22}, // band 2
- {{84, 117, 128}, 18}, // band 3
- {{88, 117, 129}, 20}, // band 4
- {{93, 122, 134}, 21} // band 5
+ {{64, 128, 128}, 0}, // dc, band 0
+ {{64, 128, 128}, 0}, // band 1
+ {{64, 128, 128}, 0}, // band 2
+ {{64, 128, 128}, 0}, // band 3
+ {{64, 128, 128}, 0}, // band 4
+ {{64, 128, 128}, 0} // band 5
}, {
- {{91, 133, 139}, 11}, // dc, band 0
- {{78, 122, 134}, 12}, // band 1
- {{83, 127, 139}, 22}, // band 2
- {{84, 117, 128}, 18}, // band 3
- {{88, 117, 129}, 20}, // band 4
- {{93, 122, 134}, 21} // band 5
+ {{64, 128, 128}, 0}, // dc, band 0
+ {{64, 128, 128}, 0}, // band 1
+ {{64, 128, 128}, 0}, // band 2
+ {{64, 128, 128}, 0}, // band 3
+ {{64, 128, 128}, 0}, // band 4
+ {{64, 128, 128}, 0} // band 5
}
},
#if QUANT_PROFILES > 1
{
{
- {{86, 122, 134}, 6}, // dc, band 0
- {{78, 122, 134}, 15}, // band 1
- {{78, 122, 134}, 17}, // band 2
- {{84, 122, 134}, 22}, // band 3
- {{88, 122, 134}, 23}, // band 4
- {{88, 122, 134}, 23} // band 5
+ {{64, 128, 128}, 0}, // dc, band 0
+ {{64, 128, 128}, 0}, // band 1
+ {{64, 128, 128}, 0}, // band 2
+ {{64, 128, 128}, 0}, // band 3
+ {{64, 128, 128}, 0}, // band 4
+ {{64, 128, 128}, 0} // band 5
}, {
- {{86, 122, 134}, 6}, // dc, band 0
- {{78, 122, 134}, 15}, // band 1
- {{78, 122, 134}, 17}, // band 2
- {{84, 122, 134}, 22}, // band 3
- {{88, 122, 134}, 23}, // band 4
- {{88, 122, 134}, 23} // band 5
+ {{64, 128, 128}, 0}, // dc, band 0
+ {{64, 128, 128}, 0}, // band 1
+ {{64, 128, 128}, 0}, // band 2
+ {{64, 128, 128}, 0}, // band 3
+ {{64, 128, 128}, 0}, // band 4
+ {{64, 128, 128}, 0} // band 5
}
},
#if QUANT_PROFILES > 2
{
{
- {{86, 122, 134}, 6}, // dc, band 0
- {{78, 122, 135}, 14}, // band 1
- {{78, 122, 134}, 16}, // band 2
- {{84, 122, 133}, 22}, // band 3
- {{88, 122, 134}, 23}, // band 4
- {{88, 122, 134}, 27}, // band 5
+ {{64, 128, 128}, 0}, // dc, band 0
+ {{64, 128, 128}, 0}, // band 1
+ {{64, 128, 128}, 0}, // band 2
+ {{64, 128, 128}, 0}, // band 3
+ {{64, 128, 128}, 0}, // band 4
+ {{64, 128, 128}, 0}, // band 5
}, {
- {{86, 122, 134}, 6}, // dc, band 0
- {{78, 122, 135}, 14}, // band 1
- {{78, 122, 134}, 16}, // band 2
- {{84, 122, 133}, 22}, // band 3
- {{88, 122, 134}, 23}, // band 4
- {{88, 122, 134}, 27}, // band 5
+ {{64, 128, 128}, 0}, // dc, band 0
+ {{64, 128, 128}, 0}, // band 1
+ {{64, 128, 128}, 0}, // band 2
+ {{64, 128, 128}, 0}, // band 3
+ {{64, 128, 128}, 0}, // band 4
+ {{64, 128, 128}, 0}, // band 5
}
}
#endif // QUANT_PROFILES > 2
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index 70cf5e7..53fd1a6 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -15,6 +15,7 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/blend.h"
#include "vp10/common/blockd.h"
#include "vp10/common/reconinter.h"
@@ -448,8 +449,8 @@
#if CONFIG_SUPERTX
static void build_masked_compound_wedge_extend(
uint8_t *dst, int dst_stride,
- uint8_t *src0, int src0_stride,
- uint8_t *src1, int src1_stride,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
int wedge_index,
int wedge_sign,
BLOCK_SIZE sb_type,
@@ -459,18 +460,18 @@
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- vpx_blend_mask6(dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, MASK_MASTER_STRIDE,
- h, w, subh, subw);
+ vpx_blend_a64_mask(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
static void build_masked_compound_wedge_extend_highbd(
uint8_t *dst_8, int dst_stride,
- uint8_t *src0_8, int src0_stride,
- uint8_t *src1_8, int src1_stride,
+ const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
int wedge_index, int wedge_sign,
BLOCK_SIZE sb_type,
int wedge_offset_x, int wedge_offset_y,
@@ -479,52 +480,54 @@
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_soft_mask(
wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
- vpx_highbd_blend_mask6(dst_8, dst_stride,
- src0_8, src0_stride,
- src1_8, src1_stride,
- mask, MASK_MASTER_STRIDE,
- h, w, subh, subw, bd);
+ vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, MASK_MASTER_STRIDE,
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif // CONFIG_SUPERTX
-static void build_masked_compound_wedge(uint8_t *dst, int dst_stride,
- uint8_t *src0, int src0_stride,
- uint8_t *src1, int src1_stride,
- int wedge_index, int wedge_sign,
- BLOCK_SIZE sb_type,
- int h, int w) {
+static void build_masked_compound_wedge(
+ uint8_t *dst, int dst_stride,
+ const uint8_t *src0, int src0_stride,
+ const uint8_t *src1, int src1_stride,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type,
+ int h, int w) {
// Derive subsampling from h and w passed in. May be refactored to
// pass in subsampling factors directly.
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
sb_type);
- vpx_blend_mask6(dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
- h, w, subh, subw);
+ vpx_blend_a64_mask(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+ h, w, subh, subw);
}
#if CONFIG_VP9_HIGHBITDEPTH
-static void build_masked_compound_wedge_highbd(uint8_t *dst_8, int dst_stride,
- uint8_t *src0_8, int src0_stride,
- uint8_t *src1_8, int src1_stride,
- int wedge_index, int wedge_sign,
- BLOCK_SIZE sb_type,
- int h, int w, int bd) {
+static void build_masked_compound_wedge_highbd(
+ uint8_t *dst_8, int dst_stride,
+ const uint8_t *src0_8, int src0_stride,
+ const uint8_t *src1_8, int src1_stride,
+ int wedge_index, int wedge_sign,
+ BLOCK_SIZE sb_type,
+ int h, int w, int bd) {
// Derive subsampling from h and w passed in. May be refactored to
// pass in subsampling factors directly.
const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
sb_type);
- vpx_highbd_blend_mask6(dst_8, dst_stride,
- src0_8, src0_stride,
- src1_8, src1_stride,
- mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
- h, w, subh, subw, bd);
+ vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+ h, w, subh, subw, bd);
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1064,177 +1067,123 @@
28, 18, 10, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
-static void generate_1dmask(int length, uint8_t *mask, int plane) {
+static const uint8_t* get_supertx_mask(int length, int plane) {
switch (length) {
case 8:
- memcpy(mask, plane ? mask_8_uv : mask_8, length);
- break;
+ return plane ? mask_8_uv : mask_8;
case 16:
- memcpy(mask, plane ? mask_16_uv : mask_16, length);
- break;
+ return plane ? mask_16_uv : mask_16;
case 32:
- memcpy(mask, plane ? mask_32_uv : mask_32, length);
- break;
+ return plane ? mask_32_uv : mask_32;
default:
assert(0);
}
+ return NULL;
}
void vp10_build_masked_inter_predictor_complex(
MACROBLOCKD *xd,
- uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+ uint8_t *dst, int dst_stride,
+ const uint8_t *pre, int pre_stride,
int mi_row, int mi_col,
int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
PARTITION_TYPE partition, int plane) {
- int i, j;
const struct macroblockd_plane *pd = &xd->plane[plane];
- uint8_t mask[MAX_TX_SIZE];
- int top_w = 4 << b_width_log2_lookup[top_bsize];
- int top_h = 4 << b_height_log2_lookup[top_bsize];
- int w = 4 << b_width_log2_lookup[bsize];
- int h = 4 << b_height_log2_lookup[bsize];
- int w_offset = (mi_col - mi_col_ori) * MI_SIZE;
- int h_offset = (mi_row - mi_row_ori) * MI_SIZE;
+ const int ssx = pd->subsampling_x;
+ const int ssy = pd->subsampling_y;
+ const int top_w = (4 << b_width_log2_lookup[top_bsize]) >> ssx;
+ const int top_h = (4 << b_height_log2_lookup[top_bsize]) >> ssy;
+ const int w = (4 << b_width_log2_lookup[bsize]) >> ssx;
+ const int h = (4 << b_height_log2_lookup[bsize]) >> ssy;
+ const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+ const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+ int w_remain, h_remain;
#if CONFIG_VP9_HIGHBITDEPTH
- uint16_t *dst16= CONVERT_TO_SHORTPTR(dst);
- uint16_t *dst216 = CONVERT_TO_SHORTPTR(dst2);
- int b_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+ const int is_hdb = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
#endif // CONFIG_VP9_HIGHBITDEPTH
assert(bsize <= BLOCK_32X32);
-
- top_w >>= pd->subsampling_x;
- top_h >>= pd->subsampling_y;
- w >>= pd->subsampling_x;
- h >>= pd->subsampling_y;
- w_offset >>= pd->subsampling_x;
- h_offset >>= pd->subsampling_y;
+ assert(IMPLIES(plane == 0, ssx == 0));
+ assert(IMPLIES(plane == 0, ssy == 0));
switch (partition) {
- case PARTITION_HORZ:
- {
+ case PARTITION_HORZ: {
+ const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+ w_remain = top_w;
+ h_remain = top_h - h_offset - h;
+ dst += h_offset * dst_stride;
+ pre += h_offset * pre_stride;
+
#if CONFIG_VP9_HIGHBITDEPTH
- if (b_hdb) {
- uint16_t *dst_tmp = dst16 + h_offset * dst_stride;
- uint16_t *dst2_tmp = dst216 + h_offset * dst2_stride;
- generate_1dmask(h, mask + h_offset,
- plane && xd->plane[plane].subsampling_y);
-
- for (i = h_offset; i < h_offset + h; i++) {
- for (j = 0; j < top_w; j++) {
- const int m = mask[i]; assert(m >= 0 && m <= 64);
- if (m == 64)
- continue;
-
- if (m == 0)
- dst_tmp[j] = dst2_tmp[j];
- else
- dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
- dst2_tmp[j] * (64 - m), 6);
- }
- dst_tmp += dst_stride;
- dst2_tmp += dst2_stride;
- }
-
- for (; i < top_h; i ++) {
- memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint16_t));
- dst_tmp += dst_stride;
- dst2_tmp += dst2_stride;
- }
- } else {
+ if (is_hdb)
+ vpx_highbd_blend_a64_vmask(dst, dst_stride,
+ dst, dst_stride,
+ pre, pre_stride,
+ mask, h, top_w, xd->bd);
+ else
#endif // CONFIG_VP9_HIGHBITDEPTH
- uint8_t *dst_tmp = dst + h_offset * dst_stride;
- uint8_t *dst2_tmp = dst2 + h_offset * dst2_stride;
- generate_1dmask(h, mask + h_offset,
- plane && xd->plane[plane].subsampling_y);
+ vpx_blend_a64_vmask(dst, dst_stride,
+ dst, dst_stride,
+ pre, pre_stride,
+ mask, h, top_w);
- for (i = h_offset; i < h_offset + h; i++) {
- for (j = 0; j < top_w; j++) {
- const int m = mask[i]; assert(m >= 0 && m <= 64);
- if (m == 64)
- continue;
-
- if (m == 0)
- dst_tmp[j] = dst2_tmp[j];
- else
- dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
- dst2_tmp[j] * (64 - m), 6);
- }
- dst_tmp += dst_stride;
- dst2_tmp += dst2_stride;
- }
-
- for (; i < top_h; i ++) {
- memcpy(dst_tmp, dst2_tmp, top_w * sizeof(uint8_t));
- dst_tmp += dst_stride;
- dst2_tmp += dst2_stride;
- }
-#if CONFIG_VP9_HIGHBITDEPTH
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
- }
-
+ dst += h * dst_stride;
+ pre += h * pre_stride;
break;
- case PARTITION_VERT:
- {
-#if CONFIG_VP9_HIGHBITDEPTH
- if (b_hdb) {
- uint16_t *dst_tmp = dst16;
- uint16_t *dst2_tmp = dst216;
- generate_1dmask(w, mask + w_offset,
- plane && xd->plane[plane].subsampling_x);
-
- for (i = 0; i < top_h; i++) {
- for (j = w_offset; j < w_offset + w; j++) {
- const int m = mask[j]; assert(m >= 0 && m <= 64);
- if (m == 64)
- continue;
-
- if (m == 0)
- dst_tmp[j] = dst2_tmp[j];
- else
- dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
- dst2_tmp[j] * (64 - m), 6);
- }
- memcpy(dst_tmp + j, dst2_tmp + j,
- (top_w - w_offset - w) * sizeof(uint16_t));
- dst_tmp += dst_stride;
- dst2_tmp += dst2_stride;
- }
- } else {
-#endif // CONFIG_VP9_HIGHBITDEPTH
- uint8_t *dst_tmp = dst;
- uint8_t *dst2_tmp = dst2;
- generate_1dmask(w, mask + w_offset,
- plane && xd->plane[plane].subsampling_x);
-
- for (i = 0; i < top_h; i++) {
- for (j = w_offset; j < w_offset + w; j++) {
- const int m = mask[j]; assert(m >= 0 && m <= 64);
- if (m == 64)
- continue;
-
- if (m == 0)
- dst_tmp[j] = dst2_tmp[j];
- else
- dst_tmp[j] = ROUND_POWER_OF_TWO(dst_tmp[j] * m +
- dst2_tmp[j] * (64 - m), 6);
- }
- memcpy(dst_tmp + j, dst2_tmp + j,
- (top_w - w_offset - w) * sizeof(uint8_t));
- dst_tmp += dst_stride;
- dst2_tmp += dst2_stride;
- }
-#if CONFIG_VP9_HIGHBITDEPTH
- }
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
+ case PARTITION_VERT: {
+ const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+ w_remain = top_w - w_offset - w;
+ h_remain = top_h;
+ dst += w_offset;
+ pre += w_offset;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (is_hdb)
+ vpx_highbd_blend_a64_hmask(dst, dst_stride,
+ dst, dst_stride,
+ pre, pre_stride,
+ mask, top_h, w, xd->bd);
+ else
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ vpx_blend_a64_hmask(dst, dst_stride,
+ dst, dst_stride,
+ pre, pre_stride,
+ mask, top_h, w);
+
+ dst += w;
+ pre += w;
break;
- default:
+ }
+ default: {
assert(0);
+ return;
+ }
}
- (void) xd;
+
+ if (w_remain == 0 || h_remain == 0) {
+ return;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (is_hdb) {
+ dst = (uint8_t*)CONVERT_TO_SHORTPTR(dst);
+ pre = (const uint8_t*)CONVERT_TO_SHORTPTR(pre);
+ dst_stride *= 2;
+ pre_stride *= 2;
+ w_remain *= 2;
+ }
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ do {
+ memcpy(dst, pre, w_remain * sizeof(uint8_t));
+ dst += dst_stride;
+ pre += pre_stride;
+ } while (--h_remain);
}
void vp10_build_inter_predictors_sb_sub8x8_extend(
@@ -1878,12 +1827,10 @@
BLOCK_SIZE plane_bsize,
uint8_t *comppred,
int compstride,
- uint8_t *interpred,
+ const uint8_t *interpred,
int interstride,
- uint8_t *intrapred,
+ const uint8_t *intrapred,
int intrastride) {
- const int scale_bits = 8;
- const int scale_max = (1 << scale_bits);
const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
const int size_scale = ii_size_scales[plane_bsize];
@@ -1896,11 +1843,11 @@
bsize);
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
- vpx_blend_mask6(comppred, compstride,
- intrapred, intrastride,
- interpred, interstride,
- mask, 4 * num_4x4_blocks_wide_lookup[bsize],
- bh, bw, subh, subw);
+ vpx_blend_a64_mask(comppred, compstride,
+ intrapred, intrastride,
+ interpred, interstride,
+ mask, 4 * num_4x4_blocks_wide_lookup[bsize],
+ bh, bw, subh, subw);
}
return;
}
@@ -1911,10 +1858,9 @@
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[i * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -1924,10 +1870,9 @@
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[j * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -1939,10 +1884,9 @@
int scale = (ii_weights1d[i * size_scale] * 3 +
ii_weights1d[j * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -1954,10 +1898,9 @@
int scale = (ii_weights1d[j * size_scale] * 3 +
ii_weights1d[i * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -1967,10 +1910,9 @@
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[(i < j ? i : j) * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -1981,10 +1923,9 @@
int scale = (ii_weights1d[i * size_scale] +
ii_weights1d[j * size_scale]) >> 1;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -1995,10 +1936,8 @@
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- interpred[i * interstride + j] +
- intrapred[i * intrastride + j],
- 1);
+ VPX_BLEND_AVG(intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2014,20 +1953,18 @@
BLOCK_SIZE plane_bsize,
uint8_t *comppred8,
int compstride,
- uint8_t *interpred8,
+ const uint8_t *interpred8,
int interstride,
- uint8_t *intrapred8,
+ const uint8_t *intrapred8,
int intrastride, int bd) {
- const int scale_bits = 8;
- const int scale_max = (1 << scale_bits);
const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
const int size_scale = ii_size_scales[plane_bsize];
int i, j;
uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
- uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
- uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+ const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+ const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
if (use_wedge_interintra) {
if (is_interintra_wedge_used(bsize)) {
@@ -2036,11 +1973,11 @@
bsize);
const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
- vpx_highbd_blend_mask6(comppred8, compstride,
- intrapred8, intrastride,
- interpred8, interstride,
- mask, bw,
- bh, bw, subh, subw, bd);
+ vpx_highbd_blend_a64_mask(comppred8, compstride,
+ intrapred8, intrastride,
+ interpred8, interstride,
+ mask, bw,
+ bh, bw, subh, subw, bd);
}
return;
}
@@ -2051,10 +1988,9 @@
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[i * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2064,10 +2000,9 @@
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[j * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2079,10 +2014,9 @@
int scale = (ii_weights1d[i * size_scale] * 3 +
ii_weights1d[j * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2094,10 +2028,9 @@
int scale = (ii_weights1d[j * size_scale] * 3 +
ii_weights1d[i * size_scale]) >> 2;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2107,10 +2040,9 @@
for (j = 0; j < bw; ++j) {
int scale = ii_weights1d[(i < j ? i : j) * size_scale];
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2121,10 +2053,9 @@
int scale = (ii_weights1d[i * size_scale] +
ii_weights1d[j * size_scale]) >> 1;
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- (scale_max - scale) * interpred[i * interstride + j] +
- scale * intrapred[i * intrastride + j],
- scale_bits);
+ VPX_BLEND_A256(scale,
+ intrapred[i * intrastride + j],
+ interpred[i * interstride + j]);
}
}
break;
@@ -2135,10 +2066,8 @@
for (i = 0; i < bh; ++i) {
for (j = 0; j < bw; ++j) {
comppred[i * compstride + j] =
- ROUND_POWER_OF_TWO(
- interpred[i * interstride + j] +
- intrapred[i * intrastride + j],
- 1);
+ VPX_BLEND_AVG(interpred[i * interstride + j],
+ intrapred[i * intrastride + j]);
}
}
break;
@@ -2239,8 +2168,8 @@
void vp10_combine_interintra(MACROBLOCKD *xd,
BLOCK_SIZE bsize, int plane,
- uint8_t *inter_pred, int inter_stride,
- uint8_t *intra_pred, int intra_stride) {
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 4ede3e9..ac4a004 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -396,7 +396,8 @@
struct macroblockd_plane;
void vp10_build_masked_inter_predictor_complex(
MACROBLOCKD *xd,
- uint8_t *dst, int dst_stride, uint8_t *dst2, int dst2_stride,
+ uint8_t *dst, int dst_stride,
+ const uint8_t *pre, int pre_stride,
int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
PARTITION_TYPE partition, int plane);
@@ -631,8 +632,8 @@
void vp10_combine_interintra(
MACROBLOCKD *xd,
BLOCK_SIZE bsize, int plane,
- uint8_t *inter_pred, int inter_stride,
- uint8_t *intra_pred, int intra_stride);
+ const uint8_t *inter_pred, int inter_stride,
+ const uint8_t *intra_pred, int intra_stride);
void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
uint8_t *upred,
uint8_t *vpred,
diff --git a/vp10/common/warped_motion.c b/vp10/common/warped_motion.c
index 4990bb3..3b924ea 100644
--- a/vp10/common/warped_motion.c
+++ b/vp10/common/warped_motion.c
@@ -85,19 +85,19 @@
for (i = 0; i < n; ++i) {
const int x = *(points++), y = *(points++);
if (subsampling_x)
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
((x << (WARPEDMODEL_PREC_BITS + 1)) + mat[0]),
WARPEDPIXEL_PREC_BITS + 1);
else
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
((x << WARPEDMODEL_PREC_BITS)) + mat[0],
WARPEDPIXEL_PREC_BITS);
if (subsampling_y)
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
((y << (WARPEDMODEL_PREC_BITS + 1)) + mat[1]),
WARPEDPIXEL_PREC_BITS + 1);
else
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
((y << WARPEDMODEL_PREC_BITS)) + mat[1],
WARPEDPIXEL_PREC_BITS);
points += stride_points - 2;
@@ -115,21 +115,21 @@
for (i = 0; i < n; ++i) {
const int x = *(points++), y = *(points++);
if (subsampling_x)
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
mat[0] * 2 * x + mat[1] * 2 * y + mat[2] +
(mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
WARPEDDIFF_PREC_BITS + 1);
else
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
- WARPEDDIFF_PREC_BITS);
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
+ WARPEDDIFF_PREC_BITS);
if (subsampling_y)
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
-mat[1] * 2 * x + mat[0] * 2 * y + mat[3] +
(-mat[1] + mat[0] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
WARPEDDIFF_PREC_BITS + 1);
else
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
- WARPEDDIFF_PREC_BITS);
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
+ WARPEDDIFF_PREC_BITS);
points += stride_points - 2;
proj += stride_proj - 2;
}
@@ -145,21 +145,21 @@
for (i = 0; i < n; ++i) {
const int x = *(points++), y = *(points++);
if (subsampling_x)
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
mat[0] * 2 * x + mat[1] * 2 * y + mat[4] +
(mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
WARPEDDIFF_PREC_BITS + 1);
else
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
- WARPEDDIFF_PREC_BITS);
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
+ WARPEDDIFF_PREC_BITS);
if (subsampling_y)
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
mat[2] * 2 * x + mat[3] * 2 * y + mat[5] +
(mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
WARPEDDIFF_PREC_BITS + 1);
else
- *(proj++) = ROUNDZ_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
- WARPEDDIFF_PREC_BITS);
+ *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
+ WARPEDDIFF_PREC_BITS);
points += stride_points - 2;
proj += stride_proj - 2;
}
@@ -357,7 +357,7 @@
const int64_t v2 = x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
const int64_t v3 = x * (p[1] - p[-1]);
const int64_t v4 = 2 * p[0];
- return (int32_t)ROUNDZ_POWER_OF_TWO_SIGNED(
+ return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
(v4 << (3 * WARPEDPIXEL_PREC_BITS)) +
(v3 << (2 * WARPEDPIXEL_PREC_BITS)) +
(v2 << WARPEDPIXEL_PREC_BITS) + v1,
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index cabfc40..389e40b 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -965,7 +965,12 @@
}
#else
if (!vp10_is_interp_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+ assert(mbmi->interp_filter[0] == EIGHTTAP_REGULAR);
+ assert(mbmi->interp_filter[1] == EIGHTTAP_REGULAR);
+#else
assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
+#endif
return;
}
#endif // CONFIG_DUAL_FILTER
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index 3307393..3810be5 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -160,7 +160,7 @@
next_shortcut = shortcut;
/* Only add a trellis state for non-zero coefficients. */
- if (x) {
+ if (UNLIKELY(x)) {
error0 = tokens[next][0].error;
error1 = tokens[next][1].error;
/* Evaluate the first possibility for this state. */
@@ -204,7 +204,7 @@
rate1 = tokens[next][1].rate;
// The threshold of 3 is empirically obtained.
- if (abs(x) > 3) {
+ if (UNLIKELY(abs(x) > 3)) {
shortcut = 0;
} else {
#if CONFIG_NEW_QUANT
@@ -233,7 +233,7 @@
best_index[i][1] = best_index[i][0];
next = i;
- if (!(--band_left)) {
+ if (UNLIKELY(!(--band_left))) {
--band_counts;
band_left = *band_counts;
--token_costs;
@@ -255,7 +255,7 @@
}
if (next_shortcut) {
- if (next < default_eob) {
+ if (LIKELY(next < default_eob)) {
if (t0 != EOB_TOKEN) {
token_cache[rc] = vp10_pt_energy_class[t0];
pt = get_coef_context(nb, token_cache, i + 1);
@@ -350,7 +350,7 @@
/* Don't update next, because we didn't add a new node. */
}
- if (!(--band_left)) {
+ if (UNLIKELY(!(--band_left))) {
--band_counts;
band_left = *band_counts;
--token_costs;
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index 7cc65e6..c64d57e 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -44,6 +44,24 @@
#include "vp10/encoder/rdopt.h"
#include "vp10/encoder/aq_variance.h"
+#if CONFIG_DUAL_FILTER  // filter_sets: candidate filter-index pairs, one per row
+#if CONFIG_EXT_INTERP
+static const int filter_sets[25][2] = {  // every (a, b) with a, b in 0..4
+ {0, 0}, {0, 1}, {0, 2}, {0, 3}, {0, 4},
+ {1, 0}, {1, 1}, {1, 2}, {1, 3}, {1, 4},
+ {2, 0}, {2, 1}, {2, 2}, {2, 3}, {2, 4},
+ {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 4},
+ {4, 0}, {4, 1}, {4, 2}, {4, 3}, {4, 4},
+};
+#else
+static const int filter_sets[9][2] = {  // every (a, b) with a, b in 0..2
+ {0, 0}, {0, 1}, {0, 2},
+ {1, 0}, {1, 1}, {1, 2},
+ {2, 0}, {2, 1}, {2, 2},
+};
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_DUAL_FILTER
+
#if CONFIG_EXT_REFS
#define LAST_FRAME_MODE_MASK ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | \
@@ -403,18 +421,6 @@
{{INTRA_FRAME, NONE}},
};
-#if CONFIG_DUAL_FILTER
-// TODO(jingning): The magic number 9 here really means the combination
-// of prediction filter types for vertical and horizontal directions.
-// It will be replaced after we integrate the dual filter experiment with
-// the ext-interp experiment.
-static int filter_sets[9][2] = {
- {0, 0}, {0, 1}, {0, 2},
- {1, 0}, {1, 1}, {1, 2},
- {2, 0}, {2, 1}, {2, 2},
-};
-#endif
-
static INLINE int write_uniform_cost(int n, int v) {
int l = get_unsigned_bits(n), m = (1 << l) - n;
if (l == 0)
@@ -1211,7 +1217,7 @@
sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- sse = ROUNDZ_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+ sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
#endif // CONFIG_VP9_HIGHBITDEPTH
sse = (int64_t)sse * 16;
@@ -3021,7 +3027,7 @@
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
- tmp = ROUNDZ_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+ tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
#endif // CONFIG_VP9_HIGHBITDEPTH
*bsse += tmp * 16;
@@ -5357,7 +5363,7 @@
this_mode == NEWMV &&
#endif // CONFIG_EXT_INTER
#if CONFIG_DUAL_FILTER
- 1) {
+ (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) {
#else
(mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) {
#endif
@@ -6658,7 +6664,7 @@
mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
- sse = ROUNDZ_POWER_OF_TWO(sse, bd_round);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -6720,7 +6726,7 @@
for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
- sse = ROUNDZ_POWER_OF_TWO(sse, bd_round);
+ sse = ROUND_POWER_OF_TWO(sse, bd_round);
model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
@@ -7181,7 +7187,11 @@
int64_t tmp_dist_sum = 0;
#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+ for (i = 0; i < 25; ++i) {
+#else
for (i = 0; i < 9; ++i) {
+#endif
#else
for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
#endif
@@ -7899,6 +7909,10 @@
*rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
*distortion = skip_sse_sb;
+ *psse = skip_sse_sb;
+ *rate_y = 0;
+ *rate_uv = 0;
+ *skippable = 1;
}
#if CONFIG_OBMC || CONFIG_WARPED_MOTION
@@ -10479,7 +10493,11 @@
b_mode_info tmp_best_bmodes[16]; // Should this be 4 ?
MB_MODE_INFO tmp_best_mbmode;
#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+ BEST_SEG_INFO bsi[25];
+#else
BEST_SEG_INFO bsi[9];
+#endif
#else
BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
#endif
@@ -10546,7 +10564,11 @@
} else {
#if CONFIG_DUAL_FILTER
for (switchable_filter_index = 0;
+#if CONFIG_EXT_INTERP
+ switchable_filter_index < 25;
+#else
switchable_filter_index < 9;
+#endif
++switchable_filter_index) {
#else
for (switchable_filter_index = 0;
@@ -10578,7 +10600,8 @@
#if CONFIG_EXT_INTERP
#if CONFIG_DUAL_FILTER
if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
- mbmi->interp_filter[0] != EIGHTTAP_REGULAR) // invalid config
+ (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+ mbmi->interp_filter[1] != EIGHTTAP_REGULAR)) // invalid config
continue;
#else
if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
@@ -10664,9 +10687,11 @@
#if CONFIG_EXT_INTERP
#if CONFIG_DUAL_FILTER
if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
- mbmi->interp_filter[0] != EIGHTTAP_REGULAR)
- for (i = 0; i < 4; ++i)
- mbmi->interp_filter[i] = EIGHTTAP_REGULAR;
+ (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+ mbmi->interp_filter[1] != EIGHTTAP_REGULAR)) {
+ mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+ mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+ }
#else
if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
mbmi->interp_filter != EIGHTTAP_REGULAR)
diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h
new file mode 100644
index 0000000..109183a
--- /dev/null
+++ b/vpx_dsp/blend.h
@@ -0,0 +1,40 @@
+/*
+* Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+* Use of this source code is governed by a BSD-style license
+* that can be found in the LICENSE file in the root of the source
+* tree. An additional intellectual property rights grant can be found
+* in the file PATENTS. All contributing project authors may
+* be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VPX_DSP_BLEND_H_
+#define VPX_DSP_BLEND_H_
+
+#include "vpx_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the vpx_blend_* functions in vpx_dsp_rtcd.h
+
+// Alpha blend: a in [0, 64]; 64 selects v0 and 0 selects v1. The expansion
+// uses (a) twice, so pass an argument that has no side effects.
+#define VPX_BLEND_A64_ROUND_BITS 6
+#define VPX_BLEND_A64_MAX_ALPHA (1 << VPX_BLEND_A64_ROUND_BITS) // 64
+
+#define VPX_BLEND_A64(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1), \
+ VPX_BLEND_A64_ROUND_BITS)
+
+// Alpha blend: a in [0, 256]; 256 selects v0 and 0 selects v1. The expansion
+// uses (a) twice, so pass an argument that has no side effects.
+#define VPX_BLEND_A256_ROUND_BITS 8
+#define VPX_BLEND_A256_MAX_ALPHA (1 << VPX_BLEND_A256_ROUND_BITS) // 256
+
+#define VPX_BLEND_A256(a, v0, v1) \
+ ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1), \
+ VPX_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging: ROUND_POWER_OF_TWO of (v0 + v1) by one bit.
+#define VPX_BLEND_AVG(v0, v1) ROUND_POWER_OF_TWO((v0) + (v1), 1)
+
+#endif // VPX_DSP_BLEND_H_
diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c
new file mode 100644
index 0000000..90f3415
--- /dev/null
+++ b/vpx_dsp/blend_a64_hmask.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_hmask_c(  // h x w blend with one mask value per column
+ uint8_t *dst, uint32_t dst_stride,  // output and its row stride
+ const uint8_t *src0, uint32_t src0_stride,  // weighted by mask[j]
+ const uint8_t *src1, uint32_t src1_stride,  // weighted by 64 - mask[j]
+ const uint8_t *mask, int h, int w) {  // mask is read at indices 0..w-1
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // in-place: same stride
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],  // same mask[j] on every row
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_c(  // uint16_t-sample variant of the hmask blend
+ uint8_t *dst_8, uint32_t dst_stride,  // *_8 pointers go through CONVERT_TO_SHORTPTR
+ const uint8_t *src0_8, uint32_t src0_stride,  // weighted by mask[j]
+ const uint8_t *src1_8, uint32_t src1_stride,  // weighted by 64 - mask[j]
+ const uint8_t *mask, int h, int w, int bd) {  // mask is read at indices 0..w-1
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);  // strides below index uint16_t samples
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // in-place: same stride
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);  // bd is only checked; the blend ignores it
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],  // same mask[j] on every row
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c
new file mode 100644
index 0000000..1649798
--- /dev/null
+++ b/vpx_dsp/blend_a64_mask.c
@@ -0,0 +1,151 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for VPX_BLEND_A64 in vpx_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
+
+void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,  // h x w output
+ const uint8_t *src0, uint32_t src0_stride,  // weighted by m
+ const uint8_t *src1, uint32_t src1_stride,  // weighted by 64 - m
+ const uint8_t *mask, uint32_t mask_stride,  // 2-D mask and its row stride
+ int h, int w, int subh, int subw) {  // sub* == 1: mask read at 2x in that direction
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // in-place: same stride
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ if (subw == 0 && subh == 0) {  // one mask entry per output pixel
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {  // m combines a 2x2 patch of mask entries
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {  // m averages two horizontally adjacent entries
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {  // every other subw/subh combination: average two vertically adjacent entries
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,  // h x w output
+ const uint8_t *src0_8, uint32_t src0_stride,  // weighted by m
+ const uint8_t *src1_8, uint32_t src1_stride,  // weighted by 64 - m
+ const uint8_t *mask, uint32_t mask_stride,  // mask entries stay uint8_t
+ int h, int w, int subh, int subw, int bd) {  // sub* == 1: mask read at 2x
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);  // strides below index uint16_t samples
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // in-place: same stride
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);  // bd is only checked; the blend ignores it
+
+ if (subw == 0 && subh == 0) {  // one mask entry per output pixel
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = mask[i * mask_stride + j];
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 1) {  // m combines a 2x2 patch of mask entries
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m =
+ ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j)] +
+ mask[(2 * i) * mask_stride + (2 * j + 1)] +
+ mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+ 2);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else if (subw == 1 && subh == 0) {  // m averages two horizontally adjacent entries
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+ mask[i * mask_stride + (2 * j + 1)]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ } else {  // every other subw/subh combination: average two vertically adjacent entries
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; ++j) {
+ const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+ mask[(2 * i + 1) * mask_stride + j]);
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c
new file mode 100644
index 0000000..5d48a83
--- /dev/null
+++ b/vpx_dsp/blend_a64_vmask.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_vmask_c(  // h x w blend with one mask value per row
+ uint8_t *dst, uint32_t dst_stride,  // output and its row stride
+ const uint8_t *src0, uint32_t src0_stride,  // weighted by mask[i]
+ const uint8_t *src1, uint32_t src1_stride,  // weighted by 64 - mask[i]
+ const uint8_t *mask, int h, int w) {  // mask is read at indices 0..h-1
+ int i, j;
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // in-place: same stride
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];  // fetched once and reused for the whole row
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_vmask_c(  // uint16_t-sample variant of the vmask blend
+ uint8_t *dst_8, uint32_t dst_stride,  // *_8 pointers go through CONVERT_TO_SHORTPTR
+ const uint8_t *src0_8, uint32_t src0_stride,  // weighted by mask[i]
+ const uint8_t *src1_8, uint32_t src1_stride,  // weighted by 64 - mask[i]
+ const uint8_t *mask, int h, int w, int bd) {  // mask is read at indices 0..h-1
+ int i, j;
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);  // strides below index uint16_t samples
+ const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));  // in-place: same stride
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);  // bd is only checked; the blend ignores it
+
+ for (i = 0; i < h; ++i) {
+ const int m = mask[i];  // fetched once and reused for the whole row
+ for (j = 0; j < w; ++j) {
+ dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+ src0[i * src0_stride + j],
+ src1[i * src1_stride + j]);
+ }
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_mask6.c b/vpx_dsp/blend_mask6.c
deleted file mode 100644
index 584ee6a..0000000
--- a/vpx_dsp/blend_mask6.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-
-#include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-
-#include "./vpx_dsp_rtcd.h"
-
-#define MASK_BITS 6
-
-void vpx_blend_mask6_c(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int subh, int subw) {
- int i, j;
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 = mask[i * mask_stride + j];
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
- mask[i * mask_stride + (2 * j + 1)], 1);
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
- mask[(2 * i + 1) * mask_stride + j], 1);
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_blend_mask6_c(uint8_t *dst_8, uint32_t dst_stride,
- uint8_t *src0_8, uint32_t src0_stride,
- uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int subh, int subw, int bd) {
- int i, j;
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
- uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
- uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
-
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
-
- assert(h >= 4);
- assert(w >= 4);
- assert(IS_POWER_OF_TWO(h));
- assert(IS_POWER_OF_TWO(w));
-
- assert(bd == 8 || bd == 10 || bd == 12);
-
- if (subw == 0 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 = mask[i * mask_stride + j];
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- } else if (subw == 1 && subh == 1) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
- mask[(2 * i + 1) * mask_stride + (2 * j)] +
- mask[(2 * i) * mask_stride + (2 * j + 1)] +
- mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
- 2);
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- } else if (subw == 1 && subh == 0) {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[i * mask_stride + (2 * j)] +
- mask[i * mask_stride + (2 * j + 1)], 1);
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- } else {
- for (i = 0; i < h; ++i)
- for (j = 0; j < w; ++j) {
- const int m0 =
- ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + j] +
- mask[(2 * i + 1) * mask_stride + j], 1);
- const int m1 = ((1 << MASK_BITS) - m0);
- dst[i * dst_stride + j] =
- ROUND_POWER_OF_TWO(src0[i * src0_stride + j] * m0 +
- src1[i * src1_stride + j] * m1, MASK_BITS);
- }
- }
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 3c519b6..3eb7a9f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -70,10 +70,14 @@
# inter predictions
ifeq ($(CONFIG_VP10),yes)
-ifeq ($(CONFIG_EXT_INTER),yes)
-DSP_SRCS-yes += blend_mask6.c
-DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_mask6_sse4.c
-endif #CONFIG_EXT_INTER
+DSP_SRCS-yes += blend.h
+DSP_SRCS-yes += blend_a64_mask.c
+DSP_SRCS-yes += blend_a64_hmask.c
+DSP_SRCS-yes += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
endif #CONFIG_VP10
# interpolation filters
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index e630994..02c8727 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -959,6 +959,27 @@
} # CONFIG_VP9_HIGHBITDEPTH
} # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
+if (vpx_config("CONFIG_VP10") eq "yes") {
+ #
+ # Alpha blending with mask
+ #
+ add_proto qw/void vpx_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+ add_proto qw/void vpx_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ add_proto qw/void vpx_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+ specialize "vpx_blend_a64_mask", qw/sse4_1/;
+ specialize "vpx_blend_a64_hmask", qw/sse4_1/;
+ specialize "vpx_blend_a64_vmask", qw/sse4_1/;
+
+ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+ add_proto qw/void vpx_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+ add_proto qw/void vpx_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ add_proto qw/void vpx_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+ specialize "vpx_highbd_blend_a64_mask", qw/sse4_1/;
+ specialize "vpx_highbd_blend_a64_hmask", qw/sse4_1/;
+ specialize "vpx_highbd_blend_a64_vmask", qw/sse4_1/;
+ } # CONFIG_VP9_HIGHBITDEPTH
+} # CONFIG_VP10
+
if (vpx_config("CONFIG_ENCODERS") eq "yes") {
#
# Block subtraction
@@ -1384,14 +1405,6 @@
}
}
}
-
- add_proto qw/void vpx_blend_mask6/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
- specialize "vpx_blend_mask6", qw/sse4_1/;
-
- if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- add_proto qw/void vpx_highbd_blend_mask6/, "uint8_t *dst, uint32_t dst_stride, uint8_t *src0, uint32_t src0_stride, uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
- specialize "vpx_highbd_blend_mask6", qw/sse4_1/;
- }
}
#
diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000..a10e077
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx/vpx_integer.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// For now, just forward to the 2D-mask kernel with a mask stride of 0, so
+// each row is blended with the same mask values. Improve if necessary.
+
+void vpx_blend_a64_hmask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ vpx_blend_a64_mask_sse4_1(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, 0, h, w, 0, 0);  // suby = subx = 0
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w,
+ int bd) {
+ vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, 0, h, w, 0, 0, bd);  // suby = subx = 0
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_mask6_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c
similarity index 61%
rename from vpx_dsp/x86/blend_mask6_sse4.c
rename to vpx_dsp/x86/blend_a64_mask_sse4.c
index 28693a4..cdb40c2 100644
--- a/vpx_dsp/x86/blend_mask6_sse4.c
+++ b/vpx_dsp/x86/blend_a64_mask_sse4.c
@@ -15,62 +15,24 @@
#include "vpx/vpx_integer.h"
#include "vpx_ports/mem.h"
#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
#include "./vpx_dsp_rtcd.h"
-#define MASK_BITS 6
-
-//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-static INLINE __m128i blend_4(uint8_t*src0, uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = xx_loadl_32(src0);
- const __m128i v_s1_b = xx_loadl_32(src1);
- const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
- const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8(uint8_t*src0, uint8_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_b = xx_loadl_64(src0);
- const __m128i v_s1_b = xx_loadl_64(src1);
- const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
- const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
- return v_res_w;
-}
-
//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6_w4_sse4_1(
+static void blend_a64_mask_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -92,13 +54,13 @@
} while (--h);
}
-static void blend_mask6_w8_sse4_1(
+static void blend_a64_mask_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -120,13 +82,13 @@
} while (--h);
}
-static void blend_mask6_w16n_sse4_1(
+static void blend_a64_mask_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -158,15 +120,15 @@
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6_sx_w4_sse4_1(
+static void blend_a64_mask_sx_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -190,15 +152,15 @@
} while (--h);
}
-static void blend_mask6_sx_w8_sse4_1(
+static void blend_a64_mask_sx_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -222,15 +184,15 @@
} while (--h);
}
-static void blend_mask6_sx_w16n_sse4_1(
+static void blend_a64_mask_sx_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -265,13 +227,13 @@
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6_sy_w4_sse4_1(
+static void blend_a64_mask_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -296,13 +258,13 @@
} while (--h);
}
-static void blend_mask6_sy_w8_sse4_1(
+static void blend_a64_mask_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -327,14 +289,14 @@
} while (--h);
}
-static void blend_mask6_sy_w16n_sse4_1(
+static void blend_a64_mask_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zero = _mm_setzero_si128();
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -368,15 +330,15 @@
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static void blend_mask6_sx_sy_w4_sse4_1(
+static void blend_a64_mask_sx_sy_w4_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -405,15 +367,15 @@
} while (--h);
}
-static void blend_mask6_sx_sy_w8_sse4_1(
+static void blend_a64_mask_sx_sy_w8_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
(void)w;
@@ -442,15 +404,15 @@
} while (--h);
}
-static void blend_mask6_sx_sy_w16n_sse4_1(
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -495,146 +457,67 @@
// Dispatch
//////////////////////////////////////////////////////////////////////////////
-void vpx_blend_mask6_sse4_1(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx) {
+void vpx_blend_a64_mask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx) {
typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
- uint8_t *src0, uint32_t src0_stride,
- uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w);
- static blend_fn blend[3][2][2] = { // width_index X subx X suby
+ // Dimensions are: width_index X subx X suby
+ static const blend_fn blend[3][2][2] = {
{ // w % 16 == 0
- {blend_mask6_w16n_sse4_1, blend_mask6_sy_w16n_sse4_1},
- {blend_mask6_sx_w16n_sse4_1, blend_mask6_sx_sy_w16n_sse4_1}
+ {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1},
+ {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1}
}, { // w == 4
- {blend_mask6_w4_sse4_1, blend_mask6_sy_w4_sse4_1},
- {blend_mask6_sx_w4_sse4_1, blend_mask6_sx_sy_w4_sse4_1}
+ {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1},
+ {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1}
}, { // w == 8
- {blend_mask6_w8_sse4_1, blend_mask6_sy_w8_sse4_1},
- {blend_mask6_sx_w8_sse4_1, blend_mask6_sx_sy_w8_sse4_1}
+ {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1},
+ {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1}
}
};
assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
- assert(h >= 4);
- assert(w >= 4);
+ assert(h >= 1);
+ assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
- blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, mask_stride,
- h, w);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ vpx_blend_a64_mask_c(dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w, suby, subx);
+ } else {
+ blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+ }
}
#if CONFIG_VP9_HIGHBITDEPTH
//////////////////////////////////////////////////////////////////////////////
-// Common kernels
-//////////////////////////////////////////////////////////////////////////////
-
-typedef __m128i (*blend_unit_fn)(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w);
-
-static INLINE __m128i blend_4_b10(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadl_64(src0);
- const __m128i v_s1_w = xx_loadl_64(src1);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8_b10(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadu_128(src0);
- const __m128i v_s1_w = xx_loadu_128(src1);
-
- const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
- const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
-
- const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
-
- const __m128i v_res_w = xx_roundn_epu16(v_sum_w, MASK_BITS);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_4_b12(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadl_64(src0);
- const __m128i v_s1_w = xx_loadl_64(src1);
-
- // Interleave
- const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
- const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
-
- // Multiply-Add
- const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
-
- // Scale
- const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d, MASK_BITS - 1);
-
- // Pack
- const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
-
- // Round
- const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
- return v_res_w;
-}
-
-static INLINE __m128i blend_8_b12(uint16_t*src0, uint16_t *src1,
- const __m128i v_m0_w, const __m128i v_m1_w) {
- const __m128i v_s0_w = xx_loadu_128(src0);
- const __m128i v_s1_w = xx_loadu_128(src1);
-
- // Interleave
- const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
- const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
- const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
- const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
-
- // Multiply-Add
- const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
- const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
-
- // Scale
- const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d, MASK_BITS - 1);
- const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d, MASK_BITS - 1);
-
- // Pack
- const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
-
- // Round
- const __m128i v_res_w = xx_round_epu16(v_pssum_d);
-
- return v_res_w;
-}
-
-//////////////////////////////////////////////////////////////////////////////
// No sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6_bn_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_m0_b = xx_loadl_32(mask);
@@ -652,37 +535,37 @@
} while (--h);
}
-static void blend_mask6_b10_w4_sse4_1(
+static void blend_a64_mask_b10_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6_b12_w4_sse4_1(
+static void blend_a64_mask_b12_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static inline void blend_mask6_bn_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -702,41 +585,41 @@
} while (--h);
}
-static void blend_mask6_b10_w8n_sse4_1(
+static void blend_a64_mask_b10_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6_b12_w8n_sse4_1(
+static void blend_a64_mask_b12_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6_bn_sx_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_r_b = xx_loadl_64(mask);
@@ -756,39 +639,39 @@
} while (--h);
}
-static void blend_mask6_b10_sx_w4_sse4_1(
+static void blend_a64_mask_b10_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6_b12_sx_w4_sse4_1(
+static void blend_a64_mask_b12_sx_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static INLINE void blend_mask6_bn_sx_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -810,39 +693,39 @@
} while (--h);
}
-static void blend_mask6_b10_sx_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6_b12_sx_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6_bn_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_ra_b = xx_loadl_32(mask);
@@ -863,37 +746,37 @@
} while (--h);
}
-static void blend_mask6_b10_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6_b12_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static INLINE void blend_mask6_bn_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -916,41 +799,41 @@
} while (--h);
}
-static void blend_mask6_b10_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6_b12_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Horizontal and Vertical sub-sampling
//////////////////////////////////////////////////////////////////////////////
-static INLINE void blend_mask6_bn_sx_sy_w4_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
const __m128i v_ra_b = xx_loadl_64(mask);
@@ -975,39 +858,39 @@
} while (--h);
}
-static void blend_mask6_b10_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b10);
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b10);
}
-static void blend_mask6_b12_sx_sy_w4_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
(void)w;
- blend_mask6_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h,
- blend_4_b12);
+ blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h,
+ blend_4_b12);
}
-static INLINE void blend_mask6_bn_sx_sy_w8n_sse4_1(
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w, blend_unit_fn blend) {
const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
- const __m128i v_maxval_w = _mm_set1_epi16(1 << MASK_BITS);
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
do {
int c;
@@ -1034,82 +917,91 @@
} while (--h);
}
-static void blend_mask6_b10_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b10);
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b10);
}
-static void blend_mask6_b12_sx_sy_w8n_sse4_1(
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w) {
- blend_mask6_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
- src1_stride, mask, mask_stride, h, w,
- blend_8_b12);
+ blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, mask_stride, h, w,
+ blend_8_b12);
}
//////////////////////////////////////////////////////////////////////////////
// Dispatch
//////////////////////////////////////////////////////////////////////////////
-void vpx_highbd_blend_mask6_sse4_1(uint8_t *dst_8, uint32_t dst_stride,
- uint8_t *src0_8, uint32_t src0_stride,
- uint8_t *src1_8, uint32_t src1_stride,
- const uint8_t *mask, uint32_t mask_stride,
- int h, int w, int suby, int subx, int bd) {
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
- uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
- uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
-
+void vpx_highbd_blend_a64_mask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, uint32_t mask_stride,
+ int h, int w, int suby, int subx, int bd) {
typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
- uint16_t *src0, uint32_t src0_stride,
- uint16_t *src1, uint32_t src1_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
const uint8_t *mask, uint32_t mask_stride,
int h, int w);
- static blend_fn blend[2][2][2][2] = { // bd_index X width_index X subx X suby
+ // Dimensions are: bd_index X width_index X subx X suby
+ static const blend_fn blend[2][2][2][2] = {
{ // bd == 8 or 10
{ // w % 8 == 0
- {blend_mask6_b10_w8n_sse4_1, blend_mask6_b10_sy_w8n_sse4_1},
- {blend_mask6_b10_sx_w8n_sse4_1, blend_mask6_b10_sx_sy_w8n_sse4_1}
+ {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1},
+ {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1}
}, { // w == 4
- {blend_mask6_b10_w4_sse4_1, blend_mask6_b10_sy_w4_sse4_1},
- {blend_mask6_b10_sx_w4_sse4_1, blend_mask6_b10_sx_sy_w4_sse4_1}
+ {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1},
+ {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1}
}
},
{ // bd == 12
{ // w % 8 == 0
- {blend_mask6_b12_w8n_sse4_1, blend_mask6_b12_sy_w8n_sse4_1},
- {blend_mask6_b12_sx_w8n_sse4_1, blend_mask6_b12_sx_sy_w8n_sse4_1}
+ {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1},
+ {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1}
}, { // w == 4
- {blend_mask6_b12_w4_sse4_1, blend_mask6_b12_sy_w4_sse4_1},
- {blend_mask6_b12_sx_w4_sse4_1, blend_mask6_b12_sx_sy_w4_sse4_1}
+ {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1},
+ {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1}
}
}
};
- assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
- assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
- assert(h >= 4);
- assert(w >= 4);
+ assert(h >= 1);
+ assert(w >= 1);
assert(IS_POWER_OF_TWO(h));
assert(IS_POWER_OF_TWO(w));
assert(bd == 8 || bd == 10 || bd == 12);
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ vpx_highbd_blend_a64_mask_c(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, mask_stride,
+ h, w, suby, subx, bd);
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
- blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
- src0, src0_stride,
- src1, src1_stride,
- mask, mask_stride,
- h, w);
+ blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, mask_stride,
+ h, w);
+ }
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000..4b0f38d
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,293 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static void blend_a64_vmask_w4_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ (void)w; // unused: the dispatch table routes only w == 4 to this kernel
+ // 8-bit kernel, 4 pixels wide: one mask byte weights each whole row.
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask); // src0 weight in every lane
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); // src1 weight
+
+ const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); // 16 -> 8 bit
+
+ xx_storel_32(dst, v_res_b); // presumably writes the low 4 bytes only
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1; // next row, next mask entry
+ } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+ (void)w; // unused: the dispatch table routes only w == 8 to this kernel
+ // 8-bit kernel, 8 pixels wide: one mask byte weights each whole row.
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask); // src0 weight in every lane
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); // src1 weight
+
+ const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w); // 16 -> 8 bit
+
+ xx_storel_64(dst, v_res_b); // presumably writes the low 8 bytes only
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1; // next row, next mask entry
+ } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+ // 8-bit kernel for widths that are a multiple of 16; one mask byte per row.
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask); // src0 weight in every lane
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); // src1 weight
+ for (c = 0; c < w; c += 16) { // 16 pixels per step, as two 8-pixel halves
+ const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+ v_m0_w, v_m1_w);
+ const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+ v_m0_w, v_m1_w);
+
+ const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w); // 16 bytes
+
+ xx_storeu_128(dst + c, v_res_b);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1; // next row, next mask entry
+ } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_a64_vmask_sse4_1(
+ uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+ const uint8_t *src0, uint32_t src0_stride,
+ const uint8_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+ // 8-bit entry point: picks a kernel from the low four bits of the width.
+ // Dimension: width_index (w & 0xf; power-of-two w gives 0, 1, 2, 4 or 8)
+ static const blend_fn blend[9] = {
+ blend_a64_vmask_w16n_sse4_1, // w % 16 == 0
+ vpx_blend_a64_vmask_c, // w == 1
+ vpx_blend_a64_vmask_c, // w == 2
+ NULL, // INVALID
+ blend_a64_vmask_w4_sse4_1, // w == 4
+ NULL, // INVALID
+ NULL, // INVALID
+ NULL, // INVALID
+ blend_a64_vmask_w8_sse4_1, // w == 8
+ };
+
+ assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+ assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w)); // this is what keeps the NULL slots unreachable
+ // NOTE(review): with asserts compiled out, a non-power-of-two w calls NULL.
+ blend[w & 0xf](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+ // 16-bit sample kernel, 4 pixels wide; 'blend' supplies the per-row math.
+ do {
+ const __m128i v_m0_w = _mm_set1_epi16(*mask); // src0 weight in every lane
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); // src1 weight
+
+ const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+ xx_storel_64(dst, v_res_w); // presumably writes the low 4 uint16_t only
+
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1; // next row, next mask entry
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ (void)w; // kept only so the signature matches the dispatcher's blend_fn
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h,
+ blend_4_b10); // table slot for bd 8 or 10, w == 4
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ (void)w; // kept only so the signature matches the dispatcher's blend_fn
+ blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h,
+ blend_4_b12); // table slot for bd 12, w == 4
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+ const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+ // 16-bit sample kernel for widths that are a multiple of 8; 'blend' does the math.
+ do {
+ int c;
+ const __m128i v_m0_w = _mm_set1_epi16(*mask); // src0 weight in every lane
+ const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w); // src1 weight
+ for (c = 0; c < w; c += 8) { // 8 uint16_t samples per step
+ const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+ xx_storeu_128(dst + c, v_res_w);
+ }
+ dst += dst_stride;
+ src0 += src0_stride;
+ src1 += src1_stride;
+ mask += 1; // next row, next mask entry
+ } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w,
+ blend_8_b10); // table slot for bd 8 or 10, w % 8 == 0
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(
+ uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w) {
+ blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+ src1_stride, mask, h, w,
+ blend_8_b12); // table slot for bd 12, w % 8 == 0
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_a64_vmask_sse4_1(
+ uint8_t *dst_8, uint32_t dst_stride,
+ const uint8_t *src0_8, uint32_t src0_stride,
+ const uint8_t *src1_8, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w, int bd) {
+ typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+ const uint16_t *src0, uint32_t src0_stride,
+ const uint16_t *src1, uint32_t src1_stride,
+ const uint8_t *mask, int h, int w);
+ // High-bitdepth entry point: selects a kernel by bit depth and width.
+ // Dimensions are: bd_index X width_index
+ static const blend_fn blend[2][2] = {
+ { // bd == 8 or 10
+ blend_a64_vmask_b10_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b10_w4_sse4_1, // w == 4
+ }, { // bd == 12
+ blend_a64_vmask_b12_w8n_sse4_1, // w % 8 == 0
+ blend_a64_vmask_b12_w4_sse4_1, // w == 4
+ }
+ };
+
+ assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+ assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+ assert(h >= 1);
+ assert(w >= 1);
+ assert(IS_POWER_OF_TWO(h));
+ assert(IS_POWER_OF_TWO(w));
+
+ assert(bd == 8 || bd == 10 || bd == 12);
+ // For powers of two, (h | w) & 3 is non-zero exactly when h or w is 1 or 2.
+ if (UNLIKELY((h | w) & 3)) { // if (w <= 2 || h <= 2)
+ vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride,
+ src0_8, src0_stride,
+ src1_8, src1_stride,
+ mask, h, w, bd); // C version takes the unconverted 8-bit pointers
+ } else {
+ uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+ const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+ const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+ // Here w >= 4, so (w >> 2) & 1 is 1 only for w == 4 and 0 for w % 8 == 0.
+ blend[bd == 12][(w >> 2) & 1](dst, dst_stride,
+ src0, src0_stride,
+ src1, src1_stride,
+ mask, h, w);
+ }
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000..9b74f90
--- /dev/null
+++ b/vpx_dsp/x86/blend_sse4.h
@@ -0,0 +1,145 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_BLEND_SSE4_H_
+#define VPX_DSP_X86_BLEND_SSE4_H_
+
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/x86/synonyms.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_32(src0); // presumably loads 4 bytes
+ const __m128i v_s1_b = xx_loadl_32(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); // zero-extend to 16 bit
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+ // Weighted sum src0 * m0 + src1 * m1 in 16-bit lanes (low product halves).
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ // Scale back down; xx_roundn_epu16 is presumably a rounding right shift.
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w; // results stay in 16-bit lanes; callers pack to bytes
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_b = xx_loadl_64(src0); // presumably loads 8 bytes
+ const __m128i v_s1_b = xx_loadl_64(src1);
+ const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b); // zero-extend to 16 bit
+ const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+ // Weighted sum src0 * m0 + src1 * m1 in 16-bit lanes (low product halves).
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ // Scale back down; xx_roundn_epu16 is presumably a rounding right shift.
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w; // results stay in 16-bit lanes; callers pack to bytes
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w); // 16-bit sample blend kernel
+
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0); // presumably 4 uint16_t samples
+ const __m128i v_s1_w = xx_loadl_64(src1);
+ // NOTE(review): 16-bit sums presumably rely on <= 10-bit samples - confirm.
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ // Scale back down; xx_roundn_epu16 is presumably a rounding right shift.
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0); // presumably 8 uint16_t samples
+ const __m128i v_s1_w = xx_loadu_128(src1);
+ // NOTE(review): 16-bit sums presumably rely on <= 10-bit samples - confirm.
+ const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+ const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+ const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+ // Scale back down; xx_roundn_epu16 is presumably a rounding right shift.
+ const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadl_64(src0); // presumably 4 uint16_t samples
+ const __m128i v_s1_w = xx_loadl_64(src1);
+
+ // Interleave so each 32-bit lane holds an (s0, s1) pair and its (m0, m1)
+ const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add: s0 * m0 + s1 * m1 accumulated at 32-bit width
+ const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+ // Scale: shift by one bit less than the full amount; the last bit is
+ const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d,
+ VPX_BLEND_A64_ROUND_BITS - 1); // left for the rounding step
+
+ // Pack back to 16-bit lanes (signed saturation; result duplicated)
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+ // Round: xx_round_epu16 presumably performs the final (x + 1) >> 1
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+ const __m128i v_m0_w, const __m128i v_m1_w) {
+ const __m128i v_s0_w = xx_loadu_128(src0); // presumably 8 uint16_t samples
+ const __m128i v_s1_w = xx_loadu_128(src1);
+
+ // Interleave so each 32-bit lane holds an (s0, s1) pair and its (m0, m1)
+ const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+ const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+ const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+ const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+ // Multiply-Add: s0 * m0 + s1 * m1 accumulated at 32-bit width
+ const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+ const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+ // Scale: shift by one bit less than the full amount (rounding comes last)
+ const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d,
+ VPX_BLEND_A64_ROUND_BITS - 1);
+ const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d,
+ VPX_BLEND_A64_ROUND_BITS - 1);
+
+ // Pack both halves back into eight 16-bit lanes (signed saturation)
+ const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+ // Round: xx_round_epu16 presumably performs the final (x + 1) >> 1
+ const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+ return v_res_w;
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VPX_DSP_X86_BLEND_SSE4_H_
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index 1dca1a8..48549ce 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -38,24 +38,15 @@
#define __builtin_prefetch(x)
#endif
-/* Shift down with rounding for use when n > 0 */
+/* Shift down with rounding for use when n >= 0, value >= 0 */
#define ROUND_POWER_OF_TWO(value, n) \
- (((value) + (1 << ((n) - 1))) >> (n))
+ (((value) + (((1 << (n)) >> 1))) >> (n))
-/* Shift down with rounding for use when n >= 0 */
-#define ROUNDZ_POWER_OF_TWO(value, n) \
- ((n) ? (((value) + (1 << ((n) - 1))) >> (n)) : (value))
-
-/* Shift down with rounding for signed integers, for use when n > 0 */
+/* Shift down with rounding for signed integers, for use when n >= 0 */
#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
(((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
: ROUND_POWER_OF_TWO((value), (n)))
-/* Shift down with rounding for signed integers, for use when n >= 0 */
-#define ROUNDZ_POWER_OF_TWO_SIGNED(value, n) \
- (((value) < 0) ? -ROUNDZ_POWER_OF_TWO(-(value), (n)) \
- : ROUNDZ_POWER_OF_TWO((value), (n)))
-
#define ALIGN_POWER_OF_TWO(value, n) \
(((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))