Optimize EXT_INTRA's filtered intra predictor (SSE4.1)
- Add unit tests to verify the bit-exact result.
- In the speed test, per-function speed (for each mode/tx_size)
improves by about 23%~35%.
- On E5-2680, park_joy_1080p, 10 frames, --kf-max-dist=1,
encoding time improves by about 1%~2%.
Change-Id: Id89f313d44eea562c02e775a6253dc4df7e046a9
diff --git a/test/reconintra_predictors_test.cc b/test/reconintra_predictors_test.cc
new file mode 100644
index 0000000..38720ba
--- /dev/null
+++ b/test/reconintra_predictors_test.cc
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp10/common/enums.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, block size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, int> PredParams;
+
+const int MaxBlkSize = 32;
+
+// By default, disable speed test
+#define PREDICTORS_SPEED_TEST (0)
+
+#if PREDICTORS_SPEED_TEST
+const int MaxTestNum = 100000;
+#else
+const int MaxTestNum = 100;
+#endif
+
+// Checks an optimized predictor against its reference, bit for bit.
+class VP10IntraPredOptimzTest : public ::testing::TestWithParam<PredParams> {
+ public:
+  VP10IntraPredOptimzTest()
+      : rnd_(ACMRandom::DeterministicSeed()), alloc_(NULL), pred_(NULL),
+        predRef_(NULL) {}
+  virtual ~VP10IntraPredOptimzTest() {}
+
+  // Unpacks the test parameters and allocates the working buffers.
+  virtual void SetUp() {
+    const PredFuncMode funcMode = GET_PARAM(0);
+    predFuncRef_ = std::tr1::get<0>(funcMode);
+    predFunc_ = std::tr1::get<1>(funcMode);
+    mode_ = std::tr1::get<2>(funcMode);
+    blockSize_ = GET_PARAM(1);
+
+    // new[] so that the delete[] calls in TearDown() are well defined.
+    alloc_ = new uint8_t[3 * MaxBlkSize + 2];
+    predRef_ = new uint8_t[MaxBlkSize * MaxBlkSize];
+    pred_ = new uint8_t[MaxBlkSize * MaxBlkSize];
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    delete[] predRef_;
+    delete[] pred_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  // Compares both predictors on MaxTestNum freshly randomized edge buffers.
+  void RunTest() const {
+    const int stride = blockSize_;
+    const uint8_t *const left = alloc_;
+    const uint8_t *const above = alloc_ + MaxBlkSize + 1;
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      PrepareBuffer();
+      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
+      ASM_REGISTER_STATE_CHECK(
+          predFunc_(pred_, stride, blockSize_, &above[1], left));
+      DiffPred(tstIndex);
+    }
+  }
+
+  // Runs the reference predictor MaxTestNum times on one input (speed test).
+  void RunSpeedTestC() const {
+    const int stride = blockSize_;
+    const uint8_t *const left = alloc_;
+    const uint8_t *const above = alloc_ + MaxBlkSize + 1;
+    PrepareBuffer();
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
+    }
+  }
+
+  // Runs the optimized predictor MaxTestNum times on one input (speed test).
+  void RunSpeedTestSSE() const {
+    const int stride = blockSize_;
+    const uint8_t *const left = alloc_;
+    const uint8_t *const above = alloc_ + MaxBlkSize + 1;
+    PrepareBuffer();
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      predFunc_(pred_, stride, blockSize_, &above[1], left);
+    }
+  }
+
+ private:
+  // Refills the edge pixels; rnd_ is seeded once, so each call differs.
+  void PrepareBuffer() const {
+    for (int i = 0; i < 3 * MaxBlkSize + 2; ++i) {
+      alloc_[i] = rnd_.Rand8();
+    }
+  }
+
+  void DiffPred(int testNum) const {
+    for (int i = 0; i < blockSize_ * blockSize_; ++i) {
+      EXPECT_EQ(predRef_[i], pred_[i])
+          << "Error at position: " << i << " "
+          << "Block size: " << blockSize_ << " "
+          << "Test number: " << testNum;
+    }
+  }
+
+  mutable ACMRandom rnd_;  // mutable: advanced from the const helpers above
+  Predictor predFunc_;
+  Predictor predFuncRef_;
+  int mode_;
+  int blockSize_;
+  uint8_t *alloc_;
+  uint8_t *pred_;
+  uint8_t *predRef_;
+};
+
+TEST_P(VP10IntraPredOptimzTest, BitExactCheck) {  // compares optimized and reference output for the current parameters
+ RunTest();
+}
+
+#if PREDICTORS_SPEED_TEST
+TEST_P(VP10IntraPredOptimzTest, SpeedCheckC) {  // only built when PREDICTORS_SPEED_TEST is non-zero
+ RunSpeedTestC();
+}
+
+TEST_P(VP10IntraPredOptimzTest, SpeedCheckSSE) {  // only built when PREDICTORS_SPEED_TEST is non-zero
+ RunSpeedTestSSE();
+}
+#endif
+
+using std::tr1::make_tuple;
+
+const PredFuncMode kPredFuncMdArray[] = {  // each entry: {C reference, SSE4.1 version, prediction mode}
+ make_tuple(vp10_dc_filter_predictor_c, vp10_dc_filter_predictor_sse4_1,
+ DC_PRED),
+ make_tuple(vp10_v_filter_predictor_c, vp10_v_filter_predictor_sse4_1,
+ V_PRED),
+ make_tuple(vp10_h_filter_predictor_c, vp10_h_filter_predictor_sse4_1,
+ H_PRED),
+ make_tuple(vp10_d45_filter_predictor_c, vp10_d45_filter_predictor_sse4_1,
+ D45_PRED),
+ make_tuple(vp10_d135_filter_predictor_c, vp10_d135_filter_predictor_sse4_1,
+ D135_PRED),
+ make_tuple(vp10_d117_filter_predictor_c, vp10_d117_filter_predictor_sse4_1,
+ D117_PRED),
+ make_tuple(vp10_d153_filter_predictor_c, vp10_d153_filter_predictor_sse4_1,
+ D153_PRED),
+ make_tuple(vp10_d207_filter_predictor_c, vp10_d207_filter_predictor_sse4_1,
+ D207_PRED),
+ make_tuple(vp10_d63_filter_predictor_c, vp10_d63_filter_predictor_sse4_1,
+ D63_PRED),
+ make_tuple(vp10_tm_filter_predictor_c, vp10_tm_filter_predictor_sse4_1,
+ TM_PRED),
+};
+
+const int kBlkSize[] = {4, 8, 16, 32};
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, VP10IntraPredOptimzTest,
+ ::testing::Combine(  // every predictor pair crossed with every block size
+ ::testing::ValuesIn(kPredFuncMdArray),
+ ::testing::ValuesIn(kBlkSize)));
+
+} // namespace
diff --git a/test/test.mk b/test/test.mk
index 4b4752f..346a9ba 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -149,6 +149,10 @@
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
endif
+ifeq ($(CONFIG_EXT_INTRA),yes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += reconintra_predictors_test.cc
+endif
+
ifeq ($(CONFIG_OBMC),yes)
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_sad_test.cc
LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_variance_test.cc
diff --git a/vp10/common/intra_filters.h b/vp10/common/intra_filters.h
new file mode 100644
index 0000000..664a7d6
--- /dev/null
+++ b/vp10/common/intra_filters.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_INTRA_FILTERS_H_
+#define VP10_COMMON_INTRA_FILTERS_H_
+
+#define FILTER_INTRA_PREC_BITS (10)
+
+static const int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {  // [tx size][mode][tap]; const: read-only in every includer
+  {
+      {735, 881, -537, -54},
+      {1005, 519, -488, -11},
+      {383, 990, -343, -6},
+      {442, 805, -542, 319},
+      {658, 616, -133, -116},
+      {875, 442, -141, -151},
+      {386, 741, -23, -80},
+      {390, 1027, -446, 51},
+      {679, 606, -523, 262},
+      {903, 922, -778, -23},
+  },
+  {
+      {648, 803, -444, 16},
+      {972, 620, -576, 7},
+      {561, 967, -499, -5},
+      {585, 762, -468, 144},
+      {596, 619, -182, -9},
+      {895, 459, -176, -153},
+      {557, 722, -126, -129},
+      {601, 839, -523, 105},
+      {562, 709, -499, 251},
+      {803, 872, -695, 43},
+  },
+  {
+      {423, 728, -347, 111},
+      {963, 685, -665, 23},
+      {281, 1024, -480, 216},
+      {640, 596, -437, 78},
+      {429, 669, -259, 99},
+      {740, 646, -415, 23},
+      {568, 771, -346, 40},
+      {404, 833, -486, 209},
+      {398, 712, -423, 307},
+      {939, 935, -887, 17},
+  },
+  {
+      {477, 737, -393, 150},
+      {881, 630, -546, 67},
+      {506, 984, -443, -20},
+      {114, 459, -270, 528},
+      {433, 528, 14, 3},
+      {837, 470, -301, -30},
+      {181, 777, 89, -107},
+      {-29, 716, -232, 259},
+      {589, 646, -495, 255},
+      {740, 884, -728, 77},
+  },
+};
+
+#endif // VP10_COMMON_INTRA_FILTERS_H_
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index cdcca4a..19d0c3d 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -10,6 +10,7 @@
#include <math.h>
+#include "./vp10_rtcd.h"
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_ports/system_state.h"
@@ -20,7 +21,9 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_once.h"
-
+#if CONFIG_EXT_INTRA
+#include "vp10/common/intra_filters.h"
+#endif
#include "vp10/common/reconintra.h"
#include "vp10/common/onyxc_int.h"
@@ -390,7 +393,6 @@
}
#if CONFIG_EXT_INTRA
-#define FILTER_INTRA_PREC_BITS 10
static const uint8_t ext_intra_extend_modes[FILTER_INTRA_MODES] = {
NEED_LEFT | NEED_ABOVE, // FILTER_DC
@@ -719,57 +721,6 @@
}
}
-static int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {
- {
- {735, 881, -537, -54},
- {1005, 519, -488, -11},
- {383, 990, -343, -6},
- {442, 805, -542, 319},
- {658, 616, -133, -116},
- {875, 442, -141, -151},
- {386, 741, -23, -80},
- {390, 1027, -446, 51},
- {679, 606, -523, 262},
- {903, 922, -778, -23},
- },
- {
- {648, 803, -444, 16},
- {972, 620, -576, 7},
- {561, 967, -499, -5},
- {585, 762, -468, 144},
- {596, 619, -182, -9},
- {895, 459, -176, -153},
- {557, 722, -126, -129},
- {601, 839, -523, 105},
- {562, 709, -499, 251},
- {803, 872, -695, 43},
- },
- {
- {423, 728, -347, 111},
- {963, 685, -665, 23},
- {281, 1024, -480, 216},
- {640, 596, -437, 78},
- {429, 669, -259, 99},
- {740, 646, -415, 23},
- {568, 771, -346, 40},
- {404, 833, -486, 209},
- {398, 712, -423, 307},
- {939, 935, -887, 17},
- },
- {
- {477, 737, -393, 150},
- {881, 630, -546, 67},
- {506, 984, -443, -20},
- {114, 459, -270, 528},
- {433, 528, 14, 3},
- {837, 470, -301, -30},
- {181, 777, 89, -107},
- {-29, 716, -232, 259},
- {589, 646, -495, 255},
- {740, 884, -728, 77},
- },
-};
-
static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above,
const uint8_t *left,
@@ -815,63 +766,94 @@
}
}
-static void dc_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
- const uint8_t *above, const uint8_t *left) {
+void vp10_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED);
}
-static void v_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED);
}
-static void h_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED);
}
-static void d45_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_d45_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED);
}
-static void d135_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_d135_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED);
}
-static void d117_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED);
}
-static void d153_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED);
}
-static void d207_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_d207_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED);
}
-static void d63_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_d63_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED);
}
-static void tm_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+void vp10_tm_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, int bs,
const uint8_t *above, const uint8_t *left) {
filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED);
}
-static void (*filter_intra_predictors[EXT_INTRA_MODES])(uint8_t *dst,
- ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) = {
- dc_filter_predictor, v_filter_predictor, h_filter_predictor,
- d45_filter_predictor, d135_filter_predictor, d117_filter_predictor,
- d153_filter_predictor, d207_filter_predictor, d63_filter_predictor,
- tm_filter_predictor,
-};
+static void filter_intra_predictors(int mode, uint8_t *dst,
+ ptrdiff_t stride, int bs,
+ const uint8_t *above, const uint8_t *left) {  // forwards to the vp10_*_filter_predictor entry point matching `mode`
+ switch (mode) {
+ case DC_PRED:
+ vp10_dc_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case V_PRED:
+ vp10_v_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case H_PRED:
+ vp10_h_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D45_PRED:
+ vp10_d45_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D135_PRED:
+ vp10_d135_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D117_PRED:
+ vp10_d117_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D153_PRED:
+ vp10_d153_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D207_PRED:
+ vp10_d207_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case D63_PRED:
+ vp10_d63_filter_predictor(dst, stride, bs, above, left);
+ break;
+ case TM_PRED:
+ vp10_tm_filter_predictor(dst, stride, bs, above, left);
+ break;
+ default:
+ assert(0);  // no filter predictor is wired up for any other mode; dst is left unwritten
+ }
+}
#if CONFIG_VP9_HIGHBITDEPTH
static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref,
@@ -1491,8 +1473,8 @@
#if CONFIG_EXT_INTRA
if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
- filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
- const_above_row, left_col);
+ filter_intra_predictors(ext_intra_mode, dst, dst_stride, bs,
+ const_above_row, left_col);
return;
}
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index 0ca48a3..6dbcc65 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -298,6 +298,30 @@
specialize qw/quantize_32x32_fp_nuq/;
}
+# EXT_INTRA filtered intra predictors: one prototype per prediction mode, each with an SSE4.1 specialization
+if (vpx_config("CONFIG_EXT_INTRA") eq "yes") {
+ add_proto qw/void vp10_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_dc_filter_predictor sse4_1/;
+ add_proto qw/void vp10_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_v_filter_predictor sse4_1/;
+ add_proto qw/void vp10_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_h_filter_predictor sse4_1/;
+ add_proto qw/void vp10_d45_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_d45_filter_predictor sse4_1/;
+ add_proto qw/void vp10_d135_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_d135_filter_predictor sse4_1/;
+ add_proto qw/void vp10_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_d117_filter_predictor sse4_1/;
+ add_proto qw/void vp10_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_d153_filter_predictor sse4_1/;
+ add_proto qw/void vp10_d207_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_d207_filter_predictor sse4_1/;
+ add_proto qw/void vp10_d63_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_d63_filter_predictor sse4_1/;
+ add_proto qw/void vp10_tm_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left";
+ specialize qw/vp10_tm_filter_predictor sse4_1/;
+}
+
# High bitdepth functions
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
diff --git a/vp10/common/x86/reconintra_sse4.c b/vp10/common/x86/reconintra_sse4.c
new file mode 100644
index 0000000..851d850
--- /dev/null
+++ b/vp10/common/x86/reconintra_sse4.c
@@ -0,0 +1,593 @@
+/*
+ * Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <smmintrin.h>
+
+#include "./vp10_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vp10/common/enums.h"
+#include "vp10/common/intra_filters.h"
+
+// Widens the first 8 bytes of `above` and of `left` to 16 bits and stores
+// their lane-wise sum in sum[0]. Exactly 8 bytes are read from each pointer.
+static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
+                                  __m128i *sum) {
+  const __m128i zero = _mm_setzero_si128();
+  // 64-bit loads suffice: the unpacklo below only consumes the low 8 bytes.
+  const __m128i a = _mm_loadl_epi64((const __m128i *)above);
+  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
+  sum[0] = _mm_add_epi16(_mm_unpacklo_epi8(a, zero),
+                         _mm_unpacklo_epi8(l, zero));
+}
+
+static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {  // returns the rounded mean of above[0..3] and left[0..3]
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);  // lane i = above[i] + left[i]
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+ u = _mm_srli_si128(sum_vector, 2);  // bring lane 1 down to lane 0
+ sum_vector = _mm_add_epi16(sum_vector, u);  // lane 0 = sum of the 8 pixels
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 4;  // rounding offset: half of the 8 samples
+ sum_value >>= 3;  // divide by 8
+ *params = _mm_set1_epi32(sum_value);  // mean replicated in four 32-bit lanes
+ return sum_value;
+}
+
+static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {  // returns the rounded mean of above[0..7] and left[0..7]
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsSmall(above, left, &sum_vector);  // lane i = above[i] + left[i]
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);  // bring lane 1 down to lane 0
+ sum_vector = _mm_add_epi16(sum_vector, u);  // lane 0 = sum of the 16 pixels
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 8;  // rounding offset: half of the 16 samples
+ sum_value >>= 4;  // divide by 16
+ *params = _mm_set1_epi32(sum_value);  // mean replicated in four 32-bit lanes
+ return sum_value;
+}
+
+static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
+ __m128i *sum) {  // sum[0] lane i = above[i] + left[i] + above[i + 8] + left[i + 8], as 16-bit values
+ const __m128i a = _mm_loadu_si128((const __m128i *)above);  // 16 bytes of each edge
+ const __m128i l = _mm_loadu_si128((const __m128i *)left);
+ const __m128i zero = _mm_setzero_si128();
+
+ __m128i u0 = _mm_unpacklo_epi8(a, zero);  // bytes 0..7 widened to 16 bits
+ __m128i u1 = _mm_unpacklo_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(u0, u1);
+
+ u0 = _mm_unpackhi_epi8(a, zero);  // bytes 8..15 widened to 16 bits
+ u1 = _mm_unpackhi_epi8(l, zero);
+
+ sum[0] = _mm_add_epi16(sum[0], u0);
+ sum[0] = _mm_add_epi16(sum[0], u1);
+}
+
+static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {  // returns the rounded mean of above[0..15] and left[0..15]
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector, u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector);  // 8 lanes, each the sum of 4 pixels
+
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 4 values
+ sum_vector = _mm_hadd_epi16(sum_vector, zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector, 2);  // bring lane 1 down to lane 0
+ sum_vector = _mm_add_epi16(sum_vector, u);  // lane 0 = sum of the 32 pixels
+
+ sum_value = _mm_extract_epi16(sum_vector, 0);
+ sum_value += 16;  // rounding offset: half of the 32 samples
+ sum_value >>= 5;  // divide by 32
+ *params = _mm_set1_epi32(sum_value);  // mean replicated in four 32-bit lanes
+ return sum_value;
+}
+
+static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
+ __m128i *params) {  // returns the rounded mean of above[0..31] and left[0..31]
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sum_vector[2], u;
+ uint16_t sum_value;
+
+ AddPixelsLarge(above, left, &sum_vector[0]);  // pixels 0..15 of both edges
+ AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);  // pixels 16..31 of both edges
+
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);  // 8 lanes, each the sum of 8 pixels
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 4 values
+ sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero); // still has 2 values
+
+ u = _mm_srli_si128(sum_vector[0], 2);  // bring lane 1 down to lane 0
+ sum_vector[0] = _mm_add_epi16(sum_vector[0], u);  // lane 0 = sum of the 64 pixels
+
+ sum_value = _mm_extract_epi16(sum_vector[0], 0);
+ sum_value += 32;  // rounding offset: half of the 64 samples
+ sum_value >>= 6;  // divide by 64
+ *params = _mm_set1_epi32(sum_value);  // mean replicated in four 32-bit lanes
+ return sum_value;
+}
+
+// Note:
+// Computes the rounded mean of the first bs pixels of `above` and of `left`.
+// *params receives that mean replicated in four int32_t lanes (callers pass
+// &prm[4]); the scalar mean is also returned. For an unsupported bs the
+// assert fires, *params is left untouched and 0 is returned.
+static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
+                                         const uint8_t *left,
+                                         int bs, __m128i *params) {
+  int mean = 0;
+
+  if (bs == 4) {
+    mean = GetMeanValue4x4(above, left, params);
+  } else if (bs == 8) {
+    mean = GetMeanValue8x8(above, left, params);
+  } else if (bs == 16) {
+    mean = GetMeanValue16x16(above, left, params);
+  } else if (bs == 32) {
+    mean = GetMeanValue32x32(above, left, params);
+  } else {
+    // No mean routine exists for this block size.
+    assert(0);
+  }
+
+  return mean;
+}
+
+// Note:
+// params[0-3] : 4-tap filter coefficients for the given block size and
+// prediction mode; params[i] carries coefficient i in all four int32_t
+// lanes.
+static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
+  const int *taps;
+  TX_SIZE tx_size = TX_4X4;  // used for every bs other than 8, 16 and 32
+
+  // Map the block size onto the first index of the tap table.
+  if (bs == 32) {
+    tx_size = TX_32X32;
+  } else if (bs == 16) {
+    tx_size = TX_16X16;
+  } else if (bs == 8) {
+    tx_size = TX_8X8;
+  }
+  taps = filter_intra_taps_4[tx_size][mode];
+
+  // c0
+  params[0] = _mm_set1_epi32(taps[0]);
+  // c1
+  params[1] = _mm_set1_epi32(taps[1]);
+  // c2
+  params[2] = _mm_set1_epi32(taps[2]);
+  // c3
+  params[3] = _mm_set1_epi32(taps[3]);
+}
+
+static const int maxBlkSize = 32;
+
+static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {  // adds mean[0] to a 4x4 block of int samples and stores it as bytes
+ const int predStride = (maxBlkSize << 1) + 1;  // ints per row of the scratch buffer
+ __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+ __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
+ __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
+ __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);  // 32 -> 16 bits with unsigned saturation
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);  // 16 -> 8 bits with unsigned saturation; p0 now holds all four rows
+
+ *((int *)dst) = _mm_cvtsi128_si32(p0);  // NOTE(review): 4-byte store through an int pointer; assumes this is acceptable for dst on the target
+ p0 = _mm_srli_si128(p0, 4);  // next row into the low 4 bytes
+ *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
+ p0 = _mm_srli_si128(p0, 4);
+ *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
+}
+
+static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {  // adds mean[0] to an 8x8 block of int samples and stores it as bytes
+ const int predStride = (maxBlkSize << 1) + 1;  // ints per row of the scratch buffer
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 8) {  // two rows per iteration: r is advanced twice below
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ r += 1;
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);  // 32 -> 16 bits with unsigned saturation
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);  // low 8 bytes: first row, high 8 bytes: second row
+
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ p0 = _mm_srli_si128(p0, 8);  // second row into the low 8 bytes
+ _mm_storel_epi64((__m128i *)dst, p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {  // adds mean[0] to a 16x16 block of int samples and stores it as bytes
+ const int predStride = (maxBlkSize << 1) + 1;  // ints per row of the scratch buffer
+ __m128i p0, p1, p2, p3;
+ int r = 0;
+
+ while (r < 16) {  // one 16-pixel row per iteration
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);  // 32 -> 16 bits with unsigned saturation
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);  // 16 -> 8 bits with unsigned saturation
+
+ _mm_storel_epi64((__m128i *)dst, p0);  // row written as two 8-byte halves
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+ dst += stride;
+ r += 1;
+ }
+}
+
+static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
+ ptrdiff_t stride) {  // adds mean[0] to a 32x32 block of int samples and stores it as bytes
+ const int predStride = (maxBlkSize << 1) + 1;  // ints per row of the scratch buffer
+ __m128i p0, p1, p2, p3, p4, p5, p6, p7;
+ int r = 0;
+
+ while (r < 32) {  // one 32-pixel row per iteration
+ p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
+ p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
+ p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
+ p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
+
+ p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
+ p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
+ p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
+ p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
+
+ p0 = _mm_add_epi32(p0, mean[0]);
+ p1 = _mm_add_epi32(p1, mean[0]);
+ p2 = _mm_add_epi32(p2, mean[0]);
+ p3 = _mm_add_epi32(p3, mean[0]);
+
+ p4 = _mm_add_epi32(p4, mean[0]);
+ p5 = _mm_add_epi32(p5, mean[0]);
+ p6 = _mm_add_epi32(p6, mean[0]);
+ p7 = _mm_add_epi32(p7, mean[0]);
+
+ p0 = _mm_packus_epi32(p0, p1);  // 32 -> 16 bits with unsigned saturation
+ p1 = _mm_packus_epi32(p2, p3);
+ p0 = _mm_packus_epi16(p0, p1);  // columns 0..15 as bytes
+
+ p4 = _mm_packus_epi32(p4, p5);
+ p5 = _mm_packus_epi32(p6, p7);
+ p4 = _mm_packus_epi16(p4, p5);  // columns 16..31 as bytes
+
+ _mm_storel_epi64((__m128i *)dst, p0);  // row written as four 8-byte pieces
+ p0 = _mm_srli_si128(p0, 8);
+ _mm_storel_epi64((__m128i *)(dst + 8), p0);
+
+ _mm_storel_epi64((__m128i *)(dst + 16), p4);
+ p4 = _mm_srli_si128(p4, 8);
+ _mm_storel_epi64((__m128i *)(dst + 24), p4);
+
+ dst += stride;
+ r += 1;
+ }
+}
+
+// Writes the bs x bs block held in the int scratch buffer `pred` to `dst`
+// through the size-specific store routine, which also adds the mean (passed
+// in `mean`) back to every sample. Block sizes other than 4, 8, 16 and 32
+// trip the assert and store nothing.
+static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
+                           ptrdiff_t stride) {
+  if (bs == 4) {
+    SavePred4x4(pred, mean, dst, stride);
+  } else if (bs == 8) {
+    SavePred8x8(pred, mean, dst, stride);
+  } else if (bs == 16) {
+    SavePred16x16(pred, mean, dst, stride);
+  } else if (bs == 32) {
+    SavePred32x32(pred, mean, dst, stride);
+  } else {
+    // No store routine exists for this size.
+    assert(0);
+  }
+}
+
+typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride);
+
+static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {  // writes pred[predStride + 1 .. predStride + 4]
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);  // despite the name this is the prm[1] tap, applied to the value just left of each output
+ int x = *(pred + predStride);  // left neighbour of the first output
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);  // lane k = p[0]*prm[2] + p[1]*prm[0] + p[2]*prm[3] for output k
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;  // x becomes the left input of the next output (serial dependency)
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+
+ sum = _mm_extract_epi32(u0, 3);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 4) = x;
+}
+
+static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {  // writes pred[predStride + 1 .. predStride + 3]
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);  // despite the name this is the prm[1] tap, applied to the value just left of each output
+ int x = *(pred + predStride);  // left neighbour of the first output
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);  // lane k = p[0]*prm[2] + p[1]*prm[0] + p[2]*prm[3]; lane 3 is unused here
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;  // x becomes the left input of the next output (serial dependency)
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+
+ sum = _mm_extract_epi32(u0, 2);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 3) = x;
+}
+
+static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {  // writes pred[predStride + 1 .. predStride + 2]
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);  // despite the name this is the prm[1] tap, applied to the value just left of each output
+ int x = *(pred + predStride);  // left neighbour of the first output
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);  // lane k = p[0]*prm[2] + p[1]*prm[0] + p[2]*prm[3]; lanes 2 and 3 are unused here
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;  // x becomes the left input of the next output (serial dependency)
+
+ sum = _mm_extract_epi32(u0, 1);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 2) = x;
+}
+
+static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
+ const int predStride) {  // writes pred[predStride + 1] only
+ __m128i u0, u1, u2;
+ int c0 = _mm_extract_epi32(prm[1], 0);  // despite the name this is the prm[1] tap, applied to the value just left of the output
+ int x = *(pred + predStride);  // left neighbour of the output
+ int sum;
+
+ u0 = _mm_mullo_epi32(p[0], prm[2]);
+ u1 = _mm_mullo_epi32(p[1], prm[0]);
+ u2 = _mm_mullo_epi32(p[2], prm[3]);
+
+ u0 = _mm_add_epi32(u0, u1);
+ u0 = _mm_add_epi32(u0, u2);  // only lane 0 is consumed below
+
+ sum = _mm_extract_epi32(u0, 0);
+ sum += c0 * x;
+ x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
+ *(pred + predStride + 1) = x;
+}
+
+static ProducePixelsFunc prodPixelsFuncTab[4] = {
+ ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels};
+
+// Produces up to four entries of the row below `pred`, starting one column
+// to its right. `remain` counts the entries available from `pred` onwards.
+static void ProducePixels(int *pred, const __m128i *prm, int remain) {
+  const int predStride = (maxBlkSize << 1) + 1;
+  int count = remain - 2;  // number of outputs still to produce
+  __m128i p[3];
+
+  // Nothing to produce: return before issuing any loads.
+  if (count <= 0) {
+    return;
+  }
+  if (count > 4) {
+    count = 4;  // one call yields at most four pixels
+  }
+  p[0] = _mm_loadu_si128((const __m128i *)pred);
+  p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
+  p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
+  prodPixelsFuncTab[count - 1](p, prm, pred, predStride);
+}
+
+// Note:
+// At column index c of row r, the remaining pixels are R = 2 * bs + 1 - r - c;
+// the number of pixels to produce is R - 2 = 2 * bs - r - c - 1.
+static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
+ const int bs, const __m128i *prm, int meanValue,
+ uint8_t *dst, ptrdiff_t stride) {
+ int pred[33][65];  // row 0 holds the above edge, column 0 the left edge; 65 matches predStride in the helpers
+ int r, c, colBound;
+ int remainings;
+
+ for (r = 0; r < bs; ++r) {
+ pred[r + 1][0] = (int)left[r] - meanValue;  // edges are stored with the mean removed
+ }
+
+ above -= 1;  // start from the pixel before above[0]
+ for (c = 0; c < 2 * bs + 1; ++c) {
+ pred[0][c] = (int)above[c] - meanValue;
+ }
+
+ r = 0;
+ c = 0;
+ while (r < bs) {  // each pass reads row r and fills row r + 1
+ colBound = (bs << 1) - r;  // shrinks by one entry per row
+ for (c = 0; c < colBound; c += 4) {  // up to four outputs per call
+ remainings = colBound - c + 1;
+ ProducePixels(&pred[r][c], prm, remainings);
+ }
+ r += 1;
+ }
+
+ SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);  // skips the edge row and column; prm[4] is the mean added back on store
+}
+
+static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
+                             __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
+  // prm[4] receives the broadcast mean; the scalar copy is passed along too.
+  const int mean = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
+  GeneratePrediction(above, left, bs, prm, mean, dst, stride);
+}
+
+void vp10_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i params[5];  // [0..3]: DC_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, DC_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i params[5];  // [0..3]: V_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, V_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                    const uint8_t *above, const uint8_t *left) {
+  __m128i params[5];  // [0..3]: H_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, H_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i params[5];  // [0..3]: D45_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, D45_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i params[5];  // [0..3]: D135_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, D135_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i params[5];  // [0..3]: D117_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, D117_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i params[5];  // [0..3]: D153_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, D153_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                       const uint8_t *above,
+                                       const uint8_t *left) {
+  __m128i params[5];  // [0..3]: D207_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, D207_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  __m128i params[5];  // [0..3]: D63_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, D63_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
+
+void vp10_tm_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m128i params[5];  // [0..3]: TM_PRED filter taps, [4]: broadcast mean
+  GetIntraFilterParams(bs, TM_PRED, params);
+  FilterPrediction(above, left, bs, params, dst, stride);
+}
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 44e0635..295ec96 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -118,6 +118,11 @@
VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
endif
+ifeq ($(CONFIG_EXT_INTRA),yes)
+VP10_COMMON_SRCS-yes += common/intra_filters.h
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/reconintra_sse4.c
+endif
+
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.c
VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_inv_txfm_sse2.h