Add SSE4.1 implementation of low-bitdepth (lbd) filter_intra modes
Speed up filter_intra_predictor() by 2.65x.
A unit test checking bit-exactness against the C version is also added.
Change-Id: I675fc07b0ec369aa3884bd8ebcd83d5c0f00e9cd
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index cd049a4..a89f8b3 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -89,4 +89,12 @@
return _mm_srai_epi32(v_tmp_d, bits);
}
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  // Rounding term per 16-bit lane: half the divisor, less one if negative.
+  const __m128i v_half_w = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i v_neg_w = _mm_srai_epi16(v_val_d, 15);  // 0 or -1 per lane
+  const __m128i v_adj_w = _mm_add_epi16(v_half_w, v_neg_w);
+  return _mm_srai_epi16(_mm_add_epi16(v_val_d, v_adj_w), bits);
+}
+
#endif // AOM_DSP_X86_SYNONYMS_H_
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f0f1309..a740547 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -411,6 +411,12 @@
"${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c")
endif ()
+if (CONFIG_FILTER_INTRA)
+ set(AOM_AV1_COMMON_INTRIN_SSE4_1
+ ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+ "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c")
+endif ()
+
set(AOM_AV1_COMMON_SOURCES
${AOM_AV1_COMMON_SOURCES}
"${AOM_ROOT}/av1/common/warped_motion.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 53c38b3..5f11b07 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -168,20 +168,8 @@
# FILTER_INTRA predictor functions
if (aom_config("CONFIG_FILTER_INTRA") eq "yes") {
- add_proto qw/void av1_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
- add_proto qw/void av1_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
- add_proto qw/void av1_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
- add_proto qw/void av1_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
- add_proto qw/void av1_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
- add_proto qw/void av1_paeth_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-
- # High bitdepth functions
- add_proto qw/void av1_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
- add_proto qw/void av1_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
- add_proto qw/void av1_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
- add_proto qw/void av1_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
- add_proto qw/void av1_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
- add_proto qw/void av1_highbd_paeth_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
+ add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
+ specialize qw/av1_filter_intra_predictor sse4_1/;
}
# High bitdepth functions
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index a16877f..d4dd816 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1090,64 +1090,65 @@
}
#if CONFIG_FILTER_INTRA
-static int filter_intra_taps_4x2procunit[FILTER_INTRA_MODES][8][7] = {
+DECLARE_ALIGNED(16, const int8_t,
+ av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
{
- { -6, 10, 0, 0, 0, 12, 0 },
- { -5, 2, 10, 0, 0, 9, 0 },
- { -3, 1, 1, 10, 0, 7, 0 },
- { -3, 1, 1, 2, 10, 5, 0 },
- { -4, 6, 0, 0, 0, 2, 12 },
- { -3, 2, 6, 0, 0, 2, 9 },
- { -3, 2, 2, 6, 0, 2, 7 },
- { -3, 1, 2, 2, 6, 3, 5 },
+ { -6, 10, 0, 0, 0, 12, 0, 0 },
+ { -5, 2, 10, 0, 0, 9, 0, 0 },
+ { -3, 1, 1, 10, 0, 7, 0, 0 },
+ { -3, 1, 1, 2, 10, 5, 0, 0 },
+ { -4, 6, 0, 0, 0, 2, 12, 0 },
+ { -3, 2, 6, 0, 0, 2, 9, 0 },
+ { -3, 2, 2, 6, 0, 2, 7, 0 },
+ { -3, 1, 2, 2, 6, 3, 5, 0 },
},
{
- { -10, 16, 0, 0, 0, 10, 0 },
- { -6, 0, 16, 0, 0, 6, 0 },
- { -4, 0, 0, 16, 0, 4, 0 },
- { -2, 0, 0, 0, 16, 2, 0 },
- { -10, 16, 0, 0, 0, 0, 10 },
- { -6, 0, 16, 0, 0, 0, 6 },
- { -4, 0, 0, 16, 0, 0, 4 },
- { -2, 0, 0, 0, 16, 0, 2 },
+ { -10, 16, 0, 0, 0, 10, 0, 0 },
+ { -6, 0, 16, 0, 0, 6, 0, 0 },
+ { -4, 0, 0, 16, 0, 4, 0, 0 },
+ { -2, 0, 0, 0, 16, 2, 0, 0 },
+ { -10, 16, 0, 0, 0, 0, 10, 0 },
+ { -6, 0, 16, 0, 0, 0, 6, 0 },
+ { -4, 0, 0, 16, 0, 0, 4, 0 },
+ { -2, 0, 0, 0, 16, 0, 2, 0 },
},
{
- { -8, 8, 0, 0, 0, 16, 0 },
- { -8, 0, 8, 0, 0, 16, 0 },
- { -8, 0, 0, 8, 0, 16, 0 },
- { -8, 0, 0, 0, 8, 16, 0 },
- { -4, 4, 0, 0, 0, 0, 16 },
- { -4, 0, 4, 0, 0, 0, 16 },
- { -4, 0, 0, 4, 0, 0, 16 },
- { -4, 0, 0, 0, 4, 0, 16 },
+ { -8, 8, 0, 0, 0, 16, 0, 0 },
+ { -8, 0, 8, 0, 0, 16, 0, 0 },
+ { -8, 0, 0, 8, 0, 16, 0, 0 },
+ { -8, 0, 0, 0, 8, 16, 0, 0 },
+ { -4, 4, 0, 0, 0, 0, 16, 0 },
+ { -4, 0, 4, 0, 0, 0, 16, 0 },
+ { -4, 0, 0, 4, 0, 0, 16, 0 },
+ { -4, 0, 0, 0, 4, 0, 16, 0 },
},
{
- { -2, 8, 0, 0, 0, 10, 0 },
- { -1, 3, 8, 0, 0, 6, 0 },
- { -1, 2, 3, 8, 0, 4, 0 },
- { 0, 1, 2, 3, 8, 2, 0 },
- { -1, 4, 0, 0, 0, 3, 10 },
- { -1, 3, 4, 0, 0, 4, 6 },
- { -1, 2, 3, 4, 0, 4, 4 },
- { -1, 2, 2, 3, 4, 3, 3 },
+ { -2, 8, 0, 0, 0, 10, 0, 0 },
+ { -1, 3, 8, 0, 0, 6, 0, 0 },
+ { -1, 2, 3, 8, 0, 4, 0, 0 },
+ { 0, 1, 2, 3, 8, 2, 0, 0 },
+ { -1, 4, 0, 0, 0, 3, 10, 0 },
+ { -1, 3, 4, 0, 0, 4, 6, 0 },
+ { -1, 2, 3, 4, 0, 4, 4, 0 },
+ { -1, 2, 2, 3, 4, 3, 3, 0 },
},
{
- { -12, 14, 0, 0, 0, 14, 0 },
- { -10, 0, 14, 0, 0, 12, 0 },
- { -9, 0, 0, 14, 0, 11, 0 },
- { -8, 0, 0, 0, 14, 10, 0 },
- { -10, 12, 0, 0, 0, 0, 14 },
- { -9, 1, 12, 0, 0, 0, 12 },
- { -8, 0, 0, 12, 0, 1, 11 },
- { -7, 0, 0, 1, 12, 1, 9 },
+ { -12, 14, 0, 0, 0, 14, 0, 0 },
+ { -10, 0, 14, 0, 0, 12, 0, 0 },
+ { -9, 0, 0, 14, 0, 11, 0, 0 },
+ { -8, 0, 0, 0, 14, 10, 0, 0 },
+ { -10, 12, 0, 0, 0, 0, 14, 0 },
+ { -9, 1, 12, 0, 0, 0, 12, 0 },
+ { -8, 0, 0, 12, 0, 1, 11, 0 },
+ { -7, 0, 0, 1, 12, 1, 9, 0 },
},
};
-static void filter_intra_predictor(uint8_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint8_t *above,
- const uint8_t *left, int mode) {
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {
int r, c;
- int buffer[33][33];
+ uint8_t buffer[33][33];
const int bw = tx_size_wide[tx_size];
const int bh = tx_size_high[tx_size];
@@ -1157,100 +1158,47 @@
for (r = 0; r < bh + 1; ++r)
memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
- for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
-
- for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
for (r = 1; r < bh + 1; r += 2)
for (c = 1; c < bw + 1; c += 4) {
- const int p0 = buffer[r - 1][c - 1];
- const int p1 = buffer[r - 1][c];
- const int p2 = buffer[r - 1][c + 1];
- const int p3 = buffer[r - 1][c + 2];
- const int p4 = buffer[r - 1][c + 3];
- const int p5 = buffer[r][c - 1];
- const int p6 = buffer[r + 1][c - 1];
+ const uint8_t p0 = buffer[r - 1][c - 1];
+ const uint8_t p1 = buffer[r - 1][c];
+ const uint8_t p2 = buffer[r - 1][c + 1];
+ const uint8_t p3 = buffer[r - 1][c + 2];
+ const uint8_t p4 = buffer[r - 1][c + 3];
+ const uint8_t p5 = buffer[r][c - 1];
+ const uint8_t p6 = buffer[r + 1][c - 1];
for (int k = 0; k < 8; ++k) {
int r_offset = k >> 2;
int c_offset = k & 0x03;
buffer[r + r_offset][c + c_offset] =
- filter_intra_taps_4x2procunit[mode][k][0] * p0 +
- filter_intra_taps_4x2procunit[mode][k][1] * p1 +
- filter_intra_taps_4x2procunit[mode][k][2] * p2 +
- filter_intra_taps_4x2procunit[mode][k][3] * p3 +
- filter_intra_taps_4x2procunit[mode][k][4] * p4 +
- filter_intra_taps_4x2procunit[mode][k][5] * p5 +
- filter_intra_taps_4x2procunit[mode][k][6] * p6;
- buffer[r + r_offset][c + c_offset] =
clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
- buffer[r + r_offset][c + c_offset], FILTER_INTRA_SCALE_BITS));
+ av1_filter_intra_taps[mode][k][0] * p0 +
+ av1_filter_intra_taps[mode][k][1] * p1 +
+ av1_filter_intra_taps[mode][k][2] * p2 +
+ av1_filter_intra_taps[mode][k][3] * p3 +
+ av1_filter_intra_taps[mode][k][4] * p4 +
+ av1_filter_intra_taps[mode][k][5] * p5 +
+ av1_filter_intra_taps[mode][k][6] * p6,
+ FILTER_INTRA_SCALE_BITS));
}
}
for (r = 0; r < bh; ++r) {
- for (c = 0; c < bw; ++c) {
- dst[c] = buffer[r + 1][c + 1];
- }
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
dst += stride;
}
}
-void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
- const uint8_t *above, const uint8_t *left) {
- filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_DC_PRED);
-}
-
-void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
- const uint8_t *above, const uint8_t *left) {
- filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_V_PRED);
-}
-
-void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
- const uint8_t *above, const uint8_t *left) {
- filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_H_PRED);
-}
-
-void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint8_t *above,
- const uint8_t *left) {
- filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_D153_PRED);
-}
-
-void av1_paeth_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint8_t *above,
- const uint8_t *left) {
- filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_PAETH_PRED);
-}
-
-static void filter_intra_predictors(FILTER_INTRA_MODE mode, uint8_t *dst,
- ptrdiff_t stride, TX_SIZE tx_size,
- const uint8_t *above, const uint8_t *left) {
- switch (mode) {
- case FILTER_DC_PRED:
- av1_dc_filter_predictor(dst, stride, tx_size, above, left);
- break;
- case FILTER_V_PRED:
- av1_v_filter_predictor(dst, stride, tx_size, above, left);
- break;
- case FILTER_H_PRED:
- av1_h_filter_predictor(dst, stride, tx_size, above, left);
- break;
- case FILTER_D153_PRED:
- av1_d153_filter_predictor(dst, stride, tx_size, above, left);
- break;
- case FILTER_PAETH_PRED:
- av1_paeth_filter_predictor(dst, stride, tx_size, above, left);
- break;
- default: assert(0);
- }
-}
static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
TX_SIZE tx_size,
const uint16_t *above,
const uint16_t *left, int mode,
int bd) {
int r, c;
- int buffer[33][33];
+ uint16_t buffer[33][33];
const int bw = tx_size_wide[tx_size];
const int bh = tx_size_high[tx_size];
@@ -1260,104 +1208,40 @@
for (r = 0; r < bh + 1; ++r)
memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
- for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
-
- for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
for (r = 1; r < bh + 1; r += 2)
for (c = 1; c < bw + 1; c += 4) {
- const int p0 = buffer[r - 1][c - 1];
- const int p1 = buffer[r - 1][c];
- const int p2 = buffer[r - 1][c + 1];
- const int p3 = buffer[r - 1][c + 2];
- const int p4 = buffer[r - 1][c + 3];
- const int p5 = buffer[r][c - 1];
- const int p6 = buffer[r + 1][c - 1];
+ const uint16_t p0 = buffer[r - 1][c - 1];
+ const uint16_t p1 = buffer[r - 1][c];
+ const uint16_t p2 = buffer[r - 1][c + 1];
+ const uint16_t p3 = buffer[r - 1][c + 2];
+ const uint16_t p4 = buffer[r - 1][c + 3];
+ const uint16_t p5 = buffer[r][c - 1];
+ const uint16_t p6 = buffer[r + 1][c - 1];
for (int k = 0; k < 8; ++k) {
int r_offset = k >> 2;
int c_offset = k & 0x03;
buffer[r + r_offset][c + c_offset] =
- filter_intra_taps_4x2procunit[mode][k][0] * p0 +
- filter_intra_taps_4x2procunit[mode][k][1] * p1 +
- filter_intra_taps_4x2procunit[mode][k][2] * p2 +
- filter_intra_taps_4x2procunit[mode][k][3] * p3 +
- filter_intra_taps_4x2procunit[mode][k][4] * p4 +
- filter_intra_taps_4x2procunit[mode][k][5] * p5 +
- filter_intra_taps_4x2procunit[mode][k][6] * p6;
- buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO_SIGNED(buffer[r + r_offset][c + c_offset],
- FILTER_INTRA_SCALE_BITS),
- bd);
+ clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED(
+ av1_filter_intra_taps[mode][k][0] * p0 +
+ av1_filter_intra_taps[mode][k][1] * p1 +
+ av1_filter_intra_taps[mode][k][2] * p2 +
+ av1_filter_intra_taps[mode][k][3] * p3 +
+ av1_filter_intra_taps[mode][k][4] * p4 +
+ av1_filter_intra_taps[mode][k][5] * p5 +
+ av1_filter_intra_taps[mode][k][6] * p6,
+ FILTER_INTRA_SCALE_BITS),
+ bd);
}
}
for (r = 0; r < bh; ++r) {
- for (c = 0; c < bw; ++c) {
- dst[c] = buffer[r + 1][c + 1];
- }
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0]));
dst += stride;
}
}
-
-void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
- FILTER_DC_PRED, bd);
-}
-
-void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
- FILTER_V_PRED, bd);
-}
-
-void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
- FILTER_H_PRED, bd);
-}
-
-void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
- FILTER_D153_PRED, bd);
-}
-
-void av1_highbd_paeth_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size, const uint16_t *above,
- const uint16_t *left, int bd) {
- highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
- FILTER_PAETH_PRED, bd);
-}
-
-static void highbd_filter_intra_predictors(FILTER_INTRA_MODE mode,
- uint16_t *dst, ptrdiff_t stride,
- TX_SIZE tx_size,
- const uint16_t *above,
- const uint16_t *left, int bd) {
- switch (mode) {
- case FILTER_DC_PRED:
- av1_highbd_dc_filter_predictor(dst, stride, tx_size, above, left, bd);
- break;
- case FILTER_V_PRED:
- av1_highbd_v_filter_predictor(dst, stride, tx_size, above, left, bd);
- break;
- case FILTER_H_PRED:
- av1_highbd_h_filter_predictor(dst, stride, tx_size, above, left, bd);
- break;
- case FILTER_D153_PRED:
- av1_highbd_d153_filter_predictor(dst, stride, tx_size, above, left, bd);
- break;
- case FILTER_PAETH_PRED:
- av1_highbd_paeth_filter_predictor(dst, stride, tx_size, above, left, bd);
- break;
- default: assert(0);
- }
-}
#endif // CONFIG_FILTER_INTRA
#if CONFIG_INTRA_EDGE
@@ -1769,8 +1653,8 @@
#if CONFIG_FILTER_INTRA
if (use_filter_intra) {
- highbd_filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size,
- above_row, left_col, xd->bd);
+ highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode, xd->bd);
return;
}
#endif // CONFIG_FILTER_INTRA
@@ -1992,8 +1876,8 @@
#if CONFIG_FILTER_INTRA
if (use_filter_intra) {
- filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size,
- above_row, left_col);
+ av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+ filter_intra_mode);
return;
}
#endif // CONFIG_FILTER_INTRA
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index cfa3357..ba71694 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -66,6 +66,10 @@
}
#endif // CONFIG_INTRABC
+#if CONFIG_FILTER_INTRA
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+#endif
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 0000000..fd3df2c
--- /dev/null
+++ b/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include "./av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+ TX_SIZE tx_size, const uint8_t *above,
+ const uint8_t *left, int mode) {  // SSE4.1 av1_filter_intra_predictor
+ int r, c;
+ uint8_t buffer[33][33];  // row 0 = above edge, column 0 = left edge
+ const int bw = tx_size_wide[tx_size];
+ const int bh = tx_size_high[tx_size];
+
+ assert(bw <= 32 && bh <= 32);
+
+ // Zero-fill only to silence Jenkins static analysis warnings.
+ for (r = 0; r < bh + 1; ++r)
+ memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+ for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];  // left edge
+ memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));  // corner + above
+
+ const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);  // rows 0-1
+ const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);  // rows 2-3
+ const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);  // rows 4-5
+ const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);  // rows 6-7
+ const __m128i filter_intra_scale_bits =
+ _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));  // mulhrs multiplier
+
+ for (r = 1; r < bh + 1; r += 2) {  // each (r, c) is one 4x2 output unit
+ for (c = 1; c < bw + 1; c += 4) {  // reads pixels stored by earlier units
+ DECLARE_ALIGNED(16, uint8_t, p[8]);
+ memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));  // p0..p4: row above
+ p[5] = buffer[r][c - 1];  // left neighbour of the unit's first row
+ p[6] = buffer[r + 1][c - 1];  // left neighbour of the unit's second row
+ p[7] = 0;  // padding; pairs with the zero eighth tap
+ const __m128i p_b = xx_loadl_64(p);
+ const __m128i in = _mm_unpacklo_epi64(p_b, p_b);  // same inputs in both halves
+ const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);  // u8 x s8, pairs summed
+ const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
+ const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
+ const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
+ const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
+ const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
+ const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);  // lane k = pixel k
+ // Rounding: mulhrs by 1 << (15 - bits) gives (sum + (1 << (bits - 1))) >> bits
+ const __m128i round_w =
+ _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
+ const __m128i out_r = _mm_packus_epi16(round_w, round_w);  // saturate to [0, 255]
+ const __m128i out_r1 = _mm_srli_si128(out_r, 4);  // bytes 4..7 moved to bytes 0..3
+ // Storing: pixels 0..3 go to row r, pixels 4..7 to row r + 1
+ xx_storel_32(&buffer[r][c], out_r);
+ xx_storel_32(&buffer[r + 1][c], out_r1);
+ }
+ }
+
+ for (r = 0; r < bh; ++r) {  // copy the block out, skipping the edge row/column
+ memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
+ dst += stride;
+ }
+}
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
new file mode 100644
index 0000000..ed442a7
--- /dev/null
+++ b/test/filterintra_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+using std::tr1::tuple;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+ const uint8_t *above, const uint8_t *left, int mode);
+
+// Note:
+// Test parameter list:
+// Reference predictor, optimized predictor, prediction mode, tx size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, TX_SIZE> PredParams;
+
+const int MaxTxSize = 32;
+
+const int MaxTestNum = 100;
+
+// Checks that an optimized filter-intra predictor is bit-exact with the C
+// reference for one (mode, transform size) pair over random edge pixels.
+class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
+ public:
+  virtual ~AV1FilterIntraPredTest() {}
+  virtual void SetUp() {
+    PredFuncMode funcMode = GET_PARAM(0);
+    predFuncRef_ = std::tr1::get<0>(funcMode);
+    predFunc_ = std::tr1::get<1>(funcMode);
+    mode_ = std::tr1::get<2>(funcMode);
+    txSize_ = GET_PARAM(1);
+
+    // alloc_ layout: left column in [0, MaxTxSize), the above-left corner at
+    // MaxTxSize, then the above row.
+    alloc_ = new uint8_t[2 * MaxTxSize + 1];
+    predRef_ = new uint8_t[MaxTxSize * MaxTxSize];
+    pred_ = new uint8_t[MaxTxSize * MaxTxSize];
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    delete[] predRef_;
+    delete[] pred_;
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RunTest() const {
+    // Seeded once so that every iteration draws a different edge buffer.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int stride = tx_size_wide[txSize_];
+    const uint8_t *left = alloc_;
+    const uint8_t *above = alloc_ + MaxTxSize;
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      PrepareBuffer(&rnd);
+      predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+      ASM_REGISTER_STATE_CHECK(
+          predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
+      DiffPred(tstIndex);
+    }
+  }
+
+ private:
+  // Fills the whole edge buffer (left column, corner, above row) from rnd.
+  void PrepareBuffer(ACMRandom *rnd) const {
+    for (int i = 0; i < 2 * MaxTxSize + 1; ++i) alloc_[i] = rnd->Rand8();
+  }
+
+  // Reports every position where the optimized output differs from the C one.
+  void DiffPred(int testNum) const {
+    const int numPixels = tx_size_wide[txSize_] * tx_size_high[txSize_];
+    for (int i = 0; i < numPixels; ++i) {
+      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+                                       << "Tx size: " << tx_size_wide[txSize_]
+                                       << "x" << tx_size_high[txSize_] << " "
+                                       << "Test number: " << testNum;
+    }
+  }
+
+  Predictor predFunc_;
+  Predictor predFuncRef_;
+  int mode_;
+  TX_SIZE txSize_;
+  uint8_t *alloc_;
+  uint8_t *pred_;
+  uint8_t *predRef_;
+};
+
+TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
+
+using std::tr1::make_tuple;
+
+const PredFuncMode kPredFuncMdArray[] = {
+ make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+ FILTER_DC_PRED),
+ make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+ FILTER_V_PRED),
+ make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+ FILTER_H_PRED),
+ make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+ FILTER_D153_PRED),
+ make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+ FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSize[] = { TX_4X4, TX_8X8, TX_16X16, TX_32X32, TX_4X8,
+ TX_8X4, TX_8X16, TX_16X8, TX_16X32, TX_32X16,
+ TX_4X16, TX_16X4, TX_8X32, TX_32X8 };
+
+INSTANTIATE_TEST_CASE_P(
+ SSE4_1, AV1FilterIntraPredTest,
+ ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
+ ::testing::ValuesIn(kTxSize)));
+} // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 9f9e89e..f49e141 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -149,6 +149,14 @@
"${AOM_ROOT}/test/intrabc_test.cc")
endif ()
+ if (CONFIG_FILTER_INTRA)
+ if (HAVE_SSE4_1)
+ set(AOM_UNIT_TEST_COMMON_SOURCES
+ ${AOM_UNIT_TEST_COMMON_SOURCES}
+ "${AOM_ROOT}/test/filterintra_test.cc")
+ endif ()
+ endif ()
+
if (CONFIG_CFL)
set(AOM_UNIT_TEST_COMMON_SOURCES
${AOM_UNIT_TEST_COMMON_SOURCES}