Add SSE2 dc, dc_left, dc_top, dc_128, v, and h
16x64 intra predictors.
Change-Id: I71ec2d180880911a19ef427da00f5c2b8c35b74b
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index c72e063..66828f7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -90,6 +90,7 @@
specialize qw/aom_dc_top_predictor_16x8 sse2/;
specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_16x64 sse2/;
specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
@@ -104,6 +105,7 @@
specialize qw/aom_dc_left_predictor_16x8 sse2/;
specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_16x64 sse2/;
specialize qw/aom_dc_left_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_left_predictor_32x32 msa neon sse2 avx2/;
specialize qw/aom_dc_left_predictor_32x64 sse2 avx2/;
@@ -118,6 +120,7 @@
specialize qw/aom_dc_128_predictor_16x8 sse2/;
specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_16x64 sse2/;
specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
@@ -132,6 +135,7 @@
specialize qw/aom_v_predictor_16x8 sse2/;
specialize qw/aom_v_predictor_16x16 neon msa sse2/;
specialize qw/aom_v_predictor_16x32 sse2/;
+specialize qw/aom_v_predictor_16x64 sse2/;
specialize qw/aom_v_predictor_32x16 sse2 avx2/;
specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
specialize qw/aom_v_predictor_32x64 sse2 avx2/;
@@ -146,6 +150,7 @@
specialize qw/aom_h_predictor_16x8 sse2/;
specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
specialize qw/aom_h_predictor_16x32 sse2/;
+specialize qw/aom_h_predictor_16x64 sse2/;
specialize qw/aom_h_predictor_32x16 sse2/;
specialize qw/aom_h_predictor_32x32 neon msa sse2 avx2/;
specialize qw/aom_paeth_predictor_4x4 ssse3/;
@@ -194,6 +199,7 @@
specialize qw/aom_dc_predictor_16x8 sse2/;
specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
specialize qw/aom_dc_predictor_16x32 sse2/;
+specialize qw/aom_dc_predictor_16x64 sse2/;
specialize qw/aom_dc_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_predictor_32x32 msa neon sse2 avx2/;
specialize qw/aom_dc_predictor_32x64 sse2 avx2/;
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 67bbd44..5f5a20f 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -182,6 +182,19 @@
dc_store_16xh(&row, 32, dst, stride);
}
+void aom_dc_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {  // SSE2 DC predictor for a 16x64 block
+ const __m128i sum_left = dc_sum_64(left);  // presumably the sum of 64 left bytes -- confirm helper
+ __m128i sum_above = dc_sum_16(above);  // presumably the sum of 16 above bytes -- confirm helper
+ sum_above = _mm_add_epi16(sum_left, sum_above);  // add the two partial sums per 16-bit lane
+
+ uint32_t sum = _mm_cvtsi128_si32(sum_above);  // low 32 bits; assumes the total sits there with upper bits clear -- TODO confirm
+ sum += 40;  // half of the divisor below, so the division rounds to nearest
+ sum /= 80;  // 80 = 16 + 64, the two helper sizes used above
+ const __m128i row = _mm_set1_epi8((uint8_t)sum);  // broadcast the result into all 16 bytes
+ dc_store_16xh(&row, 64, dst, stride);  // presumably stores `row` to 64 rows of dst -- confirm helper
+}
+
void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
__m128i sum_above = dc_sum_32(above);
@@ -315,6 +328,20 @@
dc_store_16xh(&row, 32, dst, stride);
}
+void aom_dc_top_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // SSE2 DC_TOP predictor for 16x64: reads `above` only
+ (void)left;  // intentionally unused
+ __m128i sum_above = dc_sum_16(above);  // presumably the sum of 16 above bytes -- confirm helper
+ const __m128i eight = _mm_set1_epi16((uint16_t)8);  // rounding bias: half of 16
+ sum_above = _mm_add_epi16(sum_above, eight);
+ sum_above = _mm_srai_epi16(sum_above, 4);  // shift right by 4, i.e. divide by 16
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // double each low byte; 16-bit lane 0 now holds the value twice
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);  // copy 16-bit lane 0 across the low 64 bits
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // mirror the low 64 bits into the high 64 bits
+ dc_store_16xh(&row, 64, dst, stride);  // presumably stores `row` to 64 rows of dst -- confirm helper
+}
+
void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
@@ -455,6 +482,20 @@
dc_store_16xh(&row, 32, dst, stride);
}
+void aom_dc_left_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // SSE2 DC_LEFT predictor for 16x64: reads `left` only
+ (void)above;  // intentionally unused
+ __m128i sum_left = dc_sum_64(left);  // presumably the sum of 64 left bytes -- confirm helper
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias: half of 64
+ sum_left = _mm_add_epi16(sum_left, thirtytwo);
+ sum_left = _mm_srai_epi16(sum_left, 6);  // shift right by 6, i.e. divide by 64
+ sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // double each low byte; 16-bit lane 0 now holds the value twice
+ sum_left = _mm_shufflelo_epi16(sum_left, 0);  // copy 16-bit lane 0 across the low 64 bits
+ const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);  // mirror the low 64 bits into the high 64 bits
+ dc_store_16xh(&row, 64, dst, stride);  // presumably stores `row` to 64 rows of dst -- confirm helper
+}
+
void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
@@ -569,6 +610,15 @@
dc_store_16xh(&row, 32, dst, stride);
}
+void aom_dc_128_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // SSE2 DC_128 predictor for 16x64: constant fill, no neighbours read
+ (void)above;  // intentionally unused
+ (void)left;  // intentionally unused
+ const __m128i row = _mm_set1_epi8((uint8_t)128);  // all 16 bytes set to 128
+ dc_store_16xh(&row, 64, dst, stride);  // presumably stores `row` to 64 rows of dst -- confirm helper
+}
+
void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
@@ -652,6 +702,13 @@
dc_store_16xh(&row, 32, dst, stride);
}
+void aom_v_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {  // SSE2 vertical predictor for 16x64: reads `above` only
+ const __m128i row = _mm_load_si128((__m128i const *)above);  // aligned 16-byte load: `above` must be 16-byte aligned
+ (void)left;  // intentionally unused
+ dc_store_16xh(&row, 64, dst, stride);  // presumably stores `row` to 64 rows of dst -- confirm helper
+}
+
static INLINE void v_predictor_32xh(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, int height) {
const __m128i row0 = _mm_load_si128((__m128i const *)above);
@@ -882,29 +939,38 @@
h_prediction_16x8_2(&left_col_8p, dst, stride);
}
-void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
- const uint8_t *above, const uint8_t *left) {
- __m128i left_col, left_col_8p;
- (void)above;
+static INLINE void h_predictor_16xh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *left, int count) {  // shared 16-wide horizontal predictor; each pass covers 16 rows
int i = 0;
-
do {
- left_col = _mm_load_si128((const __m128i *)left);
- left_col_8p = _mm_unpacklo_epi8(left_col, left_col);
- h_prediction_16x8_1(&left_col_8p, dst, stride);
+ const __m128i left_col = _mm_load_si128((const __m128i *)left);  // aligned 16-byte load: `left` must be 16-byte aligned
+ const __m128i left_col_8p_lo = _mm_unpacklo_epi8(left_col, left_col);  // low 8 bytes of left_col, each doubled
+ h_prediction_16x8_1(&left_col_8p_lo, dst, stride);  // presumably writes 4 rows (dst then advances 4 rows) -- confirm helper
dst += stride << 2;
- h_prediction_16x8_2(&left_col_8p, dst, stride);
+ h_prediction_16x8_2(&left_col_8p_lo, dst, stride);  // presumably the next 4 rows from the same vector -- confirm helper
dst += stride << 2;
- left_col_8p = _mm_unpackhi_epi8(left_col, left_col);
- h_prediction_16x8_1(&left_col_8p, dst, stride);
+ const __m128i left_col_8p_hi = _mm_unpackhi_epi8(left_col, left_col);  // high 8 bytes of left_col, each doubled
+ h_prediction_16x8_1(&left_col_8p_hi, dst, stride);
dst += stride << 2;
- h_prediction_16x8_2(&left_col_8p, dst, stride);
+ h_prediction_16x8_2(&left_col_8p_hi, dst, stride);
dst += stride << 2;
left += 16;
i++;
- } while (i < 2);
+ } while (i < count);  // do-while: the body runs once even when count <= 0
+}
+
+void aom_h_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {  // SSE2 horizontal predictor for 16x32: reads `left` only
+ (void)above;  // intentionally unused
+ h_predictor_16xh(dst, stride, left, 2);  // 2 passes of 16 rows = 32 rows
+}
+
+void aom_h_predictor_16x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {  // SSE2 horizontal predictor for 16x64: reads `left` only
+ (void)above;  // intentionally unused
+ h_predictor_16xh(dst, stride, left, 4);  // 4 passes of 16 rows = 64 rows
}
static INLINE void h_pred_store_32xh(const __m128i *row, int h, uint8_t *dst,
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 43a22f7..3b5e2cc 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -602,6 +602,11 @@
aom_dc_top_predictor_16x32_sse2,
aom_dc_128_predictor_16x32_sse2, aom_v_predictor_16x32_sse2,
aom_h_predictor_16x32_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_4, TX_16X64, aom_dc_predictor_16x64_sse2,
+ aom_dc_left_predictor_16x64_sse2,
+ aom_dc_top_predictor_16x64_sse2,
+ aom_dc_128_predictor_16x64_sse2, aom_v_predictor_16x64_sse2,
+ aom_h_predictor_16x64_sse2, NULL, NULL, NULL, NULL)
#endif // HAVE_SSE2
#if HAVE_SSSE3