Add SSE2/AVX2 versions of aom_dc_top_predictor_wxh()
for width or height equal to 64.
Change-Id: I8ad49240f7353130c354317ecddc8ba334d832ad
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9437e13..021c4c9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -92,6 +92,10 @@
specialize qw/aom_dc_top_predictor_16x32 sse2/;
specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
specialize qw/aom_dc_left_predictor_4x8 sse2/;
specialize qw/aom_dc_left_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index c3d6829..89f0c5f 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -208,6 +208,62 @@
row_store_32xh(&row, 16, dst, stride);
}
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_32(above);  // helper not shown; presumably sums 32 above pixels
+ (void)left;  // unused: only `above` is read
+ // DC value = (sum + 16) >> 5, i.e. divide by 32 rounding to nearest.
+ const __m256i sixteen = _mm256_set1_epi16(16);
+ sum = _mm256_add_epi16(sum, sixteen);
+ sum = _mm256_srai_epi16(sum, 5);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);  // zero control: byte 0 of each 128-bit lane fills that lane
+ row_store_32xh(&row, 64, dst, stride);  // helper not shown; presumably writes `row` to 64 rows
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);  // helper not shown; presumably sums 64 above pixels
+ (void)left;  // unused: only `above` is read
+ // DC value = (sum + 32) >> 6, i.e. divide by 64 rounding to nearest.
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);  // zero control: byte 0 of each 128-bit lane fills that lane
+ row_store_64xh(&row, 64, dst, stride);  // helper not shown; presumably writes `row` to 64 rows
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);  // helper not shown; presumably sums 64 above pixels
+ (void)left;  // unused: only `above` is read
+ // DC value = (sum + 32) >> 6, i.e. divide by 64 rounding to nearest.
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);  // zero control: byte 0 of each 128-bit lane fills that lane
+ row_store_64xh(&row, 32, dst, stride);  // helper not shown; presumably writes `row` to 32 rows
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ __m256i sum = dc_sum_64(above);  // helper not shown; presumably sums 64 above pixels
+ (void)left;  // unused: only `above` is read
+ // DC value = (sum + 32) >> 6, i.e. divide by 64 rounding to nearest.
+ const __m256i thirtytwo = _mm256_set1_epi16(32);
+ sum = _mm256_add_epi16(sum, thirtytwo);
+ sum = _mm256_srai_epi16(sum, 6);
+ const __m256i zero = _mm256_setzero_si256();
+ __m256i row = _mm256_shuffle_epi8(sum, zero);  // zero control: byte 0 of each 128-bit lane fills that lane
+ row_store_64xh(&row, 16, dst, stride);  // helper not shown; presumably writes `row` to 16 rows
+}
+
void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above,
const uint8_t *left) {
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 56efc0e..380862d 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -277,6 +277,62 @@
dc_store_32xh(&row, 16, dst, stride);
}
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;  // unused: only `above` is read
+ __m128i sum_above = dc_sum_32(above);  // helper not shown; presumably sums 32 above pixels
+ const __m128i sixteen = _mm_set1_epi16((uint16_t)16);  // rounding bias: half of 32
+ sum_above = _mm_add_epi16(sum_above, sixteen);
+ sum_above = _mm_srai_epi16(sum_above, 5);  // (sum + 16) >> 5: divide by 32, round to nearest
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 now occupies both bytes of word 0
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);  // copy word 0 into the four low words
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // mirror low 64 bits into the high half
+ dc_store_32xh(&row, 64, dst, stride);  // helper not shown; presumably writes `row` to 64 rows
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;  // unused: only `above` is read
+ __m128i sum_above = dc_sum_64(above);  // helper not shown; presumably sums 64 above pixels
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias: half of 64
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);  // (sum + 32) >> 6: divide by 64, round to nearest
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 now occupies both bytes of word 0
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);  // copy word 0 into the four low words
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // mirror low 64 bits into the high half
+ dc_store_64xh(&row, 64, dst, stride);  // helper not shown; presumably writes `row` to 64 rows
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;  // unused: only `above` is read
+ __m128i sum_above = dc_sum_64(above);  // helper not shown; presumably sums 64 above pixels
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias: half of 64
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);  // (sum + 32) >> 6: divide by 64, round to nearest
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 now occupies both bytes of word 0
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);  // copy word 0 into the four low words
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // mirror low 64 bits into the high half
+ dc_store_64xh(&row, 32, dst, stride);  // helper not shown; presumably writes `row` to 32 rows
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ (void)left;  // unused: only `above` is read
+ __m128i sum_above = dc_sum_64(above);  // helper not shown; presumably sums 64 above pixels
+ const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias: half of 64
+ sum_above = _mm_add_epi16(sum_above, thirtytwo);
+ sum_above = _mm_srai_epi16(sum_above, 6);  // (sum + 32) >> 6: divide by 64, round to nearest
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 now occupies both bytes of word 0
+ sum_above = _mm_shufflelo_epi16(sum_above, 0);  // copy word 0 into the four low words
+ const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // mirror low 64 bits into the high half
+ dc_store_64xh(&row, 16, dst, stride);  // helper not shown; presumably writes `row` to 16 rows
+}
+
// -----------------------------------------------------------------------------
// DC_LEFT
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index cb1735e..12e8079 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -687,7 +687,8 @@
aom_dc_top_predictor_32x16_sse2,
aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_sse2, NULL,
+INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_sse2,
+ aom_dc_top_predictor_32x64_sse2,
aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL,
NULL)
#endif // HAVE_SSE2
@@ -715,7 +716,8 @@
aom_dc_top_predictor_32x16_avx2,
aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_avx2, NULL,
+INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_avx2,
+ aom_dc_top_predictor_32x64_avx2,
aom_dc_128_predictor_32x64_avx2, NULL, NULL,
aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
#endif // HAVE_AVX2
@@ -760,13 +762,16 @@
aom_smooth_h_predictor_64x16_c)
#if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_sse2, NULL,
+INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_sse2,
+ aom_dc_top_predictor_64x64_sse2,
aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL,
NULL)
-INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_sse2, NULL,
+INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_sse2,
+ aom_dc_top_predictor_64x32_sse2,
aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL,
NULL)
-INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_sse2, NULL,
+INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_sse2,
+ aom_dc_top_predictor_64x16_sse2,
aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL,
NULL)
#endif
@@ -781,13 +786,16 @@
#endif
#if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_avx2, NULL,
+INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_avx2,
+ aom_dc_top_predictor_64x64_avx2,
aom_dc_128_predictor_64x64_avx2, NULL, NULL,
aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_avx2, NULL,
+INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_avx2,
+ aom_dc_top_predictor_64x32_avx2,
aom_dc_128_predictor_64x32_avx2, NULL, NULL,
aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_avx2, NULL,
+INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_avx2,
+ aom_dc_top_predictor_64x16_avx2,
aom_dc_128_predictor_64x16_avx2, NULL, NULL,
aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
#endif