Add SSE2/AVX2 versions of aom_dc_top_predictor_wxh() for block sizes whose width or height equals 64. Change-Id: I8ad49240f7353130c354317ecddc8ba334d832ad
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl index 9437e13..021c4c9 100755 --- a/aom_dsp/aom_dsp_rtcd_defs.pl +++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -92,6 +92,10 @@ specialize qw/aom_dc_top_predictor_16x32 sse2/; specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/; specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/; +specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/; +specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/; specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/; specialize qw/aom_dc_left_predictor_4x8 sse2/; specialize qw/aom_dc_left_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c index c3d6829..89f0c5f 100644 --- a/aom_dsp/x86/intrapred_avx2.c +++ b/aom_dsp/x86/intrapred_avx2.c
@@ -208,6 +208,62 @@ row_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_32(above); + (void)left; + + const __m256i sixteen = _mm256_set1_epi16(16); + sum = _mm256_add_epi16(sum, sixteen); + sum = _mm256_srai_epi16(sum, 5); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + __m256i sum = dc_sum_64(above); + (void)left; + + const __m256i thirtytwo = _mm256_set1_epi16(32); + sum = _mm256_add_epi16(sum, thirtytwo); + sum = _mm256_srai_epi16(sum, 6); + const __m256i zero = _mm256_setzero_si256(); + __m256i row = _mm256_shuffle_epi8(sum, zero); + row_store_64xh(&row, 16, dst, stride); +} + void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left) {
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c index 56efc0e..380862d 100644 --- a/aom_dsp/x86/intrapred_sse2.c +++ b/aom_dsp/x86/intrapred_sse2.c
@@ -277,6 +277,62 @@ dc_store_32xh(&row, 16, dst, stride); } +void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_32(above); + const __m128i sixteen = _mm_set1_epi16((uint16_t)16); + sum_above = _mm_add_epi16(sum_above, sixteen); + sum_above = _mm_srai_epi16(sum_above, 5); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_32xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 64, dst, stride); +} + +void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); + sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 32, dst, stride); +} + +void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride, + const uint8_t *above, + const uint8_t *left) { + (void)left; + __m128i sum_above = dc_sum_64(above); + const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32); + sum_above = _mm_add_epi16(sum_above, thirtytwo); + sum_above = _mm_srai_epi16(sum_above, 6); 
+ sum_above = _mm_unpacklo_epi8(sum_above, sum_above); + sum_above = _mm_shufflelo_epi16(sum_above, 0); + const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above); + dc_store_64xh(&row, 16, dst, stride); +} + // ----------------------------------------------------------------------------- // DC_LEFT
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index cb1735e..12e8079 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc
@@ -687,7 +687,8 @@ aom_dc_top_predictor_32x16_sse2, aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2, aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL) -INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_sse2, NULL, +INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_sse2, + aom_dc_top_predictor_32x64_sse2, aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL, NULL) #endif // HAVE_SSE2 @@ -715,7 +716,8 @@ aom_dc_top_predictor_32x16_avx2, aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2, NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL) -INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_avx2, NULL, +INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_avx2, + aom_dc_top_predictor_32x64_avx2, aom_dc_128_predictor_32x64_avx2, NULL, NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL) #endif // HAVE_AVX2 @@ -760,13 +762,16 @@ aom_smooth_h_predictor_64x16_c) #if HAVE_SSE2 -INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_sse2, NULL, +INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_sse2, + aom_dc_top_predictor_64x64_sse2, aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL, NULL) -INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_sse2, NULL, +INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_sse2, + aom_dc_top_predictor_64x32_sse2, aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL) -INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_sse2, NULL, +INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_sse2, + aom_dc_top_predictor_64x16_sse2, aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL) #endif @@ -781,13 +786,16 @@ #endif #if HAVE_AVX2 -INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_avx2, NULL, +INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_avx2, + 
aom_dc_top_predictor_64x64_avx2, aom_dc_128_predictor_64x64_avx2, NULL, NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL) -INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_avx2, NULL, +INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_avx2, + aom_dc_top_predictor_64x32_avx2, aom_dc_128_predictor_64x32_avx2, NULL, NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL) -INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_avx2, NULL, +INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_avx2, + aom_dc_top_predictor_64x16_avx2, aom_dc_128_predictor_64x16_avx2, NULL, NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL) #endif