Add sse2/avx2 versions of aom_dc_128_predictor_wxh()
for block sizes whose width or height equals 64 (32x64, 64x64, 64x32, 64x16).
Change-Id: I48a03012cecd6a5c5d8751907d1a1feacdc1bcd0
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index fefa905..421524c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -112,6 +112,10 @@
specialize qw/aom_dc_128_predictor_16x32 sse2/;
specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;  # dc_128 sizes with a 64 dimension:
+specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;  # each is listed with both an sse2
+specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;  # and an avx2 specialization.
+specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
specialize qw/aom_v_predictor_4x4 neon msa sse2/;
specialize qw/aom_v_predictor_4x8 sse2/;
specialize qw/aom_v_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 9d901fc..8ebf3c9 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -32,6 +32,16 @@
}
}
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+ ptrdiff_t stride) {  // Write *r to all 64 bytes of each of `height` rows at dst.
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r);  // bytes 0..31 (unaligned store)
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r);  // bytes 32..63
+ dst += stride;  // step to the next row
+ }
+}
+
void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const __m256i sum_above = dc_sum_32(above);
@@ -211,6 +221,42 @@
row_store_32xh(&row, 16, dst, stride);
}
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 32x64: hand a constant 0x80 row to row_store_32xh.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 lanes of 0x80
+ row_store_32xh(&row, 64, dst, stride);  // height 64; helper body is outside this hunk
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 64x64: fill 64 rows of 64 bytes with 0x80.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 lanes of 0x80
+ row_store_64xh(&row, 64, dst, stride);  // two 32-byte stores per row, 64 rows
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 64x32: fill 32 rows of 64 bytes with 0x80.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 lanes of 0x80
+ row_store_64xh(&row, 32, dst, stride);  // two 32-byte stores per row, 32 rows
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 64x16: fill 16 rows of 64 bytes with 0x80.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m256i row = _mm256_set1_epi8((uint8_t)0x80);  // 32 lanes of 0x80
+ row_store_64xh(&row, 16, dst, stride);  // two 32-byte stores per row, 16 rows
+}
+
void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
const uint8_t *above, const uint8_t *left) {
const __m256i row = _mm256_loadu_si256((const __m256i *)above);
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 2a83b90..f962c8e 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -51,6 +51,18 @@
}
}
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+ ptrdiff_t stride) {  // Write *row to all 64 bytes of each of `height` rows at dst.
+ int i;
+ for (i = 0; i < height; ++i) {
+ _mm_store_si128((__m128i *)dst, *row);  // aligned store: every row start must be 16-byte aligned
+ _mm_store_si128((__m128i *)(dst + 16), *row);  // bytes 16..31
+ _mm_store_si128((__m128i *)(dst + 32), *row);  // bytes 32..47
+ _mm_store_si128((__m128i *)(dst + 48), *row);  // bytes 48..63
+ dst += stride;  // step to the next row
+ }
+}
+
static INLINE __m128i dc_sum_4(const uint8_t *ref) {
__m128i x = _mm_loadl_epi64((__m128i const *)ref);
const __m128i zero = _mm_setzero_si128();
@@ -386,6 +398,42 @@
dc_store_32xh(&row, 16, dst, stride);
}
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 32x64: hand a constant 128 row to dc_store_32xh.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);  // 16 lanes of 128 (0x80)
+ dc_store_32xh(&row, 64, dst, stride);  // height 64; helper body is outside this hunk
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 64x64: fill 64 rows of 64 bytes with 128.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);  // 16 lanes of 128 (0x80)
+ dc_store_64xh(&row, 64, dst, stride);  // four 16-byte stores per row, 64 rows
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 64x32: fill 32 rows of 64 bytes with 128.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);  // 16 lanes of 128 (0x80)
+ dc_store_64xh(&row, 32, dst, stride);  // four 16-byte stores per row, 32 rows
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {  // DC_128 64x16: fill 16 rows of 64 bytes with 128.
+ (void)above;  // neighboring pixels are not read
+ (void)left;
+ const __m128i row = _mm_set1_epi8((uint8_t)128);  // 16 lanes of 128 (0x80)
+ dc_store_64xh(&row, 16, dst, stride);  // four 16-byte stores per row, 16 rows
+}
+
// -----------------------------------------------------------------------------
// V_PRED
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 904758c..cd41ea2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -687,6 +687,9 @@
aom_dc_top_predictor_32x16_sse2,
aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, NULL, NULL,
+ aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL,  // the only non-NULL entry
+ NULL)
#endif // HAVE_SSE2
#if HAVE_SSSE3
@@ -712,7 +715,8 @@
aom_dc_top_predictor_32x16_avx2,
aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, NULL, NULL,
+ aom_dc_128_predictor_32x64_avx2, NULL, NULL,  // replaces the NULL previously in this position
aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
#endif // HAVE_AVX2
@@ -755,6 +759,18 @@
aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
aom_smooth_h_predictor_64x16_c)
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, NULL, NULL,
+ aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL,  // the only non-NULL entry
+ NULL)
+INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, NULL, NULL,
+ aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL,  // the only non-NULL entry
+ NULL)
+INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, NULL, NULL,
+ aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL,  // the only non-NULL entry
+ NULL)
+#endif  // HAVE_SSE2
+
#if HAVE_SSSE3
INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
aom_paeth_predictor_64x64_ssse3, NULL, NULL, NULL)
@@ -765,11 +781,14 @@
#endif
#if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, NULL, NULL,
+ aom_dc_128_predictor_64x64_avx2, NULL, NULL,  // replaces the NULL previously in this position
aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, NULL, NULL,
+ aom_dc_128_predictor_64x32_avx2, NULL, NULL,  // replaces the NULL previously in this position
aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, NULL, NULL,
+ aom_dc_128_predictor_64x16_avx2, NULL, NULL,  // replaces the NULL previously in this position
aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
#endif
// -----------------------------------------------------------------------------