Add sse2/avx2 versions of aom_v_predictor_wxh()
for the block sizes with width or height equal to 64: 32x64, 64x64,
64x32 and 64x16.
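
The vertical predictor simply replays the row of pixels above the
block into every output row; the left column is unused. A scalar
sketch of that behavior (the helper name and signature below are
illustrative only, not part of this patch):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Copy the bw pixels above the block into each of its bh rows. */
    static void v_predictor_ref(uint8_t *dst, ptrdiff_t stride, int bw,
                                int bh, const uint8_t *above,
                                const uint8_t *left) {
      (void)left; /* V_PRED never reads the left column. */
      for (int r = 0; r < bh; ++r) {
        memcpy(dst, above, bw);
        dst += stride;
      }
      /* e.g. a 64x32 block: v_predictor_ref(dst, stride, 64, 32, above, left); */
    }

The new kernels keep that row in registers instead: one or two 256-bit
loads per block (avx2), or two or four 128-bit loads (sse2), stored
once per output row.
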
Change-Id: I6981b212bfc24cf17fd4b21bf9fd4d16a5249751
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ea40dcc..14f1a63 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -134,6 +134,10 @@
specialize qw/aom_v_predictor_16x32 sse2/;
specialize qw/aom_v_predictor_32x16 sse2 avx2/;
specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 sse2 avx2/;
specialize qw/aom_h_predictor_4x8 sse2/;
specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
specialize qw/aom_h_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 4ec3cca..5c852ac 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -44,6 +44,16 @@
}
}
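+// Store a 64-byte row held in two 256-bit registers (r0 = bytes 0..31,
+// r1 = bytes 32..63) into each of `height` rows of dst.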
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+ int height, uint8_t *dst,
+ ptrdiff_t stride) {
+ for (int i = 0; i < height; ++i) {
+ _mm256_storeu_si256((__m256i *)dst, *r0);
+ _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+ dst += stride;
+ }
+}
+
static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
ptrdiff_t stride) {
for (int i = 0; i < height; ++i) {
@@ -435,6 +445,37 @@
row_store_32xh(&row, 16, dst, stride);
}
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row = _mm256_loadu_si256((const __m256i *)above);
+ (void)left;
+ row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 64, dst, stride);
+}
+
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 32, dst, stride);
+}
+
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m256i row0 = _mm256_loadu_si256((const __m256i *)above);
+ const __m256i row1 = _mm256_loadu_si256((const __m256i *)(above + 32));
+ (void)left;
+ row_store_32x2xh(&row0, &row1, 16, dst, stride);
+}
+
// -----------------------------------------------------------------------------
// PAETH_PRED
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 387d0a2..c827464 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -665,6 +665,66 @@
}
}
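+// Vertical prediction for block sizes with width or height 64: the row of
+// pixels above the block is loaded once and stored into every output row.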
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ (void)left;
+ for (int i = 0; i < 64; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+ const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+ (void)left;
+ for (int i = 0; i < 64; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ _mm_store_si128((__m128i *)(dst + 32), row2);
+ _mm_store_si128((__m128i *)(dst + 48), row3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+ const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+ (void)left;
+ for (int i = 0; i < 32; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ _mm_store_si128((__m128i *)(dst + 32), row2);
+ _mm_store_si128((__m128i *)(dst + 48), row3);
+ dst += stride;
+ }
+}
+
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ const __m128i row0 = _mm_load_si128((__m128i const *)above);
+ const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+ const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+ const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+ (void)left;
+ for (int i = 0; i < 16; ++i) {
+ _mm_store_si128((__m128i *)dst, row0);
+ _mm_store_si128((__m128i *)(dst + 16), row1);
+ _mm_store_si128((__m128i *)(dst + 32), row2);
+ _mm_store_si128((__m128i *)(dst + 48), row3);
+ dst += stride;
+ }
+}
+
// -----------------------------------------------------------------------------
// H_PRED
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index dd4a839..9a27525 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -690,8 +690,8 @@
INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2,
aom_dc_left_predictor_32x64_sse2,
aom_dc_top_predictor_32x64_sse2,
- aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL,
- NULL)
+ aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
+ NULL, NULL, NULL, NULL, NULL)
#endif // HAVE_SSE2

#if HAVE_SSSE3
@@ -720,8 +720,8 @@
INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2,
aom_dc_left_predictor_32x64_avx2,
aom_dc_top_predictor_32x64_avx2,
- aom_dc_128_predictor_32x64_avx2, NULL, NULL,
- aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
+ aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
+ NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
#endif // HAVE_AVX2

#if HAVE_NEON
@@ -767,18 +767,18 @@
INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
aom_dc_left_predictor_64x64_sse2,
aom_dc_top_predictor_64x64_sse2,
- aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL,
- NULL)
+ aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
+ NULL, NULL, NULL, NULL, NULL)
INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2,
aom_dc_left_predictor_64x32_sse2,
aom_dc_top_predictor_64x32_sse2,
- aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL,
- NULL)
+ aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
+ NULL, NULL, NULL, NULL, NULL)
INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
aom_dc_left_predictor_64x16_sse2,
aom_dc_top_predictor_64x16_sse2,
- aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL,
- NULL)
+ aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
+ NULL, NULL, NULL, NULL, NULL)
#endif

#if HAVE_SSSE3
@@ -794,18 +794,18 @@
INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
aom_dc_left_predictor_64x64_avx2,
aom_dc_top_predictor_64x64_avx2,
- aom_dc_128_predictor_64x64_avx2, NULL, NULL,
- aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
+ aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
+ NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2,
aom_dc_left_predictor_64x32_avx2,
aom_dc_top_predictor_64x32_avx2,
- aom_dc_128_predictor_64x32_avx2, NULL, NULL,
- aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
+ aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
+ NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
aom_dc_left_predictor_64x16_avx2,
aom_dc_top_predictor_64x16_avx2,
- aom_dc_128_predictor_64x16_avx2, NULL, NULL,
- aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
+ aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
+ NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
#endif
// -----------------------------------------------------------------------------
// High Bitdepth