Add sse2/avx2 version of aom_dc_128_predictor_wxh()

for width or height equal to 64.

Change-Id: I48a03012cecd6a5c5d8751907d1a1feacdc1bcd0
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index fefa905..421524c 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -112,6 +112,10 @@
 specialize qw/aom_dc_128_predictor_16x32 sse2/;
 specialize qw/aom_dc_128_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_128_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_128_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_128_predictor_64x16 sse2 avx2/;
 specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 specialize qw/aom_v_predictor_4x8 sse2/;
 specialize qw/aom_v_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 9d901fc..8ebf3c9 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -32,6 +32,16 @@
   }
 }
 
+static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
+                                  ptrdiff_t stride) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    _mm256_storeu_si256((__m256i *)dst, *r);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r);
+    dst += stride;
+  }
+}
+
 void aom_dc_predictor_32x32_avx2(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
   const __m256i sum_above = dc_sum_32(above);
@@ -211,6 +221,42 @@
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m256i row = _mm256_set1_epi8((uint8_t)0x80);
+  row_store_64xh(&row, 16, dst, stride);
+}
+
 void aom_v_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
   const __m256i row = _mm256_loadu_si256((const __m256i *)above);
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 2a83b90..f962c8e 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -51,6 +51,18 @@
   }
 }
 
+static INLINE void dc_store_64xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  int i;
+  for (i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, *row);
+    _mm_store_si128((__m128i *)(dst + 16), *row);
+    _mm_store_si128((__m128i *)(dst + 32), *row);
+    _mm_store_si128((__m128i *)(dst + 48), *row);
+    dst += stride;
+  }
+}
+
 static INLINE __m128i dc_sum_4(const uint8_t *ref) {
   __m128i x = _mm_loadl_epi64((__m128i const *)ref);
   const __m128i zero = _mm_setzero_si128();
@@ -386,6 +398,42 @@
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_128_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_32xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 64, dst, stride);
+}
+
+void aom_dc_128_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 32, dst, stride);
+}
+
+void aom_dc_128_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);
+  dc_store_64xh(&row, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // V_PRED
 
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 904758c..cd41ea2 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -687,6 +687,9 @@
                 aom_dc_top_predictor_32x16_sse2,
                 aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
                 aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, NULL, NULL,
+                aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -712,7 +715,8 @@
                 aom_dc_top_predictor_32x16_avx2,
                 aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
                 NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, NULL, NULL,
+                aom_dc_128_predictor_32x64_avx2, NULL, NULL,
                 aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
 
@@ -755,6 +759,18 @@
                 aom_smooth_predictor_64x16_c, aom_smooth_v_predictor_64x16_c,
                 aom_smooth_h_predictor_64x16_c)
 
+#if HAVE_SSE2
+INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, NULL, NULL,
+                aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL)
+INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, NULL, NULL,
+                aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL)
+INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, NULL, NULL,
+                aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL)
+#endif
+
 #if HAVE_SSSE3
 INTRA_PRED_TEST(SSSE3_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_64x64_ssse3, NULL, NULL, NULL)
@@ -765,11 +781,14 @@
 #endif
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, NULL, NULL,
+                aom_dc_128_predictor_64x64_avx2, NULL, NULL,
                 aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, NULL, NULL,
+                aom_dc_128_predictor_64x32_avx2, NULL, NULL,
                 aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, NULL, NULL,
+                aom_dc_128_predictor_64x16_avx2, NULL, NULL,
                 aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
 #endif
 // -----------------------------------------------------------------------------