Add sse2/avx2 versions of aom_dc_top_predictor_wxh()

Covers the block sizes whose width or height equals 64:
32x64, 64x64, 64x32 and 64x16.

Change-Id: I8ad49240f7353130c354317ecddc8ba334d832ad
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 9437e13..021c4c9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -92,6 +92,10 @@
 specialize qw/aom_dc_top_predictor_16x32 sse2/;
 specialize qw/aom_dc_top_predictor_32x16 sse2 avx2/;
 specialize qw/aom_dc_top_predictor_32x32 msa neon sse2 avx2/;
+specialize qw/aom_dc_top_predictor_32x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x64 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x32 sse2 avx2/;
+specialize qw/aom_dc_top_predictor_64x16 sse2 avx2/;
 specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
 specialize qw/aom_dc_left_predictor_4x8 sse2/;
 specialize qw/aom_dc_left_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index c3d6829..89f0c5f 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -208,6 +208,62 @@
   row_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_32(above);  // presumably sums 32 above pixels
+  (void)left;                      // unused: only `above` is read
+  // Round to nearest: add half the divisor (16), then divide by 32 (>> 5).
+  const __m256i sixteen = _mm256_set1_epi16(16);
+  sum = _mm256_add_epi16(sum, sixteen);
+  sum = _mm256_srai_epi16(sum, 5);  // dc = (sum + 16) >> 5
+  const __m256i zero = _mm256_setzero_si256();  // all-zero shuffle mask
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // splat byte 0 per 128b lane
+  row_store_32xh(&row, 64, dst, stride);  // presumably writes 64 rows
+}
+
+void aom_dc_top_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);  // presumably sums 64 above pixels
+  (void)left;                      // unused: only `above` is read
+  // Round to nearest: add half the divisor (32), then divide by 64 (>> 6).
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);  // dc = (sum + 32) >> 6
+  const __m256i zero = _mm256_setzero_si256();  // all-zero shuffle mask
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // splat byte 0 per 128b lane
+  row_store_64xh(&row, 64, dst, stride);  // presumably writes 64 rows
+}
+
+void aom_dc_top_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);  // presumably sums 64 above pixels
+  (void)left;                      // unused: only `above` is read
+  // Round to nearest: add half the divisor (32), then divide by 64 (>> 6).
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);  // dc = (sum + 32) >> 6
+  const __m256i zero = _mm256_setzero_si256();  // all-zero shuffle mask
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // splat byte 0 per 128b lane
+  row_store_64xh(&row, 32, dst, stride);  // presumably writes 32 rows
+}
+
+void aom_dc_top_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  __m256i sum = dc_sum_64(above);  // presumably sums 64 above pixels
+  (void)left;                      // unused: only `above` is read
+  // Round to nearest: add half the divisor (32), then divide by 64 (>> 6).
+  const __m256i thirtytwo = _mm256_set1_epi16(32);
+  sum = _mm256_add_epi16(sum, thirtytwo);
+  sum = _mm256_srai_epi16(sum, 6);  // dc = (sum + 32) >> 6
+  const __m256i zero = _mm256_setzero_si256();  // all-zero shuffle mask
+  __m256i row = _mm256_shuffle_epi8(sum, zero);  // splat byte 0 per 128b lane
+  row_store_64xh(&row, 16, dst, stride);  // presumably writes 16 rows
+}
+
 void aom_dc_left_predictor_32x16_avx2(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 56efc0e..380862d 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -277,6 +277,62 @@
   dc_store_32xh(&row, 16, dst, stride);
 }
 
+void aom_dc_top_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused: only `above` is read
+  __m128i sum_above = dc_sum_32(above);  // presumably sums 32 above pixels
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);  // rounding bias
+  sum_above = _mm_add_epi16(sum_above, sixteen);
+  sum_above = _mm_srai_epi16(sum_above, 5);  // dc = (sum + 16) >> 5
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 -> bytes 0..1
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // word 0 -> bytes 0..7
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // -> all 16
+  dc_store_32xh(&row, 64, dst, stride);  // presumably writes 64 rows
+}
+
+void aom_dc_top_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused: only `above` is read
+  __m128i sum_above = dc_sum_64(above);  // presumably sums 64 above pixels
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);  // dc = (sum + 32) >> 6
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 -> bytes 0..1
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // word 0 -> bytes 0..7
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // -> all 16
+  dc_store_64xh(&row, 64, dst, stride);  // presumably writes 64 rows
+}
+
+void aom_dc_top_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused: only `above` is read
+  __m128i sum_above = dc_sum_64(above);  // presumably sums 64 above pixels
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);  // dc = (sum + 32) >> 6
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 -> bytes 0..1
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // word 0 -> bytes 0..7
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // -> all 16
+  dc_store_64xh(&row, 32, dst, stride);  // presumably writes 32 rows
+}
+
+void aom_dc_top_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused: only `above` is read
+  __m128i sum_above = dc_sum_64(above);  // presumably sums 64 above pixels
+  const __m128i thirtytwo = _mm_set1_epi16((uint16_t)32);  // rounding bias
+  sum_above = _mm_add_epi16(sum_above, thirtytwo);
+  sum_above = _mm_srai_epi16(sum_above, 6);  // dc = (sum + 32) >> 6
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // byte 0 -> bytes 0..1
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // word 0 -> bytes 0..7
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // -> all 16
+  dc_store_64xh(&row, 16, dst, stride);  // presumably writes 16 rows
+}
+
 // -----------------------------------------------------------------------------
 // DC_LEFT
 
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index cb1735e..12e8079 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -687,7 +687,8 @@
                 aom_dc_top_predictor_32x16_sse2,
                 aom_dc_128_predictor_32x16_sse2, aom_v_predictor_32x16_sse2,
                 aom_h_predictor_32x16_sse2, NULL, NULL, NULL, NULL)
-INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_sse2, NULL,
+INTRA_PRED_TEST(SSE2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_sse2,
+                aom_dc_top_predictor_32x64_sse2,
                 aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL)
 #endif  // HAVE_SSE2
@@ -715,7 +716,8 @@
                 aom_dc_top_predictor_32x16_avx2,
                 aom_dc_128_predictor_32x16_avx2, aom_v_predictor_32x16_avx2,
                 NULL, aom_paeth_predictor_32x16_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_avx2, NULL,
+INTRA_PRED_TEST(AVX2_3, TX_32X64, NULL, aom_dc_left_predictor_32x64_avx2,
+                aom_dc_top_predictor_32x64_avx2,
                 aom_dc_128_predictor_32x64_avx2, NULL, NULL,
                 aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
@@ -760,13 +762,16 @@
                 aom_smooth_h_predictor_64x16_c)
 
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_sse2, NULL,
+INTRA_PRED_TEST(SSE2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_sse2,
+                aom_dc_top_predictor_64x64_sse2,
                 aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL)
-INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_sse2, NULL,
+INTRA_PRED_TEST(SSE2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_sse2,
+                aom_dc_top_predictor_64x32_sse2,
                 aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL)
-INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_sse2, NULL,
+INTRA_PRED_TEST(SSE2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_sse2,
+                aom_dc_top_predictor_64x16_sse2,
                 aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL,
                 NULL)
 #endif
@@ -781,13 +786,16 @@
 #endif
 
 #if HAVE_AVX2
-INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_avx2, NULL,
+INTRA_PRED_TEST(AVX2_4, TX_64X64, NULL, aom_dc_left_predictor_64x64_avx2,
+                aom_dc_top_predictor_64x64_avx2,
                 aom_dc_128_predictor_64x64_avx2, NULL, NULL,
                 aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_avx2, NULL,
+INTRA_PRED_TEST(AVX2_5, TX_64X32, NULL, aom_dc_left_predictor_64x32_avx2,
+                aom_dc_top_predictor_64x32_avx2,
                 aom_dc_128_predictor_64x32_avx2, NULL, NULL,
                 aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
-INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_avx2, NULL,
+INTRA_PRED_TEST(AVX2_6, TX_64X16, NULL, aom_dc_left_predictor_64x16_avx2,
+                aom_dc_top_predictor_64x16_avx2,
                 aom_dc_128_predictor_64x16_avx2, NULL, NULL,
                 aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
 #endif