Add sse2/avx2 versions of aom_v_predictor_wxh()

Covers the block sizes whose width or height equals 64: 32x64, 64x64,
64x32 and 64x16.

Change-Id: I6981b212bfc24cf17fd4b21bf9fd4d16a5249751
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index ea40dcc..14f1a63 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -134,6 +134,10 @@
 specialize qw/aom_v_predictor_16x32 sse2/;
 specialize qw/aom_v_predictor_32x16 sse2 avx2/;
 specialize qw/aom_v_predictor_32x32 neon msa sse2 avx2/;
+specialize qw/aom_v_predictor_32x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x64 sse2 avx2/;
+specialize qw/aom_v_predictor_64x32 sse2 avx2/;
+specialize qw/aom_v_predictor_64x16 sse2 avx2/;
 specialize qw/aom_h_predictor_4x8 sse2/;
 specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 specialize qw/aom_h_predictor_8x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 4ec3cca..5c852ac 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -44,6 +44,16 @@
   }
 }
 
+// Stores *r0 at dst and *r1 at dst + 32 for `height` rows (unaligned stores).
+static INLINE void row_store_32x2xh(const __m256i *r0, const __m256i *r1,
+                                    int height, uint8_t *dst,
+                                    ptrdiff_t stride) {
+  for (; height > 0; --height, dst += stride) {
+    _mm256_storeu_si256((__m256i *)dst, *r0);
+    _mm256_storeu_si256((__m256i *)(dst + 32), *r1);
+  }
+}
+
 static INLINE void row_store_64xh(const __m256i *r, int height, uint8_t *dst,
                                   ptrdiff_t stride) {
   for (int i = 0; i < height; ++i) {
@@ -435,6 +445,43 @@
   row_store_32xh(&row, 16, dst, stride);
 }
 
+// V_PRED, 32x64: loads the 32 bytes at `above` (unaligned load) and passes
+// them to row_store_32xh() with a height of 64. `left` is unused.
+void aom_v_predictor_32x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i top = _mm256_loadu_si256((const __m256i *)above);
+  (void)left;
+  row_store_32xh(&top, 64, dst, stride);
+}
+
+// V_PRED, 64x64: loads the 64 bytes at `above` as two 32-byte halves and has
+// row_store_32x2xh() write that pair to each of the 64 rows. `left` is unused.
+void aom_v_predictor_64x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i top_lo = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&top_lo, &top_hi, 64, dst, stride);
+}
+
+// V_PRED, 64x32: same as the 64x64 case, but only 32 rows are written.
+void aom_v_predictor_64x32_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i top_lo = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&top_lo, &top_hi, 32, dst, stride);
+}
+
+// V_PRED, 64x16: same as the 64x64 case, but only 16 rows are written.
+void aom_v_predictor_64x16_avx2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m256i top_lo = _mm256_loadu_si256((const __m256i *)above);
+  const __m256i top_hi = _mm256_loadu_si256((const __m256i *)(above + 32));
+  (void)left;
+  row_store_32x2xh(&top_lo, &top_hi, 16, dst, stride);
+}
+
 // -----------------------------------------------------------------------------
 // PAETH_PRED
 
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
index 387d0a2..c827464 100644
--- a/aom_dsp/x86/intrapred_sse2.c
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -665,6 +665,66 @@
   }
 }
 
+// V_PRED, 32x64: every output row is a copy of the 32 bytes at `above`;
+// `left` is unused. The 32 bytes are loaded once, before the row loop. The
+// aligned load/store intrinsics used here require `above` and every row of
+// `dst` to be 16-byte aligned.
+void aom_v_predictor_32x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i row0 = _mm_load_si128((__m128i const *)above);
+  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  (void)left;
+  for (int i = 0; i < 64; ++i) {
+    _mm_store_si128((__m128i *)dst, row0);
+    _mm_store_si128((__m128i *)(dst + 16), row1);
+    dst += stride;
+  }
+}
+
+// Shared body of the 64-wide V_PRED functions below, which differ only in
+// their row count. Loads the 64 bytes at `above` once as four 16-byte
+// vectors, then writes them to `height` consecutive rows of `dst`, advancing
+// `dst` by `stride` bytes per row. The aligned load/store intrinsics used
+// here require `above` and every row of `dst` to be 16-byte aligned.
+static INLINE void v_predictor_64xh(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, int height) {
+  const __m128i row0 = _mm_load_si128((__m128i const *)above);
+  const __m128i row1 = _mm_load_si128((__m128i const *)(above + 16));
+  const __m128i row2 = _mm_load_si128((__m128i const *)(above + 32));
+  const __m128i row3 = _mm_load_si128((__m128i const *)(above + 48));
+  for (int i = 0; i < height; ++i) {
+    _mm_store_si128((__m128i *)dst, row0);
+    _mm_store_si128((__m128i *)(dst + 16), row1);
+    _mm_store_si128((__m128i *)(dst + 32), row2);
+    _mm_store_si128((__m128i *)(dst + 48), row3);
+    dst += stride;
+  }
+}
+
+// V_PRED, 64x64: copies the 64 bytes at `above` into each of the 64 rows
+// of `dst`. `left` is unused.
+void aom_v_predictor_64x64_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 64);
+}
+
+// V_PRED, 64x32: copies the 64 bytes at `above` into each of the 32 rows
+// of `dst`. `left` is unused.
+void aom_v_predictor_64x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 32);
+}
+
+// V_PRED, 64x16: copies the 64 bytes at `above` into each of the 16 rows
+// of `dst`. `left` is unused.
+void aom_v_predictor_64x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  (void)left;
+  v_predictor_64xh(dst, stride, above, 16);
+}
+
 // -----------------------------------------------------------------------------
 // H_PRED
 
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index dd4a839..9a27525 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -690,8 +690,8 @@
 INTRA_PRED_TEST(SSE2_3, TX_32X64, aom_dc_predictor_32x64_sse2,
                 aom_dc_left_predictor_32x64_sse2,
                 aom_dc_top_predictor_32x64_sse2,
-                aom_dc_128_predictor_32x64_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL)
+                aom_dc_128_predictor_32x64_sse2, aom_v_predictor_32x64_sse2,
+                NULL, NULL, NULL, NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -720,8 +720,8 @@
 INTRA_PRED_TEST(AVX2_3, TX_32X64, aom_dc_predictor_32x64_avx2,
                 aom_dc_left_predictor_32x64_avx2,
                 aom_dc_top_predictor_32x64_avx2,
-                aom_dc_128_predictor_32x64_avx2, NULL, NULL,
-                aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
+                aom_dc_128_predictor_32x64_avx2, aom_v_predictor_32x64_avx2,
+                NULL, aom_paeth_predictor_32x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
 
 #if HAVE_NEON
@@ -767,18 +767,18 @@
 INTRA_PRED_TEST(SSE2_4, TX_64X64, aom_dc_predictor_64x64_sse2,
                 aom_dc_left_predictor_64x64_sse2,
                 aom_dc_top_predictor_64x64_sse2,
-                aom_dc_128_predictor_64x64_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL)
+                aom_dc_128_predictor_64x64_sse2, aom_v_predictor_64x64_sse2,
+                NULL, NULL, NULL, NULL, NULL)
 INTRA_PRED_TEST(SSE2_5, TX_64X32, aom_dc_predictor_64x32_sse2,
                 aom_dc_left_predictor_64x32_sse2,
                 aom_dc_top_predictor_64x32_sse2,
-                aom_dc_128_predictor_64x32_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL)
+                aom_dc_128_predictor_64x32_sse2, aom_v_predictor_64x32_sse2,
+                NULL, NULL, NULL, NULL, NULL)
 INTRA_PRED_TEST(SSE2_6, TX_64X16, aom_dc_predictor_64x16_sse2,
                 aom_dc_left_predictor_64x16_sse2,
                 aom_dc_top_predictor_64x16_sse2,
-                aom_dc_128_predictor_64x16_sse2, NULL, NULL, NULL, NULL, NULL,
-                NULL)
+                aom_dc_128_predictor_64x16_sse2, aom_v_predictor_64x16_sse2,
+                NULL, NULL, NULL, NULL, NULL)
 #endif
 
 #if HAVE_SSSE3
@@ -794,18 +794,18 @@
 INTRA_PRED_TEST(AVX2_4, TX_64X64, aom_dc_predictor_64x64_avx2,
                 aom_dc_left_predictor_64x64_avx2,
                 aom_dc_top_predictor_64x64_avx2,
-                aom_dc_128_predictor_64x64_avx2, NULL, NULL,
-                aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
+                aom_dc_128_predictor_64x64_avx2, aom_v_predictor_64x64_avx2,
+                NULL, aom_paeth_predictor_64x64_avx2, NULL, NULL, NULL)
 INTRA_PRED_TEST(AVX2_5, TX_64X32, aom_dc_predictor_64x32_avx2,
                 aom_dc_left_predictor_64x32_avx2,
                 aom_dc_top_predictor_64x32_avx2,
-                aom_dc_128_predictor_64x32_avx2, NULL, NULL,
-                aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
+                aom_dc_128_predictor_64x32_avx2, aom_v_predictor_64x32_avx2,
+                NULL, aom_paeth_predictor_64x32_avx2, NULL, NULL, NULL)
 INTRA_PRED_TEST(AVX2_6, TX_64X16, aom_dc_predictor_64x16_avx2,
                 aom_dc_left_predictor_64x16_avx2,
                 aom_dc_top_predictor_64x16_avx2,
-                aom_dc_128_predictor_64x16_avx2, NULL, NULL,
-                aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
+                aom_dc_128_predictor_64x16_avx2, aom_v_predictor_64x16_avx2,
+                NULL, aom_paeth_predictor_64x16_avx2, NULL, NULL, NULL)
 #endif
 // -----------------------------------------------------------------------------
 // High Bitdepth