Lowbd rect intrapred DC/LEFT/TOP/128 sse2 optimization

Add low-bitdepth (lowbd) unit tests to intrapred_test.cc.
Speedup of each function over the C version (i7-6700):
Predictor   DC     LEFT   TOP    128
4x8        ~1.4x  ~1.4x  ~1.7x  ~1.9x
8x4        ~1.2x  ~1.6x  ~1.6x  ~2.6x
8x16       ~1.4x  ~1.3x  ~1.4x  ~2.1x
16x8       ~2.0x  ~1.8x  ~2.3x  ~2.1x
16x32      ~2.0x  ~1.9x  ~1.8x  ~2.2x
32x16      ~2.0x  ~2.0x  ~1.9x  ~2.2x

Change-Id: I33db512020ca3c6853a9205a8079f3d00134f584
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 0da4392..89f294b 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -45,6 +45,7 @@
 set(AOM_DSP_COMMON_INTRIN_SSE2
     "${AOM_ROOT}/aom_dsp/x86/aom_asm_stubs.c"
     "${AOM_ROOT}/aom_dsp/x86/convolve.h"
+    "${AOM_ROOT}/aom_dsp/x86/intrapred_sse2.c"
     "${AOM_ROOT}/aom_dsp/x86/txfm_common_sse2.h"
     "${AOM_ROOT}/aom_dsp/x86/lpf_common_sse2.h"
     "${AOM_ROOT}/aom_dsp/x86/loopfilter_sse2.c")
diff --git a/aom_dsp/aom_dsp.mk b/aom_dsp/aom_dsp.mk
index aebeef7..8898e10 100644
--- a/aom_dsp/aom_dsp.mk
+++ b/aom_dsp/aom_dsp.mk
@@ -75,6 +75,8 @@
 DSP_SRCS-$(HAVE_SSSE3) += x86/intrapred_ssse3.asm
 DSP_SRCS-$(HAVE_SSSE3) += x86/aom_subpixel_8t_ssse3.asm
 
+DSP_SRCS-$(HAVE_SSE2) += x86/intrapred_sse2.c
+
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE)  += x86/highbd_intrapred_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a052fed..0d8a1c9 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -83,36 +83,61 @@
   }
 }
 
+specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_top_predictor_4x8 sse2/;
+specialize qw/aom_dc_top_predictor_8x4 sse2/;
+specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_8x16 sse2/;
+specialize qw/aom_dc_top_predictor_16x8 sse2/;
+specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_top_predictor_16x32 sse2/;
+specialize qw/aom_dc_top_predictor_32x16 sse2/;
+specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_left_predictor_4x8 sse2/;
+specialize qw/aom_dc_left_predictor_8x4 sse2/;
+specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_8x16 sse2/;
+specialize qw/aom_dc_left_predictor_16x8 sse2/;
+specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_left_predictor_16x32 sse2/;
+specialize qw/aom_dc_left_predictor_32x16 sse2/;
+specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_128_predictor_4x8 sse2/;
+specialize qw/aom_dc_128_predictor_8x4 sse2/;
+specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_8x16 sse2/;
+specialize qw/aom_dc_128_predictor_16x8 sse2/;
+specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_128_predictor_16x32 sse2/;
+specialize qw/aom_dc_128_predictor_32x16 sse2/;
+specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
+
 specialize qw/aom_d63e_predictor_4x4 ssse3/;
 specialize qw/aom_h_predictor_4x4 neon dspr2 msa sse2/;
 specialize qw/aom_d135_predictor_4x4 neon/;
 specialize qw/aom_d153_predictor_4x4 ssse3/;
 specialize qw/aom_v_predictor_4x4 neon msa sse2/;
 specialize qw/aom_dc_predictor_4x4 dspr2 msa neon sse2/;
-specialize qw/aom_dc_top_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_left_predictor_4x4 msa neon sse2/;
-specialize qw/aom_dc_128_predictor_4x4 msa neon sse2/;
+specialize qw/aom_dc_predictor_4x8 sse2/;
 specialize qw/aom_h_predictor_8x8 neon dspr2 msa sse2/;
 specialize qw/aom_d153_predictor_8x8 ssse3/;
 specialize qw/aom_v_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_predictor_8x4 sse2/;
 specialize qw/aom_dc_predictor_8x8 dspr2 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_8x8 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_8x8 neon msa sse2/;
+specialize qw/aom_dc_predictor_8x16 sse2/;
 specialize qw/aom_h_predictor_16x16 neon dspr2 msa sse2/;
 specialize qw/aom_d153_predictor_16x16 ssse3/;
 specialize qw/aom_v_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_predictor_16x8 sse2/;
 specialize qw/aom_dc_predictor_16x16 dspr2 neon msa sse2/;
-specialize qw/aom_dc_top_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_left_predictor_16x16 neon msa sse2/;
-specialize qw/aom_dc_128_predictor_16x16 neon msa sse2/;
+specialize qw/aom_dc_predictor_16x32 sse2/;
 specialize qw/aom_h_predictor_32x32 neon msa sse2/;
 specialize qw/aom_d153_predictor_32x32 ssse3/;
 specialize qw/aom_v_predictor_32x32 neon msa sse2/;
+specialize qw/aom_dc_predictor_32x16 sse2/;
 specialize qw/aom_dc_predictor_32x32 msa neon sse2/;
-specialize qw/aom_dc_top_predictor_32x32 msa neon sse2/;
-specialize qw/aom_dc_left_predictor_32x32 msa neon sse2/;
-specialize qw/aom_dc_128_predictor_32x32 msa neon sse2/;
 
 if (aom_config("CONFIG_HIGHBITDEPTH") eq "yes") {
   specialize qw/aom_highbd_v_predictor_4x4 sse2/;
diff --git a/aom_dsp/x86/intrapred_sse2.c b/aom_dsp/x86/intrapred_sse2.c
new file mode 100644
index 0000000..00716bd
--- /dev/null
+++ b/aom_dsp/x86/intrapred_sse2.c
@@ -0,0 +1,387 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./aom_dsp_rtcd.h"
+
+static INLINE void dc_store_4x8(uint32_t dc, uint8_t *dst, ptrdiff_t stride) {
+  int i;
+  for (i = 0; i < 4; ++i) {  // two rows per iteration: 4 x 2 = 8 rows
+    *(uint32_t *)dst = dc;  // the 4 bytes of dc form one 4-pixel row
+    dst += stride;
+    *(uint32_t *)dst = dc;
+    dst += stride;
+  }
+}
+
+static INLINE void dc_store_8xh(const __m128i *row, int height, uint8_t *dst,
+                                ptrdiff_t stride) {
+  int i;
+  for (i = 0; i < height; ++i) {  // the same 8 bytes go to each of height rows
+    _mm_storel_epi64((__m128i *)dst, *row);  // writes the low 64 bits of *row
+    dst += stride;
+  }
+}
+
+static INLINE void dc_store_16xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  int i;
+  for (i = 0; i < height; ++i) {  // the same 16 bytes go to each of height rows
+    _mm_store_si128((__m128i *)dst, *row);  // aligned store: dst % 16 must be 0
+    dst += stride;  // so stride must also keep dst 16-byte aligned
+  }
+}
+
+static INLINE void dc_store_32xh(const __m128i *row, int height, uint8_t *dst,
+                                 ptrdiff_t stride) {
+  int i;
+  for (i = 0; i < height; ++i) {  // each row is *row written twice (32 bytes)
+    _mm_store_si128((__m128i *)dst, *row);  // aligned store: dst % 16 must be 0
+    _mm_store_si128((__m128i *)(dst + 16), *row);
+    dst += stride;  // so stride must also keep dst 16-byte aligned
+  }
+}
+
+static INLINE __m128i dc_sum_4(const uint8_t *ref) {
+  __m128i x = _mm_loadl_epi64((__m128i const *)ref);  // note: reads 8 bytes
+  const __m128i zero = _mm_setzero_si128();
+  x = _mm_unpacklo_epi8(x, zero);  // widen to 16 bit; ref[0..3] fill the low half
+  return _mm_sad_epu8(x, zero);  // low 64-bit half = ref[0] + ... + ref[3]
+}
+
+static INLINE __m128i dc_sum_8(const uint8_t *ref) {
+  __m128i x = _mm_loadl_epi64((__m128i const *)ref);  // 8 bytes, upper half zero
+  const __m128i zero = _mm_setzero_si128();
+  return _mm_sad_epu8(x, zero);  // low 64-bit half = ref[0] + ... + ref[7]
+}
+
+static INLINE __m128i dc_sum_16(const uint8_t *ref) {
+  __m128i x = _mm_load_si128((__m128i const *)ref);  // aligned 16-byte load
+  const __m128i zero = _mm_setzero_si128();
+  x = _mm_sad_epu8(x, zero);  // one byte sum per 64-bit half
+  const __m128i high = _mm_unpackhi_epi64(x, x);  // upper-half sum, moved low
+  return _mm_add_epi16(x, high);  // low 16 bits = sum of ref[0..15]
+}
+
+static INLINE __m128i dc_sum_32(const uint8_t *ref) {
+  __m128i x0 = _mm_load_si128((__m128i const *)ref);  // aligned: ref[0..15]
+  __m128i x1 = _mm_load_si128((__m128i const *)(ref + 16));  // ref[16..31]
+  const __m128i zero = _mm_setzero_si128();
+  x0 = _mm_sad_epu8(x0, zero);  // one byte sum per 64-bit half
+  x1 = _mm_sad_epu8(x1, zero);
+  x0 = _mm_add_epi16(x0, x1);  // combine the two 16-byte groups half by half
+  const __m128i high = _mm_unpackhi_epi64(x0, x0);  // upper-half sum, moved low
+  return _mm_add_epi16(x0, high);  // low 16 bits = sum of ref[0..31]
+}
+
+// -----------------------------------------------------------------------------
+// DC_PRED
+
+void aom_dc_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_8(left);
+  __m128i sum_above = dc_sum_4(above);
+  sum_above = _mm_add_epi16(sum_left, sum_above);  // 4 above + 8 left pixels
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 6;  // half of the divisor: round to nearest
+  sum /= 12;  // 4 + 8 edge pixels
+
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+  const uint32_t pred = _mm_cvtsi128_si32(row);  // low 4 bytes = one 4-pixel row
+  dc_store_4x8(pred, dst, stride);
+}
+
+void aom_dc_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                               const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_4(left);  // dc_sum_4 loads 8 bytes from left
+  __m128i sum_above = dc_sum_8(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // 8 above + 4 left pixels
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 6;  // half of the divisor: round to nearest
+  sum /= 12;  // 8 + 4 edge pixels
+
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+  dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_16(left);  // aligned load: left % 16 == 0
+  __m128i sum_above = dc_sum_8(above);
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // 8 above + 16 left pixels
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 12;  // half of the divisor: round to nearest
+  sum /= 24;  // 8 + 16 edge pixels
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+  dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_8(left);
+  __m128i sum_above = dc_sum_16(above);  // aligned load: above % 16 == 0
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // 16 above + 8 left pixels
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 12;  // half of the divisor: round to nearest
+  sum /= 24;  // 16 + 8 edge pixels
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+  dc_store_16xh(&row, 8, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  const __m128i sum_left = dc_sum_32(left);  // aligned loads: left % 16 == 0
+  __m128i sum_above = dc_sum_16(above);  // aligned load: above % 16 == 0
+  sum_above = _mm_add_epi16(sum_left, sum_above);  // 16 above + 32 left pixels
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 24;  // half of the divisor: round to nearest
+  sum /= 48;  // 16 + 32 edge pixels
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+  dc_store_16xh(&row, 32, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                 const uint8_t *above, const uint8_t *left) {
+  __m128i sum_above = dc_sum_32(above);  // aligned loads: above % 16 == 0
+  const __m128i sum_left = dc_sum_16(left);  // aligned load: left % 16 == 0
+  sum_above = _mm_add_epi16(sum_above, sum_left);  // 32 above + 16 left pixels
+
+  uint32_t sum = _mm_cvtsi128_si32(sum_above);
+  sum += 24;  // half of the divisor: round to nearest
+  sum /= 48;  // 32 + 16 edge pixels
+  const __m128i row = _mm_set1_epi8((uint8_t)sum);  // DC value in every byte
+  dc_store_32xh(&row, 16, dst, stride);  // aligned stores into dst
+}
+
+// -----------------------------------------------------------------------------
+// DC_TOP
+
+void aom_dc_top_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is averaged
+  __m128i sum_above = dc_sum_4(above);
+  const __m128i two = _mm_set1_epi16((int16_t)2);
+  sum_above = _mm_add_epi16(sum_above, two);  // +2 rounds the divide by 4
+  sum_above = _mm_srai_epi16(sum_above, 2);  // word 0 = (sum + 2) >> 2
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // copy word 0 to words 0..3
+  sum_above = _mm_packus_epi16(sum_above, sum_above);  // words -> bytes
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_above);  // low 4 bytes = one row
+  dc_store_4x8(pred, dst, stride);
+}
+
+void aom_dc_top_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is averaged
+  __m128i sum_above = dc_sum_8(above);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_above = _mm_add_epi16(sum_above, four);  // +4 rounds the divide by 8
+  sum_above = _mm_srai_epi16(sum_above, 3);  // word 0 = (sum + 4) >> 3
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // word 0 = DC byte x2
+  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);  // low 8 bytes = DC
+  dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_top_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is averaged
+  __m128i sum_above = dc_sum_8(above);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_above = _mm_add_epi16(sum_above, four);  // +4 rounds the divide by 8
+  sum_above = _mm_srai_epi16(sum_above, 3);  // word 0 = (sum + 4) >> 3
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // word 0 = DC byte x2
+  const __m128i row = _mm_shufflelo_epi16(sum_above, 0);  // low 8 bytes = DC
+  dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_top_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)left;  // unused: only the row above is averaged
+  __m128i sum_above = dc_sum_16(above);  // aligned load: above % 16 == 0
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);  // +8 rounds the divide by 16
+  sum_above = _mm_srai_epi16(sum_above, 4);  // word 0 = (sum + 8) >> 4
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // word 0 = DC byte x2
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // low 8 bytes = DC
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // all 16 bytes
+  dc_store_16xh(&row, 8, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_top_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused: only the row above is averaged
+  __m128i sum_above = dc_sum_16(above);  // aligned load: above % 16 == 0
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_above = _mm_add_epi16(sum_above, eight);  // +8 rounds the divide by 16
+  sum_above = _mm_srai_epi16(sum_above, 4);  // word 0 = (sum + 8) >> 4
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // word 0 = DC byte x2
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // low 8 bytes = DC
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // all 16 bytes
+  dc_store_16xh(&row, 32, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_top_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)left;  // unused: only the row above is averaged
+  __m128i sum_above = dc_sum_32(above);  // aligned loads: above % 16 == 0
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_above = _mm_add_epi16(sum_above, sixteen);  // +16 rounds the divide by 32
+  sum_above = _mm_srai_epi16(sum_above, 5);  // word 0 = (sum + 16) >> 5
+  sum_above = _mm_unpacklo_epi8(sum_above, sum_above);  // word 0 = DC byte x2
+  sum_above = _mm_shufflelo_epi16(sum_above, 0);  // low 8 bytes = DC
+  const __m128i row = _mm_unpacklo_epi64(sum_above, sum_above);  // all 16 bytes
+  dc_store_32xh(&row, 16, dst, stride);  // aligned stores into dst
+}
+
+// -----------------------------------------------------------------------------
+// DC_LEFT
+
+void aom_dc_left_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: only the left column is averaged
+  __m128i sum_left = dc_sum_8(left);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_left = _mm_add_epi16(sum_left, four);  // +4 rounds the divide by 8
+  sum_left = _mm_srai_epi16(sum_left, 3);  // word 0 = (sum + 4) >> 3
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);  // copy word 0 to words 0..3
+  sum_left = _mm_packus_epi16(sum_left, sum_left);  // words -> bytes
+
+  const uint32_t pred = _mm_cvtsi128_si32(sum_left);  // low 4 bytes = one row
+  dc_store_4x8(pred, dst, stride);
+}
+
+void aom_dc_left_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: only the left column is averaged
+  __m128i sum_left = dc_sum_4(left);  // dc_sum_4 loads 8 bytes from left
+  const __m128i two = _mm_set1_epi16((uint16_t)2);
+  sum_left = _mm_add_epi16(sum_left, two);  // +2 rounds the divide by 4
+  sum_left = _mm_srai_epi16(sum_left, 2);  // word 0 = (sum + 2) >> 2
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // word 0 = DC byte x2
+  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);  // low 8 bytes = DC
+  dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_left_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // unused: only the left column is averaged
+  __m128i sum_left = dc_sum_16(left);  // aligned load: left % 16 == 0
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);  // +8 rounds the divide by 16
+  sum_left = _mm_srai_epi16(sum_left, 4);  // word 0 = (sum + 8) >> 4
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // word 0 = DC byte x2
+  const __m128i row = _mm_shufflelo_epi16(sum_left, 0);  // low 8 bytes = DC
+  dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_left_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // unused: only the left column is averaged
+  __m128i sum_left = dc_sum_8(left);
+  const __m128i four = _mm_set1_epi16((uint16_t)4);
+  sum_left = _mm_add_epi16(sum_left, four);  // +4 rounds the divide by 8
+  sum_left = _mm_srai_epi16(sum_left, 3);  // word 0 = (sum + 4) >> 3
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // word 0 = DC byte x2
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);  // low 8 bytes = DC
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);  // all 16 bytes
+  dc_store_16xh(&row, 8, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_left_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;  // unused: only the left column is averaged
+  __m128i sum_left = dc_sum_32(left);  // aligned loads: left % 16 == 0
+  const __m128i sixteen = _mm_set1_epi16((uint16_t)16);
+  sum_left = _mm_add_epi16(sum_left, sixteen);  // +16 rounds the divide by 32
+  sum_left = _mm_srai_epi16(sum_left, 5);  // word 0 = (sum + 16) >> 5
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // word 0 = DC byte x2
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);  // low 8 bytes = DC
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);  // all 16 bytes
+  dc_store_16xh(&row, 32, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_left_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                      const uint8_t *above,
+                                      const uint8_t *left) {
+  (void)above;  // unused: only the left column is averaged
+  __m128i sum_left = dc_sum_16(left);  // aligned load: left % 16 == 0
+  const __m128i eight = _mm_set1_epi16((uint16_t)8);
+  sum_left = _mm_add_epi16(sum_left, eight);  // +8 rounds the divide by 16
+  sum_left = _mm_srai_epi16(sum_left, 4);  // word 0 = (sum + 8) >> 4
+  sum_left = _mm_unpacklo_epi8(sum_left, sum_left);  // word 0 = DC byte x2
+  sum_left = _mm_shufflelo_epi16(sum_left, 0);  // low 8 bytes = DC
+  const __m128i row = _mm_unpacklo_epi64(sum_left, sum_left);  // all 16 bytes
+  dc_store_32xh(&row, 16, dst, stride);  // aligned stores into dst
+}
+
+// -----------------------------------------------------------------------------
+// DC_128
+
+void aom_dc_128_predictor_4x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: the fill value is constant
+  (void)left;
+  const uint32_t pred = 0x80808080;  // four pixels of 128 (0x80)
+  dc_store_4x8(pred, dst, stride);
+}
+
+void aom_dc_128_predictor_8x4_sse2(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: the fill value is constant
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);  // 128 in every byte
+  dc_store_8xh(&row, 4, dst, stride);
+}
+
+void aom_dc_128_predictor_8x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: the fill value is constant
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);  // 128 in every byte
+  dc_store_8xh(&row, 16, dst, stride);
+}
+
+void aom_dc_128_predictor_16x8_sse2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  (void)above;  // unused: the fill value is constant
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);  // 128 in every byte
+  dc_store_16xh(&row, 8, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_128_predictor_16x32_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // unused: the fill value is constant
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);  // 128 in every byte
+  dc_store_16xh(&row, 32, dst, stride);  // aligned stores into dst
+}
+
+void aom_dc_128_predictor_32x16_sse2(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  (void)above;  // unused: the fill value is constant
+  (void)left;
+  const __m128i row = _mm_set1_epi8((uint8_t)128);  // 128 in every byte
+  dc_store_32xh(&row, 16, dst, stride);  // aligned stores into dst
+}
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 512aab4..9306c8b 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -29,27 +29,32 @@
 
 const int count_test_block = 100000;
 
-typedef void (*IntraPred)(uint16_t *dst, ptrdiff_t stride,
-                          const uint16_t *above, const uint16_t *left, int bps);
+typedef void (*HighbdIntraPred)(uint16_t *dst, ptrdiff_t stride,
+                                const uint16_t *above, const uint16_t *left,
+                                int bps);
+typedef void (*IntraPred)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above,
+                          const uint8_t *left);
 
+template <typename FuncType>
 struct IntraPredFunc {
-  IntraPredFunc(IntraPred pred = NULL, IntraPred ref = NULL,
+  IntraPredFunc(FuncType pred = NULL, FuncType ref = NULL,
                 int block_width_value = 0, int block_height_value = 0,
                 int bit_depth_value = 0)
       : pred_fn(pred), ref_fn(ref), block_width(block_width_value),
         block_height(block_height_value), bit_depth(bit_depth_value) {}
 
-  IntraPred pred_fn;
-  IntraPred ref_fn;
+  FuncType pred_fn;
+  FuncType ref_fn;
   int block_width;
   int block_height;
   int bit_depth;
 };
 
-class AV1IntraPredTest : public ::testing::TestWithParam<IntraPredFunc> {
+template <typename FuncType, typename Pixel>
+class AV1IntraPredTest
+    : public ::testing::TestWithParam<IntraPredFunc<FuncType> > {
  public:
-  void RunTest(uint16_t *left_col, uint16_t *above_data, uint16_t *dst,
-               uint16_t *ref_dst) {
+  void RunTest(Pixel *left_col, Pixel *above_data, Pixel *dst, Pixel *ref_dst) {
     ACMRandom rnd(ACMRandom::DeterministicSeed());
     const int block_width = params_.block_width;
     const int block_height = params_.block_height;
@@ -82,17 +87,12 @@
 
  protected:
   virtual void SetUp() {
-    params_ = GetParam();
+    params_ = this->GetParam();
     stride_ = params_.block_width * 3;
     mask_ = (1 << params_.bit_depth) - 1;
   }
 
-  void Predict() {
-    const int bit_depth = params_.bit_depth;
-    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
-    ASM_REGISTER_STATE_CHECK(
-        params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
-  }
+  virtual void Predict() = 0;
 
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
@@ -109,17 +109,36 @@
     }
   }
 
-  uint16_t *above_row_;
-  uint16_t *left_col_;
-  uint16_t *dst_;
-  uint16_t *ref_dst_;
+  Pixel *above_row_;
+  Pixel *left_col_;
+  Pixel *dst_;
+  Pixel *ref_dst_;
   ptrdiff_t stride_;
   int mask_;
 
-  IntraPredFunc params_;
+  IntraPredFunc<FuncType> params_;
 };
 
-TEST_P(AV1IntraPredTest, IntraPredTests) {
+class HighbdIntraPredTest : public AV1IntraPredTest<HighbdIntraPred, uint16_t> {
+ protected:
+  void Predict() {  // implements the base class's pure virtual Predict()
+    const int bit_depth = params_.bit_depth;  // extra argument of HighbdIntraPred
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
+  }
+};
+
+class LowbdIntraPredTest : public AV1IntraPredTest<IntraPred, uint8_t> {
+ protected:
+  void Predict() {  // implements the base class's pure virtual Predict()
+    params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);  // into ref_dst_
+    ASM_REGISTER_STATE_CHECK(
+        params_.pred_fn(dst_, stride_, above_row_, left_col_));  // into dst_
+  }
+};
+
+TEST_P(HighbdIntraPredTest, IntraPredTests) {
   // max block size is 32
   DECLARE_ALIGNED(16, uint16_t, left_col[2 * 32]);
   DECLARE_ALIGNED(16, uint16_t, above_data[2 * 32 + 32]);
@@ -128,229 +147,82 @@
   RunTest(left_col, above_data, dst, ref_dst);
 }
 
-#define highbd_entry(type, width, height, opt, bd)                            \
-  IntraPredFunc(&aom_highbd_##type##_predictor_##width##x##height##_##opt,    \
-                &aom_highbd_##type##_predictor_##width##x##height##_c, width, \
-                height, bd)
+TEST_P(LowbdIntraPredTest, IntraPredTests) {
+  // max block size is 32
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);  // SetUp: stride = 3 * width
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  RunTest(left_col, above_data, dst, ref_dst);
+}
+
+#define highbd_entry(type, width, height, opt, bd)                          \
+  IntraPredFunc<HighbdIntraPred>(                                           \
+      &aom_highbd_##type##_predictor_##width##x##height##_##opt,            \
+      &aom_highbd_##type##_predictor_##width##x##height##_c, width, height, \
+      bd)
+
+#define highbd_intrapred(type, opt, bd)                                       \
+  highbd_entry(type, 4, 4, opt, bd), highbd_entry(type, 4, 8, opt, bd),       \
+      highbd_entry(type, 8, 4, opt, bd), highbd_entry(type, 8, 8, opt, bd),   \
+      highbd_entry(type, 8, 16, opt, bd), highbd_entry(type, 16, 8, opt, bd), \
+      highbd_entry(type, 16, 16, opt, bd),                                    \
+      highbd_entry(type, 16, 32, opt, bd),                                    \
+      highbd_entry(type, 32, 16, opt, bd), highbd_entry(type, 32, 32, opt, bd)
 
 #if HAVE_SSE2
 #if CONFIG_HIGHBITDEPTH
-const IntraPredFunc IntraPredTestVector8[] = {
-  highbd_entry(dc, 4, 4, sse2, 8),
-  highbd_entry(dc, 4, 8, sse2, 8),
-  highbd_entry(dc, 8, 4, sse2, 8),
-  highbd_entry(dc, 8, 8, sse2, 8),
-  highbd_entry(dc, 8, 16, sse2, 8),
-  highbd_entry(dc, 16, 8, sse2, 8),
-  highbd_entry(dc, 16, 16, sse2, 8),
-  highbd_entry(dc, 16, 32, sse2, 8),
-  highbd_entry(dc, 32, 16, sse2, 8),
-  highbd_entry(dc, 32, 32, sse2, 8),
-
-  highbd_entry(dc_left, 4, 4, sse2, 8),
-  highbd_entry(dc_left, 4, 8, sse2, 8),
-  highbd_entry(dc_top, 4, 4, sse2, 8),
-  highbd_entry(dc_top, 4, 8, sse2, 8),
-  highbd_entry(dc_128, 4, 4, sse2, 8),
-  highbd_entry(dc_128, 4, 8, sse2, 8),
-
-  highbd_entry(dc_left, 8, 4, sse2, 8),
-  highbd_entry(dc_top, 8, 4, sse2, 8),
-  highbd_entry(dc_128, 8, 4, sse2, 8),
-  highbd_entry(dc_left, 8, 8, sse2, 8),
-  highbd_entry(dc_top, 8, 8, sse2, 8),
-  highbd_entry(dc_128, 8, 8, sse2, 8),
-  highbd_entry(dc_left, 8, 16, sse2, 8),
-  highbd_entry(dc_top, 8, 16, sse2, 8),
-  highbd_entry(dc_128, 8, 16, sse2, 8),
-
-  highbd_entry(dc_left, 16, 8, sse2, 8),
-  highbd_entry(dc_top, 16, 8, sse2, 8),
-  highbd_entry(dc_128, 16, 8, sse2, 8),
-  highbd_entry(dc_left, 16, 16, sse2, 8),
-  highbd_entry(dc_top, 16, 16, sse2, 8),
-  highbd_entry(dc_128, 16, 16, sse2, 8),
-  highbd_entry(dc_left, 16, 32, sse2, 8),
-  highbd_entry(dc_top, 16, 32, sse2, 8),
-  highbd_entry(dc_128, 16, 32, sse2, 8),
-
-  highbd_entry(dc_left, 32, 16, sse2, 8),
-  highbd_entry(dc_top, 32, 16, sse2, 8),
-  highbd_entry(dc_128, 32, 16, sse2, 8),
-  highbd_entry(dc_left, 32, 32, sse2, 8),
-  highbd_entry(dc_top, 32, 32, sse2, 8),
-  highbd_entry(dc_128, 32, 32, sse2, 8),
-
-  highbd_entry(v, 4, 4, sse2, 8),
-  highbd_entry(v, 4, 8, sse2, 8),
-  highbd_entry(v, 8, 4, sse2, 8),
-  highbd_entry(v, 8, 8, sse2, 8),
-  highbd_entry(v, 8, 16, sse2, 8),
-  highbd_entry(v, 16, 8, sse2, 8),
-  highbd_entry(v, 16, 16, sse2, 8),
-  highbd_entry(v, 16, 32, sse2, 8),
-  highbd_entry(v, 32, 16, sse2, 8),
-  highbd_entry(v, 32, 32, sse2, 8),
-
-  highbd_entry(h, 4, 4, sse2, 8),
-  highbd_entry(h, 4, 8, sse2, 8),
-  highbd_entry(h, 8, 4, sse2, 8),
-  highbd_entry(h, 8, 8, sse2, 8),
-  highbd_entry(h, 8, 16, sse2, 8),
-  highbd_entry(h, 16, 8, sse2, 8),
-  highbd_entry(h, 16, 16, sse2, 8),
-  highbd_entry(h, 16, 32, sse2, 8),
-  highbd_entry(h, 32, 16, sse2, 8),
-  highbd_entry(h, 32, 32, sse2, 8),
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVector8[] = {
+  highbd_intrapred(dc, sse2, 8),     highbd_intrapred(dc_left, sse2, 8),
+  highbd_intrapred(dc_top, sse2, 8), highbd_intrapred(dc_128, sse2, 8),
+  highbd_intrapred(h, sse2, 8),      highbd_intrapred(v, sse2, 8),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, AV1IntraPredTest,
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, HighbdIntraPredTest,
                         ::testing::ValuesIn(IntraPredTestVector8));
 
-const IntraPredFunc IntraPredTestVector10[] = {
-  highbd_entry(dc, 4, 4, sse2, 10),
-  highbd_entry(dc, 4, 8, sse2, 10),
-  highbd_entry(dc, 8, 4, sse2, 10),
-  highbd_entry(dc, 8, 8, sse2, 10),
-  highbd_entry(dc, 8, 16, sse2, 10),
-  highbd_entry(dc, 16, 8, sse2, 10),
-  highbd_entry(dc, 16, 16, sse2, 10),
-  highbd_entry(dc, 16, 32, sse2, 10),
-  highbd_entry(dc, 32, 16, sse2, 10),
-  highbd_entry(dc, 32, 32, sse2, 10),
-
-  highbd_entry(dc_left, 4, 4, sse2, 10),
-  highbd_entry(dc_left, 4, 8, sse2, 10),
-  highbd_entry(dc_top, 4, 4, sse2, 10),
-  highbd_entry(dc_top, 4, 8, sse2, 10),
-  highbd_entry(dc_128, 4, 4, sse2, 10),
-  highbd_entry(dc_128, 4, 8, sse2, 10),
-
-  highbd_entry(dc_left, 8, 4, sse2, 10),
-  highbd_entry(dc_top, 8, 4, sse2, 10),
-  highbd_entry(dc_128, 8, 4, sse2, 10),
-  highbd_entry(dc_left, 8, 8, sse2, 10),
-  highbd_entry(dc_top, 8, 8, sse2, 10),
-  highbd_entry(dc_128, 8, 8, sse2, 10),
-  highbd_entry(dc_left, 8, 16, sse2, 10),
-  highbd_entry(dc_top, 8, 16, sse2, 10),
-  highbd_entry(dc_128, 8, 16, sse2, 10),
-
-  highbd_entry(dc_left, 16, 8, sse2, 10),
-  highbd_entry(dc_top, 16, 8, sse2, 10),
-  highbd_entry(dc_128, 16, 8, sse2, 10),
-  highbd_entry(dc_left, 16, 16, sse2, 10),
-  highbd_entry(dc_top, 16, 16, sse2, 10),
-  highbd_entry(dc_128, 16, 16, sse2, 10),
-  highbd_entry(dc_left, 16, 32, sse2, 10),
-  highbd_entry(dc_top, 16, 32, sse2, 10),
-  highbd_entry(dc_128, 16, 32, sse2, 10),
-
-  highbd_entry(dc_left, 32, 16, sse2, 10),
-  highbd_entry(dc_top, 32, 16, sse2, 10),
-  highbd_entry(dc_128, 32, 16, sse2, 10),
-  highbd_entry(dc_left, 32, 32, sse2, 10),
-  highbd_entry(dc_top, 32, 32, sse2, 10),
-  highbd_entry(dc_128, 32, 32, sse2, 10),
-
-  highbd_entry(v, 4, 4, sse2, 10),
-  highbd_entry(v, 4, 8, sse2, 10),
-  highbd_entry(v, 8, 4, sse2, 10),
-  highbd_entry(v, 8, 8, sse2, 10),
-  highbd_entry(v, 8, 16, sse2, 10),
-  highbd_entry(v, 16, 8, sse2, 10),
-  highbd_entry(v, 16, 16, sse2, 10),
-  highbd_entry(v, 16, 32, sse2, 10),
-  highbd_entry(v, 32, 16, sse2, 10),
-  highbd_entry(v, 32, 32, sse2, 10),
-
-  highbd_entry(h, 4, 4, sse2, 10),
-  highbd_entry(h, 4, 8, sse2, 10),
-  highbd_entry(h, 8, 4, sse2, 10),
-  highbd_entry(h, 8, 8, sse2, 10),
-  highbd_entry(h, 8, 16, sse2, 10),
-  highbd_entry(h, 16, 8, sse2, 10),
-  highbd_entry(h, 16, 16, sse2, 10),
-  highbd_entry(h, 16, 32, sse2, 10),
-  highbd_entry(h, 32, 16, sse2, 10),
-  highbd_entry(h, 32, 32, sse2, 10),
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVector10[] = {
+  highbd_intrapred(dc, sse2, 10),     highbd_intrapred(dc_left, sse2, 10),
+  highbd_intrapred(dc_top, sse2, 10), highbd_intrapred(dc_128, sse2, 10),
+  highbd_intrapred(h, sse2, 10),      highbd_intrapred(v, sse2, 10),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, AV1IntraPredTest,
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, HighbdIntraPredTest,
                         ::testing::ValuesIn(IntraPredTestVector10));
 
-const IntraPredFunc IntraPredTestVector12[] = {
-  highbd_entry(dc, 4, 4, sse2, 12),
-  highbd_entry(dc, 4, 8, sse2, 12),
-  highbd_entry(dc, 8, 4, sse2, 12),
-  highbd_entry(dc, 8, 8, sse2, 12),
-  highbd_entry(dc, 8, 16, sse2, 12),
-  highbd_entry(dc, 16, 8, sse2, 12),
-  highbd_entry(dc, 16, 16, sse2, 12),
-  highbd_entry(dc, 16, 32, sse2, 12),
-  highbd_entry(dc, 32, 16, sse2, 12),
-  highbd_entry(dc, 32, 32, sse2, 12),
-
-  highbd_entry(dc_left, 4, 4, sse2, 12),
-  highbd_entry(dc_left, 4, 8, sse2, 12),
-  highbd_entry(dc_top, 4, 4, sse2, 12),
-  highbd_entry(dc_top, 4, 8, sse2, 12),
-  highbd_entry(dc_128, 4, 4, sse2, 12),
-  highbd_entry(dc_128, 4, 8, sse2, 12),
-
-  highbd_entry(dc_left, 8, 4, sse2, 12),
-  highbd_entry(dc_top, 8, 4, sse2, 12),
-  highbd_entry(dc_128, 8, 4, sse2, 12),
-  highbd_entry(dc_left, 8, 8, sse2, 12),
-  highbd_entry(dc_top, 8, 8, sse2, 12),
-  highbd_entry(dc_128, 8, 8, sse2, 12),
-  highbd_entry(dc_left, 8, 16, sse2, 12),
-  highbd_entry(dc_top, 8, 16, sse2, 12),
-  highbd_entry(dc_128, 8, 16, sse2, 12),
-
-  highbd_entry(dc_left, 16, 8, sse2, 12),
-  highbd_entry(dc_top, 16, 8, sse2, 12),
-  highbd_entry(dc_128, 16, 8, sse2, 12),
-  highbd_entry(dc_left, 16, 16, sse2, 12),
-  highbd_entry(dc_top, 16, 16, sse2, 12),
-  highbd_entry(dc_128, 16, 16, sse2, 12),
-  highbd_entry(dc_left, 16, 32, sse2, 12),
-  highbd_entry(dc_top, 16, 32, sse2, 12),
-  highbd_entry(dc_128, 16, 32, sse2, 12),
-
-  highbd_entry(dc_left, 32, 16, sse2, 12),
-  highbd_entry(dc_top, 32, 16, sse2, 12),
-  highbd_entry(dc_128, 32, 16, sse2, 12),
-  highbd_entry(dc_left, 32, 32, sse2, 12),
-  highbd_entry(dc_top, 32, 32, sse2, 12),
-  highbd_entry(dc_128, 32, 32, sse2, 12),
-
-  highbd_entry(v, 4, 4, sse2, 12),
-  highbd_entry(v, 4, 8, sse2, 12),
-  highbd_entry(v, 8, 4, sse2, 12),
-  highbd_entry(v, 8, 8, sse2, 12),
-  highbd_entry(v, 8, 16, sse2, 12),
-  highbd_entry(v, 16, 8, sse2, 12),
-  highbd_entry(v, 16, 16, sse2, 12),
-  highbd_entry(v, 16, 32, sse2, 12),
-  highbd_entry(v, 32, 16, sse2, 12),
-  highbd_entry(v, 32, 32, sse2, 12),
-
-  highbd_entry(h, 4, 4, sse2, 12),
-  highbd_entry(h, 4, 8, sse2, 12),
-  highbd_entry(h, 8, 4, sse2, 12),
-  highbd_entry(h, 8, 8, sse2, 12),
-  highbd_entry(h, 8, 16, sse2, 12),
-  highbd_entry(h, 16, 8, sse2, 12),
-  highbd_entry(h, 16, 16, sse2, 12),
-  highbd_entry(h, 16, 32, sse2, 12),
-  highbd_entry(h, 32, 16, sse2, 12),
-  highbd_entry(h, 32, 32, sse2, 12),
+const IntraPredFunc<HighbdIntraPred> IntraPredTestVector12[] = {
+  highbd_intrapred(dc, sse2, 12),     highbd_intrapred(dc_left, sse2, 12),
+  highbd_intrapred(dc_top, sse2, 12), highbd_intrapred(dc_128, sse2, 12),
+  highbd_intrapred(h, sse2, 12),      highbd_intrapred(v, sse2, 12),
 };
 
-INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, AV1IntraPredTest,
+INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, HighbdIntraPredTest,
                         ::testing::ValuesIn(IntraPredTestVector12));
 
 #endif  // CONFIG_HIGHBITDEPTH
 #endif  // HAVE_SSE2
+
+#define lowbd_entry(type, width, height, opt)                                  \
+  IntraPredFunc<IntraPred>(&aom_##type##_predictor_##width##x##height##_##opt, \
+                           &aom_##type##_predictor_##width##x##height##_c,     \
+                           width, height, 8)
+
+#define lowbd_intrapred(type, opt)                                    \
+  lowbd_entry(type, 4, 4, opt), lowbd_entry(type, 4, 8, opt),         \
+      lowbd_entry(type, 8, 4, opt), lowbd_entry(type, 8, 8, opt),     \
+      lowbd_entry(type, 8, 16, opt), lowbd_entry(type, 16, 8, opt),   \
+      lowbd_entry(type, 16, 16, opt), lowbd_entry(type, 16, 32, opt), \
+      lowbd_entry(type, 32, 16, opt), lowbd_entry(type, 32, 32, opt)
+
+#if HAVE_SSE2
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVector[] = {
+  lowbd_intrapred(dc, sse2), lowbd_intrapred(dc_top, sse2),  // 10 sizes each
+  lowbd_intrapred(dc_left, sse2), lowbd_intrapred(dc_128, sse2),
+};
+
+INSTANTIATE_TEST_CASE_P(SSE2, LowbdIntraPredTest,
+                        ::testing::ValuesIn(LowbdIntraPredTestVector));
+
+#endif  // HAVE_SSE2
+
 }  // namespace
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index c7f9665..7186f8c 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -122,7 +122,7 @@
 }
 
 void TestIntraPred4(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
+  static const char *const kSignatures4x4[kNumAv1IntraFuncs] = {
     "e7ed7353c3383fff942e500e9bfe82fe",
     "2a4a26fcc6ce005eadc08354d196c8a9",
     "269d92eff86f315d9c38fe7640d85b15",
@@ -142,11 +142,36 @@
     "723cf948137f7d8c7860d814e55ae67d",
 #endif  // CONFIG_SMOOTH_HV
   };
-  TestIntraPred(block_name, pred_funcs, kSignatures, 4, 4);
+  static const char *const kSignatures4x8[kNumAv1IntraFuncs] = {
+    "d9fbebdc85f71ab1e18461b2db4a2adc",
+    "5ccb2a68284bc9714d94b8a06ccadbb2",
+    "735d059abc2744f3ff3f9590f7191b37",
+    "d9fbebdc85f71ab1e18461b2db4a2adc",
+    "6819497c44cd0ace120add83672996ee",
+    "7e3244f5a2d3edf81c7e962a842b97f9",
+    "3fa52ee9acf5a25594cac684be263f32",
+    "c18dd23d57def4df4c6147c572dfc827",
+    "d007fbf7e43cb8f49702daa20f0c9153",
+    "5c0226c44c5df285728296b80cc6de4b",
+    "b55d7b558bebc8c2042dfac58b3c4688",
+    "6549362baa389b8faa2d954926b64e2f",
+    "809350f164cd4d1650850bb0f59c3260",
+    "1b60a394331eeab6927a6f8aaff57040",
+#if CONFIG_SMOOTH_HV
+    "5307de1bd7329ba6b281d2c1b0b457f9",
+    "24c58a8138339846d95568efb91751db",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  if (!strcmp(block_name, "intra4x4")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures4x4, 4, 4);
+  }
+  if (!strcmp(block_name, "intra4x8")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures4x8, 4, 8);
+  }  // NOTE(review): any other block_name falls through and tests nothing.
 }
 
 void TestIntraPred8(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
+  static const char *const kSignatures8x8[kNumAv1IntraFuncs] = {
     "d8bbae5d6547cfc17e4f5f44c8730e88",
     "373bab6d931868d41a601d9d88ce9ac3",
     "6fdd5ff4ff79656c14747598ca9e3706",
@@ -166,11 +191,59 @@
     "d19a8a73cc46b807f2c5e817576cc1e1",
 #endif  // CONFIG_SMOOTH_HV
   };
-  TestIntraPred(block_name, pred_funcs, kSignatures, 8, 8);
+  static const char *const kSignatures8x4[kNumAv1IntraFuncs] = {
+    "23f9fc11344426c9bee2e06d57dfd628",
+    "2d71a26d1bae1fb34734de7b42fc5eb7",
+    "5af9c1b2fd9d5721fad67b67b3f7c816",
+    "00d71b17be662753813d515f197d145e",
+    "bef10ec984427e28f4390f43809d10af",
+    "77773cdfb7ed6bc882ab202a64b0a470",
+    "cba356970f6b9a1b6024e1dbe4a66f9b",
+    "c58c21efc804242848e6f29a93a7984d",
+    "dc92cc45a51c7a397506cab19f74e66d",
+    "391f6a12224f81a3719ea09a2cf7a5ad",
+    "b74b8b11f7eb2bbf723b25f381104ca9",
+    "2234aaa06ca245624211cf53a0261017",
+    "2cc48bd66d6b0121b5221d52ccd732af",
+    "b302155e1c9eeeafe2ba2bf68e807a46",
+#if CONFIG_SMOOTH_HV
+    "561bc8d0e76d5041ebd5168fc6a115e1",
+    "81d0113fb1d0a9a24ffd6f1987b77948",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  static const char *const kSignatures8x16[kNumAv1IntraFuncs] = {
+    "c849de88b24f773dfcdd1d48d1209796",
+    "6cb807c1897b94866a0f3d3c56ed8695",
+    "d56db05a8ac7981762f5b877f486c4ef",
+    "b4bc01eb6e59a40922ad17715cafb04b",
+    "09d178439534f4062ae687c351f66d64",
+    "644501399cf73080ac606e5cef7ca09b",
+    "0e8e968fa177204d7e73d7e04ce69ebb",
+    "1d25f9287fdf7ba48a5105f1529b7e75",
+    "02cacccf3752451763a6a6e2e784494f",
+    "6044a1416d53e324ddc012d2e7763339",
+    "57ac6e8f3ab5e943c9280043eeb174b8",
+    "d51b9d65471194d9caebc7d67e75ef10",
+    "278076495180e17c065a95ab7278539a",
+    "9dd7f324816f242be408ffeb0c673732",
+#if CONFIG_SMOOTH_HV
+    "f520c4a20acfa0bea1d253c6f0f040fd",
+    "85f38df809df2c2d7c8b4a157a65cd44",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  if (!strcmp(block_name, "intra8x8")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures8x8, 8, 8);
+  }
+  if (!strcmp(block_name, "intra8x4")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures8x4, 8, 4);
+  }
+  if (!strcmp(block_name, "intra8x16")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures8x16, 8, 16);
+  }  // NOTE(review): any other block_name falls through and tests nothing.
 }
 
 void TestIntraPred16(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
+  static const char *const kSignatures16x16[kNumAv1IntraFuncs] = {
     "50971c07ce26977d30298538fffec619",
     "527a6b9e0dc5b21b98cf276305432bef",
     "7eff2868f80ebc2c43a4f367281d80f7",
@@ -190,11 +263,59 @@
     "0c5162bc28489756ddb847b5678e6f07",
 #endif  // CONFIG_SMOOTH_HV
   };
-  TestIntraPred(block_name, pred_funcs, kSignatures, 16, 16);
+  static const char *const kSignatures16x8[kNumAv1IntraFuncs] = {
+    "b4cbdbdf10ce13300b4063a3daf99e04",
+    "3731e1e6202064a9d0604d7c293ecee4",
+    "6c856188c4256a06452f0d5d70cac436",
+    "1f2192b4c8c497589484ea7bf9c944e8",
+    "84011bd4b7f565119d06787840e333a0",
+    "0e48949f7a6aa36f0d76b5d01f91124a",
+    "58114c06f6b9d8285e5020c7afd834ab",
+    "e37afe84a8b3c5e0f048d4652ecbe09e",
+    "c216348473fb029b45f8fb4f2862a7bd",
+    "0b7385155dcef742cc456d5741ae93a3",
+    "d55fadb221f0ea20266e57cd413e7b94",
+    "9bd6eb226c7e169b8d53cf70aea98b3a",
+    "60eff8064634b6c73b10681356baeee9",
+    "1559aeb081a9c0c71111d6093c2ff9fd",
+#if CONFIG_SMOOTH_HV
+    "c15479b739713773e5cabb748451987b",
+    "72e33ec12c9b67aea26d8d005fb82de2",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  static const char *const kSignatures16x32[kNumAv1IntraFuncs] = {
+    "abe5233d189cdbf79424721571bbaa7b",
+    "282759f81e3cfb2e2d396fe406b72a8b",
+    "e2224926c264f6f174cbc3167a233168",
+    "6814e85c2b33f8c9415d62e80394b47b",
+    "99cbbb60459c08a3061d72c4e4f6276a",
+    "1d1567d40b8e816f8c1f71e576fe0f87",
+    "5e989f9c748a0d2cd8c4ebf9d3fe1278",
+    "7135a2f419452a3a192a35156f68b019",
+    "06e10af5a726d2c81b8f8c708204f9fb",
+    "c0882f0e7ba1ffa0aeef6d5c751df6de",
+    "8477429e17d39a423f30e2082f651549",
+    "ba35068a30c2d1d10901e4bfabd02a11",
+    "36fdd371b624a075814d497c4832ec85",
+    "8ab8da61b727442b6ff692b40d0df018",
+#if CONFIG_SMOOTH_HV
+    "e35a10ad7fdf2327e821504a90f6a6eb",
+    "1f7211e727dc1de7d6a55d082fbdd821",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  if (!strcmp(block_name, "intra16x16")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures16x16, 16, 16);
+  }
+  if (!strcmp(block_name, "intra16x8")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures16x8, 16, 8);
+  }
+  if (!strcmp(block_name, "intra16x32")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures16x32, 16, 32);
+  }  // NOTE(review): any other block_name falls through and tests nothing.
 }
 
 void TestIntraPred32(const char *block_name, AvxPredFunc const *pred_funcs) {
-  static const char *const kSignatures[kNumAv1IntraFuncs] = {
+  static const char *const kSignatures32x32[kNumAv1IntraFuncs] = {
     "a0a618c900e65ae521ccc8af789729f2",
     "985aaa7c72b4a6c2fb431d32100cf13a",
     "10662d09febc3ca13ee4e700120daeb5",
@@ -214,7 +335,32 @@
     "cea23799fc3526e1b6a6ff02b42b82af",
 #endif  // CONFIG_SMOOTH_HV
   };
-  TestIntraPred(block_name, pred_funcs, kSignatures, 32, 32);
+  static const char *const kSignatures32x16[kNumAv1IntraFuncs] = {
+    "d1aeb8d5fdcfd3307922af01a798a4dc",
+    "b0bcb514ebfbee065faea9d34c12ae75",
+    "d6a18c63b4e909871c0137ca652fad23",
+    "fd047f2fc1b8ffb95d0eeef3e8796a45",
+    "645ab60779ea348fd93c81561c31bab9",
+    "4409633c9db8dff41ade4292a3a56e7f",
+    "b9b2935b2287a9a461ac5c11251ac706",
+    "43b05f808c0ac4fe8accd84d293b0488",
+    "1d2cb43872d20c205ffb185102bcd22a",
+    "2c1551b5e99592fd21053b5d14e397d9",
+    "cd499ef0dd41e2e38d5dac3319dfdd97",
+    "cd2610426637003f3b5d3984cb3320d5",
+    "5e36a11e069b31c2a739f3a9c7b37c24",
+    "e83b9483d702cfae496991c3c7fa92c0",
+#if CONFIG_SMOOTH_HV
+    "12f6ddf98c7f30a277307f1ea935b030",
+    "354321d6c32bbdb0739e4fa2acbf41e1",
+#endif  // CONFIG_SMOOTH_HV
+  };
+  if (!strcmp(block_name, "intra32x32")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures32x32, 32, 32);
+  }
+  if (!strcmp(block_name, "intra32x16")) {
+    TestIntraPred(block_name, pred_funcs, kSignatures32x16, 32, 16);
+  }  // NOTE(review): any other block_name falls through and tests nothing.
 }
 
 }  // namespace
@@ -243,7 +389,7 @@
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
 
-INTRA_PRED_TEST(C, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_c,
+INTRA_PRED_TEST(C_1, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_c,
                 aom_dc_left_predictor_4x4_c, aom_dc_top_predictor_4x4_c,
                 aom_dc_128_predictor_4x4_c, aom_v_predictor_4x4_c,
                 aom_h_predictor_4x4_c, aom_d45e_predictor_4x4_c,
@@ -256,12 +402,37 @@
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_4x8_c
+#define smooth_h_pred_func aom_smooth_h_predictor_4x8_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+
+INTRA_PRED_TEST(C_2, TestIntraPred4, "intra4x8", aom_dc_predictor_4x8_c,
+                aom_dc_left_predictor_4x8_c, aom_dc_top_predictor_4x8_c,
+                aom_dc_128_predictor_4x8_c, aom_v_predictor_4x8_c,
+                aom_h_predictor_4x8_c, aom_d45e_predictor_4x8_c,
+                aom_d135_predictor_4x8_c, aom_d117_predictor_4x8_c,
+                aom_d153_predictor_4x8_c, aom_d207e_predictor_4x8_c,
+                aom_d63e_predictor_4x8_c, aom_paeth_predictor_4x8_c,
+                aom_smooth_predictor_4x8_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_sse2,
+INTRA_PRED_TEST(SSE2_1, TestIntraPred4, "intra4x4", aom_dc_predictor_4x4_sse2,
                 aom_dc_left_predictor_4x4_sse2, aom_dc_top_predictor_4x4_sse2,
                 aom_dc_128_predictor_4x4_sse2, aom_v_predictor_4x4_sse2,
                 aom_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred4, "intra4x8", aom_dc_predictor_4x8_sse2,
+                aom_dc_left_predictor_4x8_sse2, aom_dc_top_predictor_4x8_sse2,
+                aom_dc_128_predictor_4x8_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)  // DC-family only
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -302,7 +473,7 @@
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_c,
+INTRA_PRED_TEST(C_1, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_c,
                 aom_dc_left_predictor_8x8_c, aom_dc_top_predictor_8x8_c,
                 aom_dc_128_predictor_8x8_c, aom_v_predictor_8x8_c,
                 aom_h_predictor_8x8_c, aom_d45e_predictor_8x8_c,
@@ -314,12 +485,58 @@
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_8x4_c
+#define smooth_h_pred_func aom_smooth_h_predictor_8x4_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_2, TestIntraPred8, "intra8x4", aom_dc_predictor_8x4_c,
+                aom_dc_left_predictor_8x4_c, aom_dc_top_predictor_8x4_c,
+                aom_dc_128_predictor_8x4_c, aom_v_predictor_8x4_c,
+                aom_h_predictor_8x4_c, aom_d45e_predictor_8x4_c,
+                aom_d135_predictor_8x4_c, aom_d117_predictor_8x4_c,
+                aom_d153_predictor_8x4_c, aom_d207e_predictor_8x4_c,
+                aom_d63e_predictor_8x4_c, aom_paeth_predictor_8x4_c,
+                aom_smooth_predictor_8x4_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_8x16_c
+#define smooth_h_pred_func aom_smooth_h_predictor_8x16_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_3, TestIntraPred8, "intra8x16", aom_dc_predictor_8x16_c,
+                aom_dc_left_predictor_8x16_c, aom_dc_top_predictor_8x16_c,
+                aom_dc_128_predictor_8x16_c, aom_v_predictor_8x16_c,
+                aom_h_predictor_8x16_c, aom_d45e_predictor_8x16_c,
+                aom_d135_predictor_8x16_c, aom_d117_predictor_8x16_c,
+                aom_d153_predictor_8x16_c, aom_d207e_predictor_8x16_c,
+                aom_d63e_predictor_8x16_c, aom_paeth_predictor_8x16_c,
+                aom_smooth_predictor_8x16_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_sse2,
+INTRA_PRED_TEST(SSE2_1, TestIntraPred8, "intra8x8", aom_dc_predictor_8x8_sse2,
                 aom_dc_left_predictor_8x8_sse2, aom_dc_top_predictor_8x8_sse2,
                 aom_dc_128_predictor_8x8_sse2, aom_v_predictor_8x8_sse2,
                 aom_h_predictor_8x8_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred8, "intra8x4", aom_dc_predictor_8x4_sse2,
+                aom_dc_left_predictor_8x4_sse2, aom_dc_top_predictor_8x4_sse2,
+                aom_dc_128_predictor_8x4_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)  // DC-family only
+INTRA_PRED_TEST(SSE2_3, TestIntraPred8, "intra8x16", aom_dc_predictor_8x16_sse2,
+                aom_dc_left_predictor_8x16_sse2, aom_dc_top_predictor_8x16_sse2,
+                aom_dc_128_predictor_8x16_sse2, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL)  // DC-family only
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -360,7 +577,7 @@
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C, TestIntraPred16, "intra16x16", aom_dc_predictor_16x16_c,
+INTRA_PRED_TEST(C_1, TestIntraPred16, "intra16x16", aom_dc_predictor_16x16_c,
                 aom_dc_left_predictor_16x16_c, aom_dc_top_predictor_16x16_c,
                 aom_dc_128_predictor_16x16_c, aom_v_predictor_16x16_c,
                 aom_h_predictor_16x16_c, aom_d45e_predictor_16x16_c,
@@ -372,13 +589,66 @@
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_16x8_c
+#define smooth_h_pred_func aom_smooth_h_predictor_16x8_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_2, TestIntraPred16, "intra16x8", aom_dc_predictor_16x8_c,
+                aom_dc_left_predictor_16x8_c, aom_dc_top_predictor_16x8_c,
+                aom_dc_128_predictor_16x8_c, aom_v_predictor_16x8_c,
+                aom_h_predictor_16x8_c, aom_d45e_predictor_16x8_c,
+                aom_d135_predictor_16x8_c, aom_d117_predictor_16x8_c,
+                aom_d153_predictor_16x8_c, aom_d207e_predictor_16x8_c,
+                aom_d63e_predictor_16x8_c, aom_paeth_predictor_16x8_c,
+                aom_smooth_predictor_16x8_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_16x32_c
+#define smooth_h_pred_func aom_smooth_h_predictor_16x32_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_3, TestIntraPred16, "intra16x32", aom_dc_predictor_16x32_c,
+                aom_dc_left_predictor_16x32_c, aom_dc_top_predictor_16x32_c,
+                aom_dc_128_predictor_16x32_c, aom_v_predictor_16x32_c,
+                aom_h_predictor_16x32_c, aom_d45e_predictor_16x32_c,
+                aom_d135_predictor_16x32_c, aom_d117_predictor_16x32_c,
+                aom_d153_predictor_16x32_c, aom_d207e_predictor_16x32_c,
+                aom_d63e_predictor_16x32_c, aom_paeth_predictor_16x32_c,
+                aom_smooth_predictor_16x32_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred16, "intra16x16",
+INTRA_PRED_TEST(SSE2_1, TestIntraPred16, "intra16x16",
                 aom_dc_predictor_16x16_sse2, aom_dc_left_predictor_16x16_sse2,
                 aom_dc_top_predictor_16x16_sse2,
                 aom_dc_128_predictor_16x16_sse2, aom_v_predictor_16x16_sse2,
                 aom_h_predictor_16x16_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred16, "intra16x8",
+                aom_dc_predictor_16x8_sse2, aom_dc_left_predictor_16x8_sse2,
+                aom_dc_top_predictor_16x8_sse2, aom_dc_128_predictor_16x8_sse2,
+                // The v and h slots stay NULL: aom_v_predictor_16x8_sse2 and
+                // aom_h_predictor_16x8_sse2 are not hooked up in this test.
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL)
+INTRA_PRED_TEST(SSE2_3, TestIntraPred16, "intra16x32",
+                aom_dc_predictor_16x32_sse2, aom_dc_left_predictor_16x32_sse2,
+                aom_dc_top_predictor_16x32_sse2,
+                aom_dc_128_predictor_16x32_sse2,
+                // The v and h slots stay NULL: aom_v_predictor_16x32_sse2 and
+                // aom_h_predictor_16x32_sse2 are not hooked up in this test.
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3
@@ -421,7 +691,7 @@
 #define smooth_v_pred_func NULL
 #define smooth_h_pred_func NULL
 #endif  // CONFIG_SMOOTH_HV
-INTRA_PRED_TEST(C, TestIntraPred32, "intra32x32", aom_dc_predictor_32x32_c,
+INTRA_PRED_TEST(C_1, TestIntraPred32, "intra32x32", aom_dc_predictor_32x32_c,
                 aom_dc_left_predictor_32x32_c, aom_dc_top_predictor_32x32_c,
                 aom_dc_128_predictor_32x32_c, aom_v_predictor_32x32_c,
                 aom_h_predictor_32x32_c, aom_d45e_predictor_32x32_c,
@@ -433,13 +703,40 @@
 #undef smooth_v_pred_func
 #undef smooth_h_pred_func
 
+#if CONFIG_SMOOTH_HV
+#define smooth_v_pred_func aom_smooth_v_predictor_32x16_c
+#define smooth_h_pred_func aom_smooth_h_predictor_32x16_c
+#else
+#define smooth_v_pred_func NULL
+#define smooth_h_pred_func NULL
+#endif  // CONFIG_SMOOTH_HV
+INTRA_PRED_TEST(C_2, TestIntraPred32, "intra32x16", aom_dc_predictor_32x16_c,
+                aom_dc_left_predictor_32x16_c, aom_dc_top_predictor_32x16_c,
+                aom_dc_128_predictor_32x16_c, aom_v_predictor_32x16_c,
+                aom_h_predictor_32x16_c, aom_d45e_predictor_32x16_c,
+                aom_d135_predictor_32x16_c, aom_d117_predictor_32x16_c,
+                aom_d153_predictor_32x16_c, aom_d207e_predictor_32x16_c,
+                aom_d63e_predictor_32x16_c, aom_paeth_predictor_32x16_c,
+                aom_smooth_predictor_32x16_c, smooth_v_pred_func,
+                smooth_h_pred_func)
+#undef smooth_v_pred_func
+#undef smooth_h_pred_func
+
 #if HAVE_SSE2
-INTRA_PRED_TEST(SSE2, TestIntraPred32, "intra32x32",
+INTRA_PRED_TEST(SSE2_1, TestIntraPred32, "intra32x32",
                 aom_dc_predictor_32x32_sse2, aom_dc_left_predictor_32x32_sse2,
                 aom_dc_top_predictor_32x32_sse2,
                 aom_dc_128_predictor_32x32_sse2, aom_v_predictor_32x32_sse2,
                 aom_h_predictor_32x32_sse2, NULL, NULL, NULL, NULL, NULL, NULL,
                 NULL, NULL, NULL, NULL)
+INTRA_PRED_TEST(SSE2_2, TestIntraPred32, "intra32x16",
+                aom_dc_predictor_32x16_sse2, aom_dc_left_predictor_32x16_sse2,
+                aom_dc_top_predictor_32x16_sse2,
+                aom_dc_128_predictor_32x16_sse2,
+                // The v and h slots stay NULL: aom_v_predictor_32x16_sse2 and
+                // aom_h_predictor_32x16_sse2 are not hooked up in this test.
+                NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+                NULL, NULL)
 #endif  // HAVE_SSE2
 
 #if HAVE_SSSE3