Remove 4-tap filter intra code

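We reverted to using 3-tap filters, so the 4-tap filter code is no
longer used. Remove the 4-tap tables and predictors and the
USE_3TAP_INTRA_FILTER guards in reconintra.c, and delete the SSE4.1
implementation together with its CMake and Makefile entries.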

Change-Id: I7f65cf227d2eb3e9785474e3b33d0bdbf489b1f1
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 35b6722..54dd08c 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -313,12 +313,6 @@
       ${AOM_AV1_ENCODER_INTRIN_SSE2}
       "${AOM_ROOT}/av1/encoder/x86/wedge_utils_sse2.c")
 
-if (CONFIG_FILTER_INTRA)
-  set(AOM_AV1_COMMON_INTRIN_SSE4_1
-      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
-      "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c")
-endif ()
-
 if (CONFIG_ACCOUNTING)
   set(AOM_AV1_DECODER_SOURCES
       ${AOM_AV1_DECODER_SOURCES}
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index fdb50ed..011cc45 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -147,10 +147,6 @@
 AV1_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht8x8_add_neon.c
 endif
 
-ifeq ($(CONFIG_FILTER_INTRA),yes)
-AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
-endif
-
 AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c
 AV1_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/warp_plane_ssse3.c
 ifeq ($(CONFIG_HIGHBITDEPTH),yes)
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index a5c2fb7..2ee7ec6 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1066,7 +1066,6 @@
 #endif  // CONFIG_EXT_INTRA
 
 #if CONFIG_FILTER_INTRA
-#if USE_3TAP_INTRA_FILTER
 static int filter_intra_taps_3[TX_SIZES_ALL][FILTER_INTRA_MODES][3] = {
   {
       { 697, 836, -509 },
@@ -1191,194 +1190,7 @@
       { 839, 911, -726 },
   }
 };
-#else
-static int filter_intra_taps_4[TX_SIZES_ALL][FILTER_INTRA_MODES][4] = {
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 423, 728, -347, 111 },
-      { 963, 685, -665, 23 },
-      { 281, 1024, -480, 216 },
-      { 640, 596, -437, 78 },
-      { 429, 669, -259, 99 },
-      { 740, 646, -415, 23 },
-      { 568, 771, -346, 40 },
-      { 404, 833, -486, 209 },
-      { 398, 712, -423, 307 },
-      { 939, 935, -887, 17 },
-  },
-  {
-      { 477, 737, -393, 150 },
-      { 881, 630, -546, 67 },
-      { 506, 984, -443, -20 },
-      { 114, 459, -270, 528 },
-      { 433, 528, 14, 3 },
-      { 837, 470, -301, -30 },
-      { 181, 777, 89, -107 },
-      { -29, 716, -232, 259 },
-      { 589, 646, -495, 255 },
-      { 740, 884, -728, 77 },
-  },
-#if CONFIG_TX64X64
-  {
-      { 477, 737, -393, 150 },
-      { 881, 630, -546, 67 },
-      { 506, 984, -443, -20 },
-      { 114, 459, -270, 528 },
-      { 433, 528, 14, 3 },
-      { 837, 470, -301, -30 },
-      { 181, 777, 89, -107 },
-      { -29, 716, -232, 259 },
-      { 589, 646, -495, 255 },
-      { 740, 884, -728, 77 },
-  },
-#endif  // CONFIG_TX64X64
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 423, 728, -347, 111 },
-      { 963, 685, -665, 23 },
-      { 281, 1024, -480, 216 },
-      { 640, 596, -437, 78 },
-      { 429, 669, -259, 99 },
-      { 740, 646, -415, 23 },
-      { 568, 771, -346, 40 },
-      { 404, 833, -486, 209 },
-      { 398, 712, -423, 307 },
-      { 939, 935, -887, 17 },
-  },
-  {
-      { 423, 728, -347, 111 },
-      { 963, 685, -665, 23 },
-      { 281, 1024, -480, 216 },
-      { 640, 596, -437, 78 },
-      { 429, 669, -259, 99 },
-      { 740, 646, -415, 23 },
-      { 568, 771, -346, 40 },
-      { 404, 833, -486, 209 },
-      { 398, 712, -423, 307 },
-      { 939, 935, -887, 17 },
-  },
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 735, 881, -537, -54 },
-      { 1005, 519, -488, -11 },
-      { 383, 990, -343, -6 },
-      { 442, 805, -542, 319 },
-      { 658, 616, -133, -116 },
-      { 875, 442, -141, -151 },
-      { 386, 741, -23, -80 },
-      { 390, 1027, -446, 51 },
-      { 679, 606, -523, 262 },
-      { 903, 922, -778, -23 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  },
-  {
-      { 648, 803, -444, 16 },
-      { 972, 620, -576, 7 },
-      { 561, 967, -499, -5 },
-      { 585, 762, -468, 144 },
-      { 596, 619, -182, -9 },
-      { 895, 459, -176, -153 },
-      { 557, 722, -126, -129 },
-      { 601, 839, -523, 105 },
-      { 562, 709, -499, 251 },
-      { 803, 872, -695, 43 },
-  }
-};
-#endif
 
-#if USE_3TAP_INTRA_FILTER
 static void filter_intra_predictors_3tap(uint8_t *dst, ptrdiff_t stride,
                                          TX_SIZE tx_size, const uint8_t *above,
                                          const uint8_t *left, int mode) {
@@ -1414,121 +1226,44 @@
     dst += stride;
   }
 }
-#else
-static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride,
-                                         TX_SIZE tx_size, const uint8_t *above,
-                                         const uint8_t *left, int mode) {
-  int r, c;
-  int mean, ipred;
-#if CONFIG_TX64X64
-  int buffer[65][129];
-#else
-  int buffer[33][65];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_4[tx_size][mode][0];
-  const int c1 = filter_intra_taps_4[tx_size][mode][1];
-  const int c2 = filter_intra_taps_4[tx_size][mode][2];
-  const int c3 = filter_intra_taps_4[tx_size][mode][3];
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-
-  mean = 0;
-  for (r = 0; r < bh; ++r) {
-    mean += (int)left[r];
-  }
-  for (c = 0; c < bw; ++c) {
-    mean += (int)above[c];
-  }
-  mean = (mean + ((bw + bh) >> 1)) / (bw + bh);
-
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r] - mean;
-
-  for (c = 0; c < 2 * bw + 1; ++c) buffer[0][c] = (int)above[c - 1] - mean;
-
-  for (r = 1; r < bh + 1; ++r)
-    for (c = 1; c < 2 * bw + 1 - r; ++c) {
-      ipred = c0 * buffer[r - 1][c] + c1 * buffer[r][c - 1] +
-              c2 * buffer[r - 1][c - 1] + c3 * buffer[r - 1][c + 1];
-      buffer[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
-      buffer[r][c] = clip_pixel(buffer[r][c] + mean) - mean;
-    }
-
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel(buffer[r + 1][c + 1] + mean);
-    }
-    dst += stride;
-  }
-}
-#endif
 
 void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
                                const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
   filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                FILTER_DC_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_DC_PRED);
-#endif
 }
 
 void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
                               const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
   filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                FILTER_V_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_V_PRED);
-#endif
 }
 
 void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
                               const uint8_t *above, const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
   filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                FILTER_H_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_H_PRED);
-#endif
 }
 
 void av1_d117_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
                                  TX_SIZE tx_size, const uint8_t *above,
                                  const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
   filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                FILTER_D117_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D117_PRED);
-#endif
 }
 
 void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
                                  TX_SIZE tx_size, const uint8_t *above,
                                  const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
   filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                FILTER_D153_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_D153_PRED);
-#endif
 }
 
 void av1_paeth_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
                                   TX_SIZE tx_size, const uint8_t *above,
                                   const uint8_t *left) {
-#if USE_3TAP_INTRA_FILTER
   filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                FILTER_PAETH_PRED);
-#else
-  filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                               FILTER_PAETH_PRED);
-#endif
 }
 
 static void filter_intra_predictors(FILTER_INTRA_MODE mode, uint8_t *dst,
@@ -1557,7 +1292,6 @@
   }
 }
 #if CONFIG_HIGHBITDEPTH
-#if USE_3TAP_INTRA_FILTER
 static void highbd_filter_intra_predictors_3tap(uint16_t *dst, ptrdiff_t stride,
                                                 TX_SIZE tx_size,
                                                 const uint16_t *above,
@@ -1595,126 +1329,47 @@
     dst += stride;
   }
 }
-#else
-static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
-                                                TX_SIZE tx_size,
-                                                const uint16_t *above,
-                                                const uint16_t *left, int mode,
-                                                int bd) {
-  int r, c;
-  int mean, ipred;
-#if CONFIG_TX64X64
-  int preds[65][129];
-#else
-  int preds[33][65];
-#endif  // CONFIG_TX64X64
-  const int c0 = filter_intra_taps_4[tx_size][mode][0];
-  const int c1 = filter_intra_taps_4[tx_size][mode][1];
-  const int c2 = filter_intra_taps_4[tx_size][mode][2];
-  const int c3 = filter_intra_taps_4[tx_size][mode][3];
-  const int bw = tx_size_wide[tx_size];
-  const int bh = tx_size_high[tx_size];
-
-  mean = 0;
-  for (r = 0; r < bh; ++r) {
-    mean += (int)left[r];
-  }
-  for (c = 0; c < bw; ++c) {
-    mean += (int)above[c];
-  }
-  mean = (mean + ((bw + bh) >> 1)) / (bw + bh);
-
-  for (r = 0; r < bh; ++r) preds[r + 1][0] = (int)left[r] - mean;
-
-  for (c = 0; c < 2 * bw + 1; ++c) preds[0][c] = (int)above[c - 1] - mean;
-
-  for (r = 1; r < bh + 1; ++r)
-    for (c = 1; c < 2 * bw + 1 - r; ++c) {
-      ipred = c0 * preds[r - 1][c] + c1 * preds[r][c - 1] +
-              c2 * preds[r - 1][c - 1] + c3 * preds[r - 1][c + 1];
-      preds[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
-      preds[r][c] = clip_pixel_highbd(preds[r][c] + mean, bd) - mean;
-    }
-
-  for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = clip_pixel_highbd(preds[r + 1][c + 1] + mean, bd);
-    }
-    dst += stride;
-  }
-}
-#endif
 
 void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                       TX_SIZE tx_size, const uint16_t *above,
                                       const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
   highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                       FILTER_DC_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_DC_PRED, bd);
-#endif
 }
 
 void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                      TX_SIZE tx_size, const uint16_t *above,
                                      const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
   highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                       FILTER_V_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_V_PRED, bd);
-#endif
 }
 
 void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                      TX_SIZE tx_size, const uint16_t *above,
                                      const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
   highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                       FILTER_H_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_H_PRED, bd);
-#endif
 }
 
 void av1_highbd_d117_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                         TX_SIZE tx_size, const uint16_t *above,
                                         const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
   highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                       FILTER_D117_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D117_PRED, bd);
-#endif
 }
 
 void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                         TX_SIZE tx_size, const uint16_t *above,
                                         const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
   highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                       FILTER_D153_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_D153_PRED, bd);
-#endif
 }
 
 void av1_highbd_paeth_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
                                          TX_SIZE tx_size, const uint16_t *above,
                                          const uint16_t *left, int bd) {
-#if USE_3TAP_INTRA_FILTER
   highbd_filter_intra_predictors_3tap(dst, stride, tx_size, above, left,
                                       FILTER_PAETH_PRED, bd);
-#else
-  highbd_filter_intra_predictors_4tap(dst, stride, tx_size, above, left,
-                                      FILTER_PAETH_PRED, bd);
-#endif
 }
 
 static void highbd_filter_intra_predictors(FILTER_INTRA_MODE mode,
diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c
deleted file mode 100644
index 038c58f..0000000
--- a/av1/common/x86/filterintra_sse4.c
+++ /dev/null
@@ -1,899 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <smmintrin.h>
-
-#include "./av1_rtcd.h"
-#include "aom_ports/mem.h"
-#include "av1/common/enums.h"
-#include "av1/common/reconintra.h"
-
-#if USE_3TAP_INTRA_FILTER
-void filterintra_sse4_3tap_dummy_func(void);
-void filterintra_sse4_3tap_dummy_func(void) {}
-#else
-
-static INLINE void AddPixelsSmall(const uint8_t *above, const uint8_t *left,
-                                  __m128i *sum) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i u0 = _mm_unpacklo_epi8(a, zero);
-  __m128i u1 = _mm_unpacklo_epi8(l, zero);
-
-  sum[0] = _mm_add_epi16(u0, u1);
-}
-
-static INLINE int GetMeanValue4x4(const uint8_t *above, const uint8_t *left,
-                                  __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-
-  AddPixelsSmall(above, left, &sum_vector);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 4;
-  sum_value >>= 3;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int GetMeanValue8x8(const uint8_t *above, const uint8_t *left,
-                                  __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-
-  AddPixelsSmall(above, left, &sum_vector);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 8;
-  sum_value >>= 4;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE void AddPixelsLarge(const uint8_t *above, const uint8_t *left,
-                                  __m128i *sum) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i u0 = _mm_unpacklo_epi8(a, zero);
-  __m128i u1 = _mm_unpacklo_epi8(l, zero);
-
-  sum[0] = _mm_add_epi16(u0, u1);
-
-  u0 = _mm_unpackhi_epi8(a, zero);
-  u1 = _mm_unpackhi_epi8(l, zero);
-
-  sum[0] = _mm_add_epi16(sum[0], u0);
-  sum[0] = _mm_add_epi16(sum[0], u1);
-}
-
-static INLINE int GetMeanValue16x16(const uint8_t *above, const uint8_t *left,
-                                    __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-
-  AddPixelsLarge(above, left, &sum_vector);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 16;
-  sum_value >>= 5;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int GetMeanValue32x32(const uint8_t *above, const uint8_t *left,
-                                    __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector[2], u;
-  uint16_t sum_value;
-
-  AddPixelsLarge(above, left, &sum_vector[0]);
-  AddPixelsLarge(above + 16, left + 16, &sum_vector[1]);
-
-  sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
-  sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 4 values
-  sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector[0], 2);
-  sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
-
-  sum_value = _mm_extract_epi16(sum_vector[0], 0);
-  sum_value += 32;
-  sum_value >>= 6;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-// Note:
-//  params[4] : mean value, 4 int32_t repetition
-//
-static INLINE int CalcRefPixelsMeanValue(const uint8_t *above,
-                                         const uint8_t *left, int bs,
-                                         __m128i *params) {
-  int meanValue = 0;
-  switch (bs) {
-    case 4: meanValue = GetMeanValue4x4(above, left, params); break;
-    case 8: meanValue = GetMeanValue8x8(above, left, params); break;
-    case 16: meanValue = GetMeanValue16x16(above, left, params); break;
-    case 32: meanValue = GetMeanValue32x32(above, left, params); break;
-    default: assert(0);
-  }
-  return meanValue;
-}
-
-// Note:
-//  params[0-3] : 4-tap filter coefficients (int32_t per coefficient)
-//
-static INLINE void GetIntraFilterParams(int bs, int mode, __m128i *params) {
-  const TX_SIZE tx_size =
-      (bs == 32) ? TX_32X32
-                 : ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
-  // c0
-  params[0] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][0],
-                            av1_filter_intra_taps_4[tx_size][mode][0],
-                            av1_filter_intra_taps_4[tx_size][mode][0],
-                            av1_filter_intra_taps_4[tx_size][mode][0]);
-  // c1
-  params[1] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][1],
-                            av1_filter_intra_taps_4[tx_size][mode][1],
-                            av1_filter_intra_taps_4[tx_size][mode][1],
-                            av1_filter_intra_taps_4[tx_size][mode][1]);
-  // c2
-  params[2] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][2],
-                            av1_filter_intra_taps_4[tx_size][mode][2],
-                            av1_filter_intra_taps_4[tx_size][mode][2],
-                            av1_filter_intra_taps_4[tx_size][mode][2]);
-  // c3
-  params[3] = _mm_set_epi32(av1_filter_intra_taps_4[tx_size][mode][3],
-                            av1_filter_intra_taps_4[tx_size][mode][3],
-                            av1_filter_intra_taps_4[tx_size][mode][3],
-                            av1_filter_intra_taps_4[tx_size][mode][3]);
-}
-
-static const int maxBlkSize = 32;
-
-static INLINE void SavePred4x4(int *pred, const __m128i *mean, uint8_t *dst,
-                               ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
-  __m128i p1 = _mm_loadu_si128((const __m128i *)(pred + predStride));
-  __m128i p2 = _mm_loadu_si128((const __m128i *)(pred + 2 * predStride));
-  __m128i p3 = _mm_loadu_si128((const __m128i *)(pred + 3 * predStride));
-
-  p0 = _mm_add_epi32(p0, mean[0]);
-  p1 = _mm_add_epi32(p1, mean[0]);
-  p2 = _mm_add_epi32(p2, mean[0]);
-  p3 = _mm_add_epi32(p3, mean[0]);
-
-  p0 = _mm_packus_epi32(p0, p1);
-  p1 = _mm_packus_epi32(p2, p3);
-  p0 = _mm_packus_epi16(p0, p1);
-
-  *((int *)dst) = _mm_cvtsi128_si32(p0);
-  p0 = _mm_srli_si128(p0, 4);
-  *((int *)(dst + stride)) = _mm_cvtsi128_si32(p0);
-  p0 = _mm_srli_si128(p0, 4);
-  *((int *)(dst + 2 * stride)) = _mm_cvtsi128_si32(p0);
-  p0 = _mm_srli_si128(p0, 4);
-  *((int *)(dst + 3 * stride)) = _mm_cvtsi128_si32(p0);
-}
-
-static void SavePred8x8(int *pred, const __m128i *mean, uint8_t *dst,
-                        ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0, p1, p2, p3;
-  int r = 0;
-
-  while (r < 8) {
-    p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-    r += 1;
-    p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-
-    p0 = _mm_add_epi32(p0, mean[0]);
-    p1 = _mm_add_epi32(p1, mean[0]);
-    p2 = _mm_add_epi32(p2, mean[0]);
-    p3 = _mm_add_epi32(p3, mean[0]);
-
-    p0 = _mm_packus_epi32(p0, p1);
-    p1 = _mm_packus_epi32(p2, p3);
-    p0 = _mm_packus_epi16(p0, p1);
-
-    _mm_storel_epi64((__m128i *)dst, p0);
-    dst += stride;
-    p0 = _mm_srli_si128(p0, 8);
-    _mm_storel_epi64((__m128i *)dst, p0);
-    dst += stride;
-    r += 1;
-  }
-}
-
-static void SavePred16x16(int *pred, const __m128i *mean, uint8_t *dst,
-                          ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0, p1, p2, p3;
-  int r = 0;
-
-  while (r < 16) {
-    p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-    p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
-    p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
-
-    p0 = _mm_add_epi32(p0, mean[0]);
-    p1 = _mm_add_epi32(p1, mean[0]);
-    p2 = _mm_add_epi32(p2, mean[0]);
-    p3 = _mm_add_epi32(p3, mean[0]);
-
-    p0 = _mm_packus_epi32(p0, p1);
-    p1 = _mm_packus_epi32(p2, p3);
-    p0 = _mm_packus_epi16(p0, p1);
-
-    _mm_storel_epi64((__m128i *)dst, p0);
-    p0 = _mm_srli_si128(p0, 8);
-    _mm_storel_epi64((__m128i *)(dst + 8), p0);
-    dst += stride;
-    r += 1;
-  }
-}
-
-static void SavePred32x32(int *pred, const __m128i *mean, uint8_t *dst,
-                          ptrdiff_t stride) {
-  const int predStride = (maxBlkSize << 1) + 1;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-  int r = 0;
-
-  while (r < 32) {
-    p0 = _mm_loadu_si128((const __m128i *)(pred + r * predStride));
-    p1 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 4));
-    p2 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 8));
-    p3 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 12));
-
-    p4 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 16));
-    p5 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 20));
-    p6 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 24));
-    p7 = _mm_loadu_si128((const __m128i *)(pred + r * predStride + 28));
-
-    p0 = _mm_add_epi32(p0, mean[0]);
-    p1 = _mm_add_epi32(p1, mean[0]);
-    p2 = _mm_add_epi32(p2, mean[0]);
-    p3 = _mm_add_epi32(p3, mean[0]);
-
-    p4 = _mm_add_epi32(p4, mean[0]);
-    p5 = _mm_add_epi32(p5, mean[0]);
-    p6 = _mm_add_epi32(p6, mean[0]);
-    p7 = _mm_add_epi32(p7, mean[0]);
-
-    p0 = _mm_packus_epi32(p0, p1);
-    p1 = _mm_packus_epi32(p2, p3);
-    p0 = _mm_packus_epi16(p0, p1);
-
-    p4 = _mm_packus_epi32(p4, p5);
-    p5 = _mm_packus_epi32(p6, p7);
-    p4 = _mm_packus_epi16(p4, p5);
-
-    _mm_storel_epi64((__m128i *)dst, p0);
-    p0 = _mm_srli_si128(p0, 8);
-    _mm_storel_epi64((__m128i *)(dst + 8), p0);
-
-    _mm_storel_epi64((__m128i *)(dst + 16), p4);
-    p4 = _mm_srli_si128(p4, 8);
-    _mm_storel_epi64((__m128i *)(dst + 24), p4);
-
-    dst += stride;
-    r += 1;
-  }
-}
-
-static void SavePrediction(int *pred, const __m128i *mean, int bs, uint8_t *dst,
-                           ptrdiff_t stride) {
-  switch (bs) {
-    case 4: SavePred4x4(pred, mean, dst, stride); break;
-    case 8: SavePred8x8(pred, mean, dst, stride); break;
-    case 16: SavePred16x16(pred, mean, dst, stride); break;
-    case 32: SavePred32x32(pred, mean, dst, stride); break;
-    default: assert(0);
-  }
-}
-
-typedef void (*ProducePixelsFunc)(__m128i *p, const __m128i *prm, int *pred,
-                                  const int predStride);
-
-static void ProduceFourPixels(__m128i *p, const __m128i *prm, int *pred,
-                              const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-
-  sum = _mm_extract_epi32(u0, 1);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 2) = x;
-
-  sum = _mm_extract_epi32(u0, 2);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 3) = x;
-
-  sum = _mm_extract_epi32(u0, 3);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 4) = x;
-}
-
-static void ProduceThreePixels(__m128i *p, const __m128i *prm, int *pred,
-                               const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-
-  sum = _mm_extract_epi32(u0, 1);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 2) = x;
-
-  sum = _mm_extract_epi32(u0, 2);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 3) = x;
-}
-
-static void ProduceTwoPixels(__m128i *p, const __m128i *prm, int *pred,
-                             const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-
-  sum = _mm_extract_epi32(u0, 1);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 2) = x;
-}
-
-static void ProduceOnePixels(__m128i *p, const __m128i *prm, int *pred,
-                             const int predStride) {
-  __m128i u0, u1, u2;
-  int c0 = _mm_extract_epi32(prm[1], 0);
-  int x = *(pred + predStride);
-  int sum;
-
-  u0 = _mm_mullo_epi32(p[0], prm[2]);
-  u1 = _mm_mullo_epi32(p[1], prm[0]);
-  u2 = _mm_mullo_epi32(p[2], prm[3]);
-
-  u0 = _mm_add_epi32(u0, u1);
-  u0 = _mm_add_epi32(u0, u2);
-
-  sum = _mm_extract_epi32(u0, 0);
-  sum += c0 * x;
-  x = ROUND_POWER_OF_TWO_SIGNED(sum, FILTER_INTRA_PREC_BITS);
-  *(pred + predStride + 1) = x;
-}
-
-static ProducePixelsFunc prodPixelsFuncTab[4] = {
-  ProduceOnePixels, ProduceTwoPixels, ProduceThreePixels, ProduceFourPixels
-};
-
-static void ProducePixels(int *pred, const __m128i *prm, int remain) {
-  __m128i p[3];
-  const int predStride = (maxBlkSize << 1) + 1;
-  int index;
-
-  p[0] = _mm_loadu_si128((const __m128i *)pred);
-  p[1] = _mm_loadu_si128((const __m128i *)(pred + 1));
-  p[2] = _mm_loadu_si128((const __m128i *)(pred + 2));
-
-  if (remain <= 2) {
-    return;
-  }
-  if (remain > 5) {
-    index = 3;
-  } else {
-    index = remain - 3;
-  }
-  prodPixelsFuncTab[index](p, prm, pred, predStride);
-}
-
-// Note:
-//  At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
-//  the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
-static void GeneratePrediction(const uint8_t *above, const uint8_t *left,
-                               const int bs, const __m128i *prm, int meanValue,
-                               uint8_t *dst, ptrdiff_t stride) {
-  int pred[33][65];
-  int r, c, colBound;
-  int remainings;
-
-  for (r = 0; r < bs; ++r) {
-    pred[r + 1][0] = (int)left[r] - meanValue;
-  }
-
-  above -= 1;
-  for (c = 0; c < 2 * bs + 1; ++c) {
-    pred[0][c] = (int)above[c] - meanValue;
-  }
-
-  r = 0;
-  c = 0;
-  while (r < bs) {
-    colBound = (bs << 1) - r;
-    for (c = 0; c < colBound; c += 4) {
-      remainings = colBound - c + 1;
-      ProducePixels(&pred[r][c], prm, remainings);
-    }
-    r += 1;
-  }
-
-  SavePrediction(&pred[1][1], &prm[4], bs, dst, stride);
-}
-
-static void FilterPrediction(const uint8_t *above, const uint8_t *left, int bs,
-                             __m128i *prm, uint8_t *dst, ptrdiff_t stride) {
-  int meanValue = 0;
-  meanValue = CalcRefPixelsMeanValue(above, left, bs, &prm[4]);
-  GeneratePrediction(above, left, bs, prm, meanValue, dst, stride);
-}
-
-void av1_dc_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                    const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, DC_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_v_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, V_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_h_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                   const uint8_t *above, const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, H_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d45_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D45_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d135_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D135_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d117_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D117_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d153_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D153_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d207_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                      const uint8_t *above,
-                                      const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D207_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_d63_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                     const uint8_t *above,
-                                     const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D63_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-void av1_paeth_filter_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride, int bs,
-                                       const uint8_t *above,
-                                       const uint8_t *left) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, PAETH_PRED, &prm[0]);
-  FilterPrediction(above, left, bs, prm, dst, stride);
-}
-
-// ============== High Bit Depth ==============
-#if CONFIG_HIGHBITDEPTH
-static INLINE int HighbdGetMeanValue4x4(const uint16_t *above,
-                                        const uint16_t *left, const int bd,
-                                        __m128i *params) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-  (void)bd;
-
-  sum_vector = _mm_add_epi16(a, l);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 4;
-  sum_value >>= 3;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int HighbdGetMeanValue8x8(const uint16_t *above,
-                                        const uint16_t *left, const int bd,
-                                        __m128i *params) {
-  const __m128i a = _mm_loadu_si128((const __m128i *)above);
-  const __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint16_t sum_value;
-  (void)bd;
-
-  sum_vector = _mm_add_epi16(a, l);
-
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-  sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-  u = _mm_srli_si128(sum_vector, 2);
-  sum_vector = _mm_add_epi16(sum_vector, u);
-
-  sum_value = _mm_extract_epi16(sum_vector, 0);
-  sum_value += 8;
-  sum_value >>= 4;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-// Note:
-//  Process 16 pixels above and left, 10-bit depth
-//  Add to the last 8 pixels sum
-static INLINE void AddPixels10bit(const uint16_t *above, const uint16_t *left,
-                                  __m128i *sum) {
-  __m128i a = _mm_loadu_si128((const __m128i *)above);
-  __m128i l = _mm_loadu_si128((const __m128i *)left);
-  sum[0] = _mm_add_epi16(a, l);
-  a = _mm_loadu_si128((const __m128i *)(above + 8));
-  l = _mm_loadu_si128((const __m128i *)(left + 8));
-  sum[0] = _mm_add_epi16(sum[0], a);
-  sum[0] = _mm_add_epi16(sum[0], l);
-}
-
-// Note:
-//  Process 16 pixels above and left, 12-bit depth
-//  Add to the last 8 pixels sum
-static INLINE void AddPixels12bit(const uint16_t *above, const uint16_t *left,
-                                  __m128i *sum) {
-  __m128i a = _mm_loadu_si128((const __m128i *)above);
-  __m128i l = _mm_loadu_si128((const __m128i *)left);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i v0, v1;
-
-  v0 = _mm_unpacklo_epi16(a, zero);
-  v1 = _mm_unpacklo_epi16(l, zero);
-  sum[0] = _mm_add_epi32(v0, v1);
-
-  v0 = _mm_unpackhi_epi16(a, zero);
-  v1 = _mm_unpackhi_epi16(l, zero);
-  sum[0] = _mm_add_epi32(sum[0], v0);
-  sum[0] = _mm_add_epi32(sum[0], v1);
-
-  a = _mm_loadu_si128((const __m128i *)(above + 8));
-  l = _mm_loadu_si128((const __m128i *)(left + 8));
-
-  v0 = _mm_unpacklo_epi16(a, zero);
-  v1 = _mm_unpacklo_epi16(l, zero);
-  sum[0] = _mm_add_epi32(sum[0], v0);
-  sum[0] = _mm_add_epi32(sum[0], v1);
-
-  v0 = _mm_unpackhi_epi16(a, zero);
-  v1 = _mm_unpackhi_epi16(l, zero);
-  sum[0] = _mm_add_epi32(sum[0], v0);
-  sum[0] = _mm_add_epi32(sum[0], v1);
-}
-
-static INLINE int HighbdGetMeanValue16x16(const uint16_t *above,
-                                          const uint16_t *left, const int bd,
-                                          __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector, u;
-  uint32_t sum_value = 0;
-
-  if (10 == bd) {
-    AddPixels10bit(above, left, &sum_vector);
-    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 4 values
-    sum_vector = _mm_hadd_epi16(sum_vector, zero);  // still has 2 values
-
-    u = _mm_srli_si128(sum_vector, 2);
-    sum_vector = _mm_add_epi16(sum_vector, u);
-    sum_value = _mm_extract_epi16(sum_vector, 0);
-  } else if (12 == bd) {
-    AddPixels12bit(above, left, &sum_vector);
-
-    sum_vector = _mm_hadd_epi32(sum_vector, zero);
-    u = _mm_srli_si128(sum_vector, 4);
-    sum_vector = _mm_add_epi32(u, sum_vector);
-    sum_value = _mm_extract_epi32(sum_vector, 0);
-  }
-
-  sum_value += 16;
-  sum_value >>= 5;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-static INLINE int HighbdGetMeanValue32x32(const uint16_t *above,
-                                          const uint16_t *left, const int bd,
-                                          __m128i *params) {
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sum_vector[2], u;
-  uint32_t sum_value = 0;
-
-  if (10 == bd) {
-    AddPixels10bit(above, left, &sum_vector[0]);
-    AddPixels10bit(above + 16, left + 16, &sum_vector[1]);
-
-    sum_vector[0] = _mm_add_epi16(sum_vector[0], sum_vector[1]);
-    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 4 values
-    sum_vector[0] = _mm_hadd_epi16(sum_vector[0], zero);  // still has 2 values
-
-    u = _mm_srli_si128(sum_vector[0], 2);
-    sum_vector[0] = _mm_add_epi16(sum_vector[0], u);
-    sum_value = _mm_extract_epi16(sum_vector[0], 0);
-  } else if (12 == bd) {
-    AddPixels12bit(above, left, &sum_vector[0]);
-    AddPixels12bit(above + 16, left + 16, &sum_vector[1]);
-
-    sum_vector[0] = _mm_add_epi32(sum_vector[0], sum_vector[1]);
-    sum_vector[0] = _mm_hadd_epi32(sum_vector[0], zero);
-    u = _mm_srli_si128(sum_vector[0], 4);
-    sum_vector[0] = _mm_add_epi32(u, sum_vector[0]);
-    sum_value = _mm_extract_epi32(sum_vector[0], 0);
-  }
-
-  sum_value += 32;
-  sum_value >>= 6;
-  *params = _mm_set1_epi32(sum_value);
-  return sum_value;
-}
-
-// Note:
-//  params[4] : mean value, 4 int32_t repetition
-//
-static INLINE int HighbdCalcRefPixelsMeanValue(const uint16_t *above,
-                                               const uint16_t *left, int bs,
-                                               const int bd, __m128i *params) {
-  int meanValue = 0;
-  switch (bs) {
-    case 4: meanValue = HighbdGetMeanValue4x4(above, left, bd, params); break;
-    case 8: meanValue = HighbdGetMeanValue8x8(above, left, bd, params); break;
-    case 16:
-      meanValue = HighbdGetMeanValue16x16(above, left, bd, params);
-      break;
-    case 32:
-      meanValue = HighbdGetMeanValue32x32(above, left, bd, params);
-      break;
-    default: assert(0);
-  }
-  return meanValue;
-}
-
-// Note:
-//  At column index c, the remaining pixels are R = 2 * bs + 1 - r - c
-//  the number of pixels to produce is R - 2 = 2 * bs - r - c - 1
-static void HighbdGeneratePrediction(const uint16_t *above,
-                                     const uint16_t *left, const int bs,
-                                     const int bd, const __m128i *prm,
-                                     int meanValue, uint16_t *dst,
-                                     ptrdiff_t stride) {
-  int pred[33][65];
-  int r, c, colBound;
-  int remainings;
-  int ipred;
-
-  for (r = 0; r < bs; ++r) {
-    pred[r + 1][0] = (int)left[r] - meanValue;
-  }
-
-  above -= 1;
-  for (c = 0; c < 2 * bs + 1; ++c) {
-    pred[0][c] = (int)above[c] - meanValue;
-  }
-
-  r = 0;
-  c = 0;
-  while (r < bs) {
-    colBound = (bs << 1) - r;
-    for (c = 0; c < colBound; c += 4) {
-      remainings = colBound - c + 1;
-      ProducePixels(&pred[r][c], prm, remainings);
-    }
-    r += 1;
-  }
-
-  for (r = 0; r < bs; ++r) {
-    for (c = 0; c < bs; ++c) {
-      ipred = pred[r + 1][c + 1] + meanValue;
-      dst[c] = clip_pixel_highbd(ipred, bd);
-    }
-    dst += stride;
-  }
-}
-
-static void HighbdFilterPrediction(const uint16_t *above, const uint16_t *left,
-                                   int bs, const int bd, __m128i *prm,
-                                   uint16_t *dst, ptrdiff_t stride) {
-  int meanValue = 0;
-  meanValue = HighbdCalcRefPixelsMeanValue(above, left, bs, bd, &prm[4]);
-  HighbdGeneratePrediction(above, left, bs, bd, prm, meanValue, dst, stride);
-}
-
-void av1_highbd_dc_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                           int bs, const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, DC_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_v_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                          int bs, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, V_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_h_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                          int bs, const uint16_t *above,
-                                          const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, H_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d45_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                            int bs, const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D45_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d135_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D135_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d117_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D117_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d153_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D153_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d207_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                             int bs, const uint16_t *above,
-                                             const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D207_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_d63_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                            int bs, const uint16_t *above,
-                                            const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, D63_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-
-void av1_highbd_paeth_filter_predictor_sse4_1(uint16_t *dst, ptrdiff_t stride,
-                                              int bs, const uint16_t *above,
-                                              const uint16_t *left, int bd) {
-  __m128i prm[5];
-  GetIntraFilterParams(bs, PAETH_PRED, &prm[0]);
-  HighbdFilterPrediction(above, left, bs, bd, prm, dst, stride);
-}
-#endif  // CONFIG_HIGHBITDEPTH
-
-#endif  // USE_3TAP_INTRA_FILTER
diff --git a/test/filterintra_predictors_test.cc b/test/filterintra_predictors_test.cc
deleted file mode 100644
index 671d371..0000000
--- a/test/filterintra_predictors_test.cc
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "./av1_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "test/util.h"
-#include "av1/common/enums.h"
-
-namespace {
-
-using std::tr1::tuple;
-using libaom_test::ACMRandom;
-
-typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, int bs,
-                          const uint8_t *above, const uint8_t *left);
-
-// Note:
-//  Test parameter list:
-//  Reference predictor, optimized predictor, prediction mode, block size
-//
-typedef tuple<Predictor, Predictor, int> PredFuncMode;
-typedef tuple<PredFuncMode, int> PredParams;
-
-#if CONFIG_HIGHBITDEPTH
-typedef void (*HbdPredictor)(uint16_t *dst, ptrdiff_t stride, int bs,
-                             const uint16_t *above, const uint16_t *left,
-                             int bd);
-
-// Note:
-//  Test parameter list:
-//  Reference predictor, optimized predictor, prediction mode, block size,
-//  bit depth
-//
-typedef tuple<HbdPredictor, HbdPredictor, int> HbdPredFuncMode;
-typedef tuple<HbdPredFuncMode, int, int> HbdPredParams;
-#endif
-
-const int MaxBlkSize = 32;
-
-// By default, disable speed test
-#define PREDICTORS_SPEED_TEST (0)
-
-#if PREDICTORS_SPEED_TEST
-const int MaxTestNum = 100000;
-#else
-const int MaxTestNum = 100;
-#endif
-
-class AV1FilterIntraPredOptimzTest
-    : public ::testing::TestWithParam<PredParams> {
- public:
-  virtual ~AV1FilterIntraPredOptimzTest() {}
-  virtual void SetUp() {
-    PredFuncMode funcMode = GET_PARAM(0);
-    predFuncRef_ = std::tr1::get<0>(funcMode);
-    predFunc_ = std::tr1::get<1>(funcMode);
-    mode_ = std::tr1::get<2>(funcMode);
-    blockSize_ = GET_PARAM(1);
-
-    alloc_ = new uint8_t[3 * MaxBlkSize + 2];
-    predRef_ = new uint8_t[MaxBlkSize * MaxBlkSize];
-    pred_ = new uint8_t[MaxBlkSize * MaxBlkSize];
-  }
-
-  virtual void TearDown() {
-    delete[] alloc_;
-    delete[] predRef_;
-    delete[] pred_;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunTest() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint8_t *left = alloc_;
-    uint8_t *above = alloc_ + MaxBlkSize + 1;
-    while (tstIndex < MaxTestNum) {
-      PrepareBuffer();
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
-      ASM_REGISTER_STATE_CHECK(
-          predFunc_(pred_, stride, blockSize_, &above[1], left));
-      DiffPred(tstIndex);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestC() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint8_t *left = alloc_;
-    uint8_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestSSE() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint8_t *left = alloc_;
-    uint8_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFunc_(predRef_, stride, blockSize_, &above[1], left);
-      tstIndex += 1;
-    }
-  }
-
- private:
-  void PrepareBuffer() const {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    int i = 0;
-    while (i < (3 * MaxBlkSize + 2)) {
-      alloc_[i] = rnd.Rand8();
-      i += 1;
-    }
-  }
-
-  void DiffPred(int testNum) const {
-    int i = 0;
-    while (i < blockSize_ * blockSize_) {
-      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
-                                       << "Block size: " << blockSize_ << " "
-                                       << "Test number: " << testNum;
-      i += 1;
-    }
-  }
-
-  Predictor predFunc_;
-  Predictor predFuncRef_;
-  int mode_;
-  int blockSize_;
-  uint8_t *alloc_;
-  uint8_t *pred_;
-  uint8_t *predRef_;
-};
-
-#if CONFIG_HIGHBITDEPTH
-class AV1HbdFilterIntraPredOptimzTest
-    : public ::testing::TestWithParam<HbdPredParams> {
- public:
-  virtual ~AV1HbdFilterIntraPredOptimzTest() {}
-  virtual void SetUp() {
-    HbdPredFuncMode funcMode = GET_PARAM(0);
-    predFuncRef_ = std::tr1::get<0>(funcMode);
-    predFunc_ = std::tr1::get<1>(funcMode);
-    mode_ = std::tr1::get<2>(funcMode);
-    blockSize_ = GET_PARAM(1);
-    bd_ = GET_PARAM(2);
-
-    alloc_ = new uint16_t[3 * MaxBlkSize + 2];
-    predRef_ = new uint16_t[MaxBlkSize * MaxBlkSize];
-    pred_ = new uint16_t[MaxBlkSize * MaxBlkSize];
-  }
-
-  virtual void TearDown() {
-    delete[] alloc_;
-    delete[] predRef_;
-    delete[] pred_;
-    libaom_test::ClearSystemState();
-  }
-
- protected:
-  void RunTest() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint16_t *left = alloc_;
-    uint16_t *above = alloc_ + MaxBlkSize + 1;
-    while (tstIndex < MaxTestNum) {
-      PrepareBuffer();
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
-      ASM_REGISTER_STATE_CHECK(
-          predFunc_(pred_, stride, blockSize_, &above[1], left, bd_));
-      DiffPred(tstIndex);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestC() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint16_t *left = alloc_;
-    uint16_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFuncRef_(predRef_, stride, blockSize_, &above[1], left, bd_);
-      tstIndex += 1;
-    }
-  }
-
-  void RunSpeedTestSSE() const {
-    int tstIndex = 0;
-    int stride = blockSize_;
-    uint16_t *left = alloc_;
-    uint16_t *above = alloc_ + MaxBlkSize + 1;
-    PrepareBuffer();
-    while (tstIndex < MaxTestNum) {
-      predFunc_(predRef_, stride, blockSize_, &above[1], left, bd_);
-      tstIndex += 1;
-    }
-  }
-
- private:
-  void PrepareBuffer() const {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    int i = 0;
-    while (i < (3 * MaxBlkSize + 2)) {
-      alloc_[i] = rnd.Rand16() & ((1 << bd_) - 1);
-      i += 1;
-    }
-  }
-
-  void DiffPred(int testNum) const {
-    int i = 0;
-    while (i < blockSize_ * blockSize_) {
-      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
-                                       << "Block size: " << blockSize_ << " "
-                                       << "Bit depth: " << bd_ << " "
-                                       << "Test number: " << testNum;
-      i += 1;
-    }
-  }
-
-  HbdPredictor predFunc_;
-  HbdPredictor predFuncRef_;
-  int mode_;
-  int blockSize_;
-  int bd_;
-  uint16_t *alloc_;
-  uint16_t *pred_;
-  uint16_t *predRef_;
-};
-#endif  // CONFIG_HIGHBITDEPTH
-
-TEST_P(AV1FilterIntraPredOptimzTest, BitExactCheck) { RunTest(); }
-
-#if PREDICTORS_SPEED_TEST
-TEST_P(AV1FilterIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
-
-TEST_P(AV1FilterIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
-#endif
-
-#if CONFIG_HIGHBITDEPTH
-TEST_P(AV1HbdFilterIntraPredOptimzTest, BitExactCheck) { RunTest(); }
-
-#if PREDICTORS_SPEED_TEST
-TEST_P(AV1HbdFilterIntraPredOptimzTest, SpeedCheckC) { RunSpeedTestC(); }
-
-TEST_P(AV1HbdFilterIntraPredOptimzTest, SpeedCheckSSE) { RunSpeedTestSSE(); }
-#endif  // PREDICTORS_SPEED_TEST
-#endif  // CONFIG_HIGHBITDEPTH
-
-using std::tr1::make_tuple;
-
-const PredFuncMode kPredFuncMdArray[] = {
-  make_tuple(av1_dc_filter_predictor_c, av1_dc_filter_predictor_sse4_1,
-             DC_PRED),
-  make_tuple(av1_v_filter_predictor_c, av1_v_filter_predictor_sse4_1, V_PRED),
-  make_tuple(av1_h_filter_predictor_c, av1_h_filter_predictor_sse4_1, H_PRED),
-  make_tuple(av1_d45_filter_predictor_c, av1_d45_filter_predictor_sse4_1,
-             D45_PRED),
-  make_tuple(av1_d135_filter_predictor_c, av1_d135_filter_predictor_sse4_1,
-             D135_PRED),
-  make_tuple(av1_d117_filter_predictor_c, av1_d117_filter_predictor_sse4_1,
-             D117_PRED),
-  make_tuple(av1_d153_filter_predictor_c, av1_d153_filter_predictor_sse4_1,
-             D153_PRED),
-  make_tuple(av1_d207_filter_predictor_c, av1_d207_filter_predictor_sse4_1,
-             D207_PRED),
-  make_tuple(av1_d63_filter_predictor_c, av1_d63_filter_predictor_sse4_1,
-             D63_PRED),
-  make_tuple(av1_paeth_filter_predictor_c, av1_paeth_filter_predictor_sse4_1,
-             PAETH_PRED),
-};
-
-const int kBlkSize[] = { 4, 8, 16, 32 };
-
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, AV1FilterIntraPredOptimzTest,
-    ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
-                       ::testing::ValuesIn(kBlkSize)));
-
-#if CONFIG_HIGHBITDEPTH
-const HbdPredFuncMode kHbdPredFuncMdArray[] = {
-  make_tuple(av1_highbd_dc_filter_predictor_c,
-             av1_highbd_dc_filter_predictor_sse4_1, DC_PRED),
-  make_tuple(av1_highbd_v_filter_predictor_c,
-             av1_highbd_v_filter_predictor_sse4_1, V_PRED),
-  make_tuple(av1_highbd_h_filter_predictor_c,
-             av1_highbd_h_filter_predictor_sse4_1, H_PRED),
-  make_tuple(av1_highbd_d45_filter_predictor_c,
-             av1_highbd_d45_filter_predictor_sse4_1, D45_PRED),
-  make_tuple(av1_highbd_d135_filter_predictor_c,
-             av1_highbd_d135_filter_predictor_sse4_1, D135_PRED),
-  make_tuple(av1_highbd_d117_filter_predictor_c,
-             av1_highbd_d117_filter_predictor_sse4_1, D117_PRED),
-  make_tuple(av1_highbd_d153_filter_predictor_c,
-             av1_highbd_d153_filter_predictor_sse4_1, D153_PRED),
-  make_tuple(av1_highbd_d207_filter_predictor_c,
-             av1_highbd_d207_filter_predictor_sse4_1, D207_PRED),
-  make_tuple(av1_highbd_d63_filter_predictor_c,
-             av1_highbd_d63_filter_predictor_sse4_1, D63_PRED),
-  make_tuple(av1_highbd_paeth_filter_predictor_c,
-             av1_highbd_paeth_filter_predictor_sse4_1, PAETH_PRED),
-};
-
-const int kBd[] = { 10, 12 };
-
-INSTANTIATE_TEST_CASE_P(
-    SSE4_1, AV1HbdFilterIntraPredOptimzTest,
-    ::testing::Combine(::testing::ValuesIn(kHbdPredFuncMdArray),
-                       ::testing::ValuesIn(kBlkSize),
-                       ::testing::ValuesIn(kBd)));
-#endif  // CONFIG_HIGHBITDEPTH
-
-}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 693f778..e9b2d6a 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -151,16 +151,6 @@
       endif ()
     endif ()
 
-    # Omit 4-tap filter intra predictor test-- currently a 3-tap filter is in
-    # use.
-    #if (CONFIG_FILTER_INTRA)
-    #  if (HAVE_SSE4_1)
-    #    set(AOM_UNIT_TEST_COMMON_SOURCES
-    #        ${AOM_UNIT_TEST_COMMON_SOURCES}
-    #        "${AOM_ROOT}/test/filterintra_predictors_test.cc")
-    #  endif ()
-    #endif ()
-
     if (CONFIG_INTRABC)
         set(AOM_UNIT_TEST_COMMON_SOURCES
             ${AOM_UNIT_TEST_COMMON_SOURCES}
diff --git a/test/test.mk b/test/test.mk
index 9b9d170..f315d96 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -197,12 +197,6 @@
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += masked_sad_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += av1_wedge_utils_test.cc
 
-## Skip the unit test written for 4-tap filter intra predictor, because we
-## revert to 3-tap filter.
-## ifeq ($(CONFIG_FILTER_INTRA),yes)
-## LIBAOM_TEST_SRCS-$(HAVE_SSE4_1) += filterintra_predictors_test.cc
-## endif
-
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += obmc_sad_test.cc
 LIBAOM_TEST_SRCS-$(CONFIG_AV1_ENCODER) += obmc_variance_test.cc