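Add SSE4.1 implementation of lbd filter_intra modes

Speed up the low-bitdepth (lbd) filter_intra_predictor() by 2.65x.
The per-mode predictor wrappers are replaced by a single
av1_filter_intra_predictor() that takes the mode as an argument.
Unit tests comparing the C and SSE4.1 versions are also added.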

Change-Id: I675fc07b0ec369aa3884bd8ebcd83d5c0f00e9cd
diff --git a/aom_dsp/x86/synonyms.h b/aom_dsp/x86/synonyms.h
index cd049a4..a89f8b3 100644
--- a/aom_dsp/x86/synonyms.h
+++ b/aom_dsp/x86/synonyms.h
@@ -89,4 +89,12 @@
   return _mm_srai_epi32(v_tmp_d, bits);
 }
 
+// Per 16-bit lane: (v_val_d + half + sign) >> bits, half = (1 << bits) >> 1.
+static INLINE __m128i xx_roundn_epi16(__m128i v_val_d, int bits) {
+  const __m128i half_w = _mm_set1_epi16((1 << bits) >> 1);
+  const __m128i neg_w = _mm_srai_epi16(v_val_d, 15);  // -1 if lane < 0
+  const __m128i biased_w = _mm_add_epi16(v_val_d, half_w);
+  return _mm_srai_epi16(_mm_add_epi16(biased_w, neg_w), bits);
+}
+
 #endif  // AOM_DSP_X86_SYNONYMS_H_
diff --git a/av1/av1.cmake b/av1/av1.cmake
index f0f1309..a740547 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -411,6 +411,12 @@
       "${AOM_ROOT}/av1/common/x86/intra_edge_sse4.c")
 endif ()
 
+if (CONFIG_FILTER_INTRA)  # build the SSE4.1 filter-intra predictor
+  set(AOM_AV1_COMMON_INTRIN_SSE4_1
+      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+      "${AOM_ROOT}/av1/common/x86/filterintra_sse4.c")
+endif ()
+
 set(AOM_AV1_COMMON_SOURCES
     ${AOM_AV1_COMMON_SOURCES}
     "${AOM_ROOT}/av1/common/warped_motion.c"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 53c38b3..5f11b07 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -168,20 +168,8 @@
 
 # FILTER_INTRA predictor functions
 if (aom_config("CONFIG_FILTER_INTRA") eq "yes") {
-  add_proto qw/void av1_dc_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_v_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_h_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d117_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_d153_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-  add_proto qw/void av1_paeth_filter_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left";
-
-  # High bitdepth functions
-  add_proto qw/void av1_highbd_dc_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-  add_proto qw/void av1_highbd_v_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-  add_proto qw/void av1_highbd_h_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-  add_proto qw/void av1_highbd_d117_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-  add_proto qw/void av1_highbd_d153_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
-  add_proto qw/void av1_highbd_paeth_filter_predictor/, "uint16_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint16_t *above, const uint16_t *left, int bd";
+  add_proto qw/void av1_filter_intra_predictor/, "uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size, const uint8_t *above, const uint8_t *left, int mode";
+  specialize qw/av1_filter_intra_predictor sse4_1/;
 }
 
 # High bitdepth functions
diff --git a/av1/common/reconintra.c b/av1/common/reconintra.c
index a16877f..d4dd816 100644
--- a/av1/common/reconintra.c
+++ b/av1/common/reconintra.c
@@ -1090,64 +1090,65 @@
 }
 
 #if CONFIG_FILTER_INTRA
-static int filter_intra_taps_4x2procunit[FILTER_INTRA_MODES][8][7] = {
+DECLARE_ALIGNED(16, const int8_t,
+                av1_filter_intra_taps[FILTER_INTRA_MODES][8][8]) = {
   {
-      { -6, 10, 0, 0, 0, 12, 0 },
-      { -5, 2, 10, 0, 0, 9, 0 },
-      { -3, 1, 1, 10, 0, 7, 0 },
-      { -3, 1, 1, 2, 10, 5, 0 },
-      { -4, 6, 0, 0, 0, 2, 12 },
-      { -3, 2, 6, 0, 0, 2, 9 },
-      { -3, 2, 2, 6, 0, 2, 7 },
-      { -3, 1, 2, 2, 6, 3, 5 },
+      { -6, 10, 0, 0, 0, 12, 0, 0 },  // k = 0: taps for p0..p6, then pad
+      { -5, 2, 10, 0, 0, 9, 0, 0 },
+      { -3, 1, 1, 10, 0, 7, 0, 0 },
+      { -3, 1, 1, 2, 10, 5, 0, 0 },
+      { -4, 6, 0, 0, 0, 2, 12, 0 },  // k = 4: second row of the 4x2 unit
+      { -3, 2, 6, 0, 0, 2, 9, 0 },
+      { -3, 2, 2, 6, 0, 2, 7, 0 },
+      { -3, 1, 2, 2, 6, 3, 5, 0 },
   },
   {
-      { -10, 16, 0, 0, 0, 10, 0 },
-      { -6, 0, 16, 0, 0, 6, 0 },
-      { -4, 0, 0, 16, 0, 4, 0 },
-      { -2, 0, 0, 0, 16, 2, 0 },
-      { -10, 16, 0, 0, 0, 0, 10 },
-      { -6, 0, 16, 0, 0, 0, 6 },
-      { -4, 0, 0, 16, 0, 0, 4 },
-      { -2, 0, 0, 0, 16, 0, 2 },
+      { -10, 16, 0, 0, 0, 10, 0, 0 },
+      { -6, 0, 16, 0, 0, 6, 0, 0 },
+      { -4, 0, 0, 16, 0, 4, 0, 0 },
+      { -2, 0, 0, 0, 16, 2, 0, 0 },
+      { -10, 16, 0, 0, 0, 0, 10, 0 },
+      { -6, 0, 16, 0, 0, 0, 6, 0 },
+      { -4, 0, 0, 16, 0, 0, 4, 0 },
+      { -2, 0, 0, 0, 16, 0, 2, 0 },
   },
   {
-      { -8, 8, 0, 0, 0, 16, 0 },
-      { -8, 0, 8, 0, 0, 16, 0 },
-      { -8, 0, 0, 8, 0, 16, 0 },
-      { -8, 0, 0, 0, 8, 16, 0 },
-      { -4, 4, 0, 0, 0, 0, 16 },
-      { -4, 0, 4, 0, 0, 0, 16 },
-      { -4, 0, 0, 4, 0, 0, 16 },
-      { -4, 0, 0, 0, 4, 0, 16 },
+      { -8, 8, 0, 0, 0, 16, 0, 0 },
+      { -8, 0, 8, 0, 0, 16, 0, 0 },
+      { -8, 0, 0, 8, 0, 16, 0, 0 },
+      { -8, 0, 0, 0, 8, 16, 0, 0 },
+      { -4, 4, 0, 0, 0, 0, 16, 0 },
+      { -4, 0, 4, 0, 0, 0, 16, 0 },
+      { -4, 0, 0, 4, 0, 0, 16, 0 },
+      { -4, 0, 0, 0, 4, 0, 16, 0 },
   },
   {
-      { -2, 8, 0, 0, 0, 10, 0 },
-      { -1, 3, 8, 0, 0, 6, 0 },
-      { -1, 2, 3, 8, 0, 4, 0 },
-      { 0, 1, 2, 3, 8, 2, 0 },
-      { -1, 4, 0, 0, 0, 3, 10 },
-      { -1, 3, 4, 0, 0, 4, 6 },
-      { -1, 2, 3, 4, 0, 4, 4 },
-      { -1, 2, 2, 3, 4, 3, 3 },
+      { -2, 8, 0, 0, 0, 10, 0, 0 },
+      { -1, 3, 8, 0, 0, 6, 0, 0 },
+      { -1, 2, 3, 8, 0, 4, 0, 0 },
+      { 0, 1, 2, 3, 8, 2, 0, 0 },
+      { -1, 4, 0, 0, 0, 3, 10, 0 },
+      { -1, 3, 4, 0, 0, 4, 6, 0 },
+      { -1, 2, 3, 4, 0, 4, 4, 0 },
+      { -1, 2, 2, 3, 4, 3, 3, 0 },
   },
   {
-      { -12, 14, 0, 0, 0, 14, 0 },
-      { -10, 0, 14, 0, 0, 12, 0 },
-      { -9, 0, 0, 14, 0, 11, 0 },
-      { -8, 0, 0, 0, 14, 10, 0 },
-      { -10, 12, 0, 0, 0, 0, 14 },
-      { -9, 1, 12, 0, 0, 0, 12 },
-      { -8, 0, 0, 12, 0, 1, 11 },
-      { -7, 0, 0, 1, 12, 1, 9 },
+      { -12, 14, 0, 0, 0, 14, 0, 0 },
+      { -10, 0, 14, 0, 0, 12, 0, 0 },
+      { -9, 0, 0, 14, 0, 11, 0, 0 },
+      { -8, 0, 0, 0, 14, 10, 0, 0 },
+      { -10, 12, 0, 0, 0, 0, 14, 0 },
+      { -9, 1, 12, 0, 0, 0, 12, 0 },
+      { -8, 0, 0, 12, 0, 1, 11, 0 },
+      { -7, 0, 0, 1, 12, 1, 9, 0 },
   },
 };
 
-static void filter_intra_predictor(uint8_t *dst, ptrdiff_t stride,
-                                   TX_SIZE tx_size, const uint8_t *above,
-                                   const uint8_t *left, int mode) {
+void av1_filter_intra_predictor_c(uint8_t *dst, ptrdiff_t stride,
+                                  TX_SIZE tx_size, const uint8_t *above,
+                                  const uint8_t *left, int mode) {
   int r, c;
-  int buffer[33][33];
+  uint8_t buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
 
@@ -1157,100 +1158,47 @@
   for (r = 0; r < bh + 1; ++r)
     memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
 
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
-
-  for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
 
   for (r = 1; r < bh + 1; r += 2)
     for (c = 1; c < bw + 1; c += 4) {
-      const int p0 = buffer[r - 1][c - 1];
-      const int p1 = buffer[r - 1][c];
-      const int p2 = buffer[r - 1][c + 1];
-      const int p3 = buffer[r - 1][c + 2];
-      const int p4 = buffer[r - 1][c + 3];
-      const int p5 = buffer[r][c - 1];
-      const int p6 = buffer[r + 1][c - 1];
+      const uint8_t p0 = buffer[r - 1][c - 1];
+      const uint8_t p1 = buffer[r - 1][c];
+      const uint8_t p2 = buffer[r - 1][c + 1];
+      const uint8_t p3 = buffer[r - 1][c + 2];
+      const uint8_t p4 = buffer[r - 1][c + 3];
+      const uint8_t p5 = buffer[r][c - 1];
+      const uint8_t p6 = buffer[r + 1][c - 1];
       for (int k = 0; k < 8; ++k) {
         int r_offset = k >> 2;
         int c_offset = k & 0x03;
         buffer[r + r_offset][c + c_offset] =
-            filter_intra_taps_4x2procunit[mode][k][0] * p0 +
-            filter_intra_taps_4x2procunit[mode][k][1] * p1 +
-            filter_intra_taps_4x2procunit[mode][k][2] * p2 +
-            filter_intra_taps_4x2procunit[mode][k][3] * p3 +
-            filter_intra_taps_4x2procunit[mode][k][4] * p4 +
-            filter_intra_taps_4x2procunit[mode][k][5] * p5 +
-            filter_intra_taps_4x2procunit[mode][k][6] * p6;
-        buffer[r + r_offset][c + c_offset] =
             clip_pixel(ROUND_POWER_OF_TWO_SIGNED(
-                buffer[r + r_offset][c + c_offset], FILTER_INTRA_SCALE_BITS));
+                av1_filter_intra_taps[mode][k][0] * p0 +
+                    av1_filter_intra_taps[mode][k][1] * p1 +
+                    av1_filter_intra_taps[mode][k][2] * p2 +
+                    av1_filter_intra_taps[mode][k][3] * p3 +
+                    av1_filter_intra_taps[mode][k][4] * p4 +
+                    av1_filter_intra_taps[mode][k][5] * p5 +
+                    av1_filter_intra_taps[mode][k][6] * p6,
+                FILTER_INTRA_SCALE_BITS));
       }
     }
 
   for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = buffer[r + 1][c + 1];
-    }
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));
     dst += stride;
   }
 }
 
-void av1_dc_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                               const uint8_t *above, const uint8_t *left) {
-  filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_DC_PRED);
-}
-
-void av1_v_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                              const uint8_t *above, const uint8_t *left) {
-  filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_V_PRED);
-}
-
-void av1_h_filter_predictor_c(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
-                              const uint8_t *above, const uint8_t *left) {
-  filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_H_PRED);
-}
-
-void av1_d153_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
-                                 TX_SIZE tx_size, const uint8_t *above,
-                                 const uint8_t *left) {
-  filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_D153_PRED);
-}
-
-void av1_paeth_filter_predictor_c(uint8_t *dst, ptrdiff_t stride,
-                                  TX_SIZE tx_size, const uint8_t *above,
-                                  const uint8_t *left) {
-  filter_intra_predictor(dst, stride, tx_size, above, left, FILTER_PAETH_PRED);
-}
-
-static void filter_intra_predictors(FILTER_INTRA_MODE mode, uint8_t *dst,
-                                    ptrdiff_t stride, TX_SIZE tx_size,
-                                    const uint8_t *above, const uint8_t *left) {
-  switch (mode) {
-    case FILTER_DC_PRED:
-      av1_dc_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_V_PRED:
-      av1_v_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_H_PRED:
-      av1_h_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_D153_PRED:
-      av1_d153_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    case FILTER_PAETH_PRED:
-      av1_paeth_filter_predictor(dst, stride, tx_size, above, left);
-      break;
-    default: assert(0);
-  }
-}
 static void highbd_filter_intra_predictor(uint16_t *dst, ptrdiff_t stride,
                                           TX_SIZE tx_size,
                                           const uint16_t *above,
                                           const uint16_t *left, int mode,
                                           int bd) {
   int r, c;
-  int buffer[33][33];
+  uint16_t buffer[33][33];
   const int bw = tx_size_wide[tx_size];
   const int bh = tx_size_high[tx_size];
 
@@ -1260,104 +1208,40 @@
   for (r = 0; r < bh + 1; ++r)
     memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
 
-  for (r = 0; r < bh; ++r) buffer[r + 1][0] = (int)left[r];
-
-  for (c = 0; c < bw + 1; ++c) buffer[0][c] = (int)above[c - 1];
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(buffer[0][0]));
 
   for (r = 1; r < bh + 1; r += 2)
     for (c = 1; c < bw + 1; c += 4) {
-      const int p0 = buffer[r - 1][c - 1];
-      const int p1 = buffer[r - 1][c];
-      const int p2 = buffer[r - 1][c + 1];
-      const int p3 = buffer[r - 1][c + 2];
-      const int p4 = buffer[r - 1][c + 3];
-      const int p5 = buffer[r][c - 1];
-      const int p6 = buffer[r + 1][c - 1];
+      const uint16_t p0 = buffer[r - 1][c - 1];
+      const uint16_t p1 = buffer[r - 1][c];
+      const uint16_t p2 = buffer[r - 1][c + 1];
+      const uint16_t p3 = buffer[r - 1][c + 2];
+      const uint16_t p4 = buffer[r - 1][c + 3];
+      const uint16_t p5 = buffer[r][c - 1];
+      const uint16_t p6 = buffer[r + 1][c - 1];
       for (int k = 0; k < 8; ++k) {
         int r_offset = k >> 2;
         int c_offset = k & 0x03;
         buffer[r + r_offset][c + c_offset] =
-            filter_intra_taps_4x2procunit[mode][k][0] * p0 +
-            filter_intra_taps_4x2procunit[mode][k][1] * p1 +
-            filter_intra_taps_4x2procunit[mode][k][2] * p2 +
-            filter_intra_taps_4x2procunit[mode][k][3] * p3 +
-            filter_intra_taps_4x2procunit[mode][k][4] * p4 +
-            filter_intra_taps_4x2procunit[mode][k][5] * p5 +
-            filter_intra_taps_4x2procunit[mode][k][6] * p6;
-        buffer[r + r_offset][c + c_offset] = clip_pixel_highbd(
-            ROUND_POWER_OF_TWO_SIGNED(buffer[r + r_offset][c + c_offset],
-                                      FILTER_INTRA_SCALE_BITS),
-            bd);
+            clip_pixel_highbd(ROUND_POWER_OF_TWO_SIGNED(
+                                  av1_filter_intra_taps[mode][k][0] * p0 +
+                                      av1_filter_intra_taps[mode][k][1] * p1 +
+                                      av1_filter_intra_taps[mode][k][2] * p2 +
+                                      av1_filter_intra_taps[mode][k][3] * p3 +
+                                      av1_filter_intra_taps[mode][k][4] * p4 +
+                                      av1_filter_intra_taps[mode][k][5] * p5 +
+                                      av1_filter_intra_taps[mode][k][6] * p6,
+                                  FILTER_INTRA_SCALE_BITS),
+                              bd);
       }
     }
 
   for (r = 0; r < bh; ++r) {
-    for (c = 0; c < bw; ++c) {
-      dst[c] = buffer[r + 1][c + 1];
-    }
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(dst[0]));
     dst += stride;
   }
 }
-
-void av1_highbd_dc_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                      TX_SIZE tx_size, const uint16_t *above,
-                                      const uint16_t *left, int bd) {
-  highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
-                                FILTER_DC_PRED, bd);
-}
-
-void av1_highbd_v_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                     TX_SIZE tx_size, const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
-                                FILTER_V_PRED, bd);
-}
-
-void av1_highbd_h_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                     TX_SIZE tx_size, const uint16_t *above,
-                                     const uint16_t *left, int bd) {
-  highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
-                                FILTER_H_PRED, bd);
-}
-
-void av1_highbd_d153_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                        TX_SIZE tx_size, const uint16_t *above,
-                                        const uint16_t *left, int bd) {
-  highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
-                                FILTER_D153_PRED, bd);
-}
-
-void av1_highbd_paeth_filter_predictor_c(uint16_t *dst, ptrdiff_t stride,
-                                         TX_SIZE tx_size, const uint16_t *above,
-                                         const uint16_t *left, int bd) {
-  highbd_filter_intra_predictor(dst, stride, tx_size, above, left,
-                                FILTER_PAETH_PRED, bd);
-}
-
-static void highbd_filter_intra_predictors(FILTER_INTRA_MODE mode,
-                                           uint16_t *dst, ptrdiff_t stride,
-                                           TX_SIZE tx_size,
-                                           const uint16_t *above,
-                                           const uint16_t *left, int bd) {
-  switch (mode) {
-    case FILTER_DC_PRED:
-      av1_highbd_dc_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_V_PRED:
-      av1_highbd_v_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_H_PRED:
-      av1_highbd_h_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_D153_PRED:
-      av1_highbd_d153_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    case FILTER_PAETH_PRED:
-      av1_highbd_paeth_filter_predictor(dst, stride, tx_size, above, left, bd);
-      break;
-    default: assert(0);
-  }
-}
 #endif  // CONFIG_FILTER_INTRA
 
 #if CONFIG_INTRA_EDGE
@@ -1769,8 +1653,8 @@
 
 #if CONFIG_FILTER_INTRA
   if (use_filter_intra) {
-    highbd_filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size,
-                                   above_row, left_col, xd->bd);
+    highbd_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                                  filter_intra_mode, xd->bd);
     return;
   }
 #endif  // CONFIG_FILTER_INTRA
@@ -1992,8 +1876,8 @@
 
 #if CONFIG_FILTER_INTRA
   if (use_filter_intra) {
-    filter_intra_predictors(filter_intra_mode, dst, dst_stride, tx_size,
-                            above_row, left_col);
+    av1_filter_intra_predictor(dst, dst_stride, tx_size, above_row, left_col,
+                               filter_intra_mode);
     return;
   }
 #endif  // CONFIG_FILTER_INTRA
diff --git a/av1/common/reconintra.h b/av1/common/reconintra.h
index cfa3357..ba71694 100644
--- a/av1/common/reconintra.h
+++ b/av1/common/reconintra.h
@@ -66,6 +66,10 @@
 }
 #endif  // CONFIG_INTRABC
 
+#if CONFIG_FILTER_INTRA
+extern const int8_t av1_filter_intra_taps[FILTER_INTRA_MODES][8][8];
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/common/x86/filterintra_sse4.c b/av1/common/x86/filterintra_sse4.c
new file mode 100644
index 0000000..fd3df2c
--- /dev/null
+++ b/av1/common/x86/filterintra_sse4.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <smmintrin.h>
+#include "./av1_rtcd.h"
+#include "aom_dsp/x86/synonyms.h"
+#include "av1/common/enums.h"
+#include "av1/common/reconintra.h"
+// SSE4.1 filter-intra predictor: fills a bw x bh block at dst, 4x2 at a time.
+void av1_filter_intra_predictor_sse4_1(uint8_t *dst, ptrdiff_t stride,
+                                       TX_SIZE tx_size, const uint8_t *above,
+                                       const uint8_t *left, int mode) {
+  int r, c;
+  uint8_t buffer[33][33];  // row 0 / column 0 hold the above / left edges
+  const int bw = tx_size_wide[tx_size];
+  const int bh = tx_size_high[tx_size];
+
+  assert(bw <= 32 && bh <= 32);
+
+  // The initialization is just for silencing Jenkins static analysis warnings
+  for (r = 0; r < bh + 1; ++r)
+    memset(buffer[r], 0, (bw + 1) * sizeof(buffer[0][0]));
+
+  for (r = 0; r < bh; ++r) buffer[r + 1][0] = left[r];
+  memcpy(buffer[0], &above[-1], (bw + 1) * sizeof(uint8_t));
+  // Each tap row is 8 bytes, so one 128-bit load should cover two rows.
+  const __m128i f1f0 = xx_load_128(av1_filter_intra_taps[mode][0]);
+  const __m128i f3f2 = xx_load_128(av1_filter_intra_taps[mode][2]);
+  const __m128i f5f4 = xx_load_128(av1_filter_intra_taps[mode][4]);
+  const __m128i f7f6 = xx_load_128(av1_filter_intra_taps[mode][6]);
+  const __m128i filter_intra_scale_bits =
+      _mm_set1_epi16(1 << (15 - FILTER_INTRA_SCALE_BITS));
+
+  for (r = 1; r < bh + 1; r += 2) {
+    for (c = 1; c < bw + 1; c += 4) {
+      DECLARE_ALIGNED(16, uint8_t, p[8]);  // p0-p4 above, p5-p6 left, p7 = 0
+      memcpy(p, &buffer[r - 1][c - 1], 5 * sizeof(uint8_t));
+      p[5] = buffer[r][c - 1];
+      p[6] = buffer[r + 1][c - 1];
+      p[7] = 0;
+      const __m128i p_b = xx_loadl_64(p);
+      const __m128i in = _mm_unpacklo_epi64(p_b, p_b);  // p0-p7 twice
+      const __m128i out_01 = _mm_maddubs_epi16(in, f1f0);
+      const __m128i out_23 = _mm_maddubs_epi16(in, f3f2);
+      const __m128i out_45 = _mm_maddubs_epi16(in, f5f4);
+      const __m128i out_67 = _mm_maddubs_epi16(in, f7f6);
+      const __m128i out_0123 = _mm_hadd_epi16(out_01, out_23);
+      const __m128i out_4567 = _mm_hadd_epi16(out_45, out_67);
+      const __m128i out_01234567 = _mm_hadd_epi16(out_0123, out_4567);
+      // Rounding: mulhrs by 2^(15-n) yields (x + 2^(n-1)) >> n, n = scale bits
+      const __m128i round_w =
+          _mm_mulhrs_epi16(out_01234567, filter_intra_scale_bits);
+      const __m128i out_r = _mm_packus_epi16(round_w, round_w);
+      const __m128i out_r1 = _mm_srli_si128(out_r, 4);
+      // Storing: bytes 0-3 go to row r, bytes 4-7 to row r + 1
+      xx_storel_32(&buffer[r][c], out_r);
+      xx_storel_32(&buffer[r + 1][c], out_r1);
+    }
+  }
+
+  for (r = 0; r < bh; ++r) {
+    memcpy(dst, &buffer[r + 1][1], bw * sizeof(uint8_t));  // skip edge column
+    dst += stride;
+  }
+}
diff --git a/test/filterintra_test.cc b/test/filterintra_test.cc
new file mode 100644
index 0000000..ed442a7
--- /dev/null
+++ b/test/filterintra_test.cc
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "av1/common/enums.h"
+
+namespace {
+
+using libaom_test::ACMRandom;
+using std::tr1::tuple;
+
+typedef void (*Predictor)(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+                          const uint8_t *above, const uint8_t *left, int mode);
+
+// Note:
+//  Test parameter list:
+//  Reference predictor, optimized predictor, prediction mode, tx size
+//
+typedef tuple<Predictor, Predictor, int> PredFuncMode;
+typedef tuple<PredFuncMode, TX_SIZE> PredParams;
+
+const int MaxTxSize = 32;
+
+const int MaxTestNum = 100;
+
+// Checks that an optimized filter-intra predictor is bit-exact with the
+// reference predictor on random edge pixels, for one (mode, tx size) pair.
+class AV1FilterIntraPredTest : public ::testing::TestWithParam<PredParams> {
+ public:
+  virtual ~AV1FilterIntraPredTest() {}
+  virtual void SetUp() {
+    PredFuncMode funcMode = GET_PARAM(0);
+    predFuncRef_ = std::tr1::get<0>(funcMode);
+    predFunc_ = std::tr1::get<1>(funcMode);
+    mode_ = std::tr1::get<2>(funcMode);
+    txSize_ = GET_PARAM(1);
+
+    alloc_ = new uint8_t[2 * MaxTxSize + 1];
+    predRef_ = new uint8_t[MaxTxSize * MaxTxSize];
+    pred_ = new uint8_t[MaxTxSize * MaxTxSize];
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    delete[] predRef_;
+    delete[] pred_;
+    libaom_test::ClearSystemState();
+  }
+
+ protected:
+  void RunTest() const {
+    // Seed once, outside the loop, so that every iteration gets new pixels.
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int stride = tx_size_wide[txSize_];
+    // left[] is alloc_[0..MaxTxSize-1]; alloc_[MaxTxSize] is the top-left
+    // pixel (above[-1] for the predictors), followed by the above row.
+    const uint8_t *left = alloc_;
+    const uint8_t *above = alloc_ + MaxTxSize;
+    for (int tstIndex = 0; tstIndex < MaxTestNum; ++tstIndex) {
+      PrepareBuffer(&rnd);
+      predFuncRef_(predRef_, stride, txSize_, &above[1], left, mode_);
+      ASM_REGISTER_STATE_CHECK(
+          predFunc_(pred_, stride, txSize_, &above[1], left, mode_));
+      DiffPred(tstIndex);
+    }
+  }
+
+ private:
+  // Fills the whole edge buffer with the next random bytes from rnd.
+  void PrepareBuffer(ACMRandom *rnd) const {
+    for (int i = 0; i < 2 * MaxTxSize + 1; ++i) alloc_[i] = rnd->Rand8();
+  }
+
+  // Compares the two predictions pixel by pixel over the bw x bh block.
+  void DiffPred(int testNum) const {
+    const int numPels = tx_size_wide[txSize_] * tx_size_high[txSize_];
+    for (int i = 0; i < numPels; ++i) {
+      EXPECT_EQ(predRef_[i], pred_[i]) << "Error at position: " << i << " "
+                                       << "Tx size: " << tx_size_wide[txSize_]
+                                       << "x" << tx_size_high[txSize_] << " "
+                                       << "Test number: " << testNum;
+    }
+  }
+
+  Predictor predFunc_;
+  Predictor predFuncRef_;
+  int mode_;
+  TX_SIZE txSize_;
+  uint8_t *alloc_;
+  uint8_t *pred_;
+  uint8_t *predRef_;
+};
+
+TEST_P(AV1FilterIntraPredTest, BitExactCheck) { RunTest(); }
+
+using std::tr1::make_tuple;
+
+const PredFuncMode kPredFuncMdArray[] = {
+  make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+             FILTER_DC_PRED),
+  make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+             FILTER_V_PRED),
+  make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+             FILTER_H_PRED),
+  make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+             FILTER_D153_PRED),
+  make_tuple(av1_filter_intra_predictor_c, av1_filter_intra_predictor_sse4_1,
+             FILTER_PAETH_PRED),
+};
+
+const TX_SIZE kTxSize[] = { TX_4X4,  TX_8X8,  TX_16X16, TX_32X32, TX_4X8,
+                            TX_8X4,  TX_8X16, TX_16X8,  TX_16X32, TX_32X16,
+                            TX_4X16, TX_16X4, TX_8X32,  TX_32X8 };
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, AV1FilterIntraPredTest,
+    ::testing::Combine(::testing::ValuesIn(kPredFuncMdArray),
+                       ::testing::ValuesIn(kTxSize)));
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 9f9e89e..f49e141 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -149,6 +149,14 @@
             "${AOM_ROOT}/test/intrabc_test.cc")
     endif ()
 
+    if (CONFIG_FILTER_INTRA)
+      if (HAVE_SSE4_1)  # the test calls the sse4_1 predictor directly
+        set(AOM_UNIT_TEST_COMMON_SOURCES
+            ${AOM_UNIT_TEST_COMMON_SOURCES}
+            "${AOM_ROOT}/test/filterintra_test.cc")
+      endif ()
+    endif ()
+
     if (CONFIG_CFL)
       set(AOM_UNIT_TEST_COMMON_SOURCES
         ${AOM_UNIT_TEST_COMMON_SOURCES}