Merge "Remove a flavor of SSIM that is never really used." into nextgenv2
diff --git a/test/test.mk b/test/test.mk
index adcebad..d6d08ff 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -182,6 +182,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm2d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_test.cc
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
new file mode 100644
index 0000000..eea7068
--- /dev/null
+++ b/test/vp10_convolve_test.cc
@@ -0,0 +1,250 @@
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "vp10/common/filter.h"
+#include "vp10/common/vp10_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+TEST(VP10ConvolveTest, vp10_convolve8) {  // vp10_convolve vs vpx_convolve8_c
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  INTERP_FILTER interp_filter = EIGHTTAP;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  ptrdiff_t filter_size = filter_params.tap;
+  int filter_center = filter_size / 2 - 1;  // tap aligned with the output
+  uint8_t src[12 * 12];
+  int src_stride = filter_size;  // only a filter_size-wide patch is used
+  uint8_t dst[1] = {0};  // written by vp10_convolve
+  uint8_t dst1[1] = {0};  // written by vpx_convolve8_c
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int subpel_x_q4 = 3;
+  int subpel_y_q4 = 2;
+  int avg = 0;
+
+  int w = 1;  // a single output pixel is predicted
+  int h = 1;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src[i] = rnd.Rand16() % (1 << 8);  // random 8-bit samples
+  }
+
+  vp10_convolve(src + src_stride * filter_center + filter_center, src_stride,
+                dst, dst_stride, w, h, filter_params, subpel_x_q4, x_step_q4,
+                subpel_y_q4, y_step_q4, avg);
+
+  const int16_t* x_filter =
+      vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);
+  const int16_t* y_filter =
+      vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);
+
+  vpx_convolve8_c(src + src_stride * filter_center + filter_center, src_stride,
+                  dst1, dst_stride, x_filter, 16, y_filter, 16, w, h);
+  EXPECT_EQ(dst[0], dst1[0]);  // both implementations must agree exactly
+}
+TEST(VP10ConvolveTest, vp10_convolve) {  // vp10_convolve vs inline reference
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  INTERP_FILTER interp_filter = EIGHTTAP;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  ptrdiff_t filter_size = filter_params.tap;
+  int filter_center = filter_size / 2 - 1;  // tap aligned with the output
+  uint8_t src[12 * 12];
+  int src_stride = filter_size;  // only a filter_size-wide patch is used
+  uint8_t dst[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int subpel_x_q4 = 3;
+  int subpel_y_q4 = 2;
+  int avg = 0;
+
+  int w = 1;  // a single output pixel is predicted
+  int h = 1;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src[i] = rnd.Rand16() % (1 << 8);  // random 8-bit samples
+  }
+
+  vp10_convolve(src + src_stride * filter_center + filter_center, src_stride,
+                dst, dst_stride, w, h, filter_params, subpel_x_q4, x_step_q4,
+                subpel_y_q4, y_step_q4, avg);
+
+  const int16_t* x_filter =
+      vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);
+  const int16_t* y_filter =
+      vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);
+
+  int temp[12];  // one horizontally filtered sample per source row
+  int dst_ref = 0;
+  for (int r = 0; r < filter_size; r++) {
+    temp[r] = 0;  // horizontal pass over row r
+    for (int c = 0; c < filter_size; c++) {
+      temp[r] += x_filter[c] * src[r * filter_size + c];
+    }
+    temp[r] = clip_pixel(ROUND_POWER_OF_TWO(temp[r], FILTER_BITS));
+    dst_ref += temp[r] * y_filter[r];  // vertical pass accumulates the rows
+  }
+  dst_ref = clip_pixel(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS));
+  EXPECT_EQ(dst[0], dst_ref);
+}
+
+TEST(VP10ConvolveTest, vp10_convolve_avg) {  // checks the avg != 0 path
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  INTERP_FILTER interp_filter = EIGHTTAP;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  ptrdiff_t filter_size = filter_params.tap;
+  int filter_center = filter_size / 2 - 1;  // tap aligned with the output
+  uint8_t src0[12 * 12];
+  uint8_t src1[12 * 12];
+  int src_stride = filter_size;
+  uint8_t dst0[1] = {0};  // prediction from src0 alone
+  uint8_t dst1[1] = {0};  // prediction from src1 alone
+  uint8_t dst[1] = {0};  // src0 prediction, then src1 averaged into it
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int subpel_x_q4 = 3;
+  int subpel_y_q4 = 2;
+  int avg = 0;
+
+  int w = 1;
+  int h = 1;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src0[i] = rnd.Rand16() % (1 << 8);
+    src1[i] = rnd.Rand16() % (1 << 8);
+  }
+
+  int offset = filter_size * filter_center + filter_center;
+
+  avg = 0;
+  vp10_convolve(src0 + offset, src_stride, dst0, dst_stride, w, h,
+                filter_params, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
+                avg);
+  avg = 0;
+  vp10_convolve(src1 + offset, src_stride, dst1, dst_stride, w, h,
+                filter_params, subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4,
+                avg);
+
+  avg = 0;  // first call overwrites dst
+  vp10_convolve(src0 + offset, src_stride, dst, dst_stride, w, h, filter_params,
+                subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg);
+  avg = 1;  // second call blends into the value already in dst
+  vp10_convolve(src1 + offset, src_stride, dst, dst_stride, w, h, filter_params,
+                subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg);
+
+  EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+TEST(VP10ConvolveTest, vp10_highbd_convolve) {  // vs inline reference
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  INTERP_FILTER interp_filter = EIGHTTAP;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  ptrdiff_t filter_size = filter_params.tap;
+  int filter_center = filter_size / 2 - 1;  // tap aligned with the output
+  uint16_t src[12 * 12];
+  int src_stride = filter_size;  // only a filter_size-wide patch is used
+  uint16_t dst[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int subpel_x_q4 = 8;
+  int subpel_y_q4 = 6;
+  int avg = 0;
+  int bd = 10;  // bit depth used for sample range and clipping
+
+  int w = 1;  // a single output pixel is predicted
+  int h = 1;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src[i] = rnd.Rand16() % (1 << bd);  // random bd-bit samples
+  }
+
+  vp10_highbd_convolve(
+      CONVERT_TO_BYTEPTR(src + src_stride * filter_center + filter_center),
+      src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, filter_params,
+      subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
+
+  const int16_t* x_filter =
+      vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);
+  const int16_t* y_filter =
+      vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);
+
+  int temp[12];  // one horizontally filtered sample per source row
+  int dst_ref = 0;
+  for (int r = 0; r < filter_size; r++) {
+    temp[r] = 0;  // horizontal pass over row r
+    for (int c = 0; c < filter_size; c++) {
+      temp[r] += x_filter[c] * src[r * filter_size + c];
+    }
+    temp[r] = clip_pixel_highbd(ROUND_POWER_OF_TWO(temp[r], FILTER_BITS), bd);
+    dst_ref += temp[r] * y_filter[r];  // vertical pass accumulates the rows
+  }
+  dst_ref = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS), bd);
+  EXPECT_EQ(dst[0], dst_ref);
+}
+
+TEST(VP10ConvolveTest, vp10_highbd_convolve_avg) {  // checks avg != 0 path
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  INTERP_FILTER interp_filter = EIGHTTAP;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  ptrdiff_t filter_size = filter_params.tap;
+  int filter_center = filter_size / 2 - 1;  // tap aligned with the output
+  uint16_t src0[12 * 12];
+  uint16_t src1[12 * 12];
+  int src_stride = filter_size;
+  uint16_t dst0[1] = {0};  // prediction from src0 alone
+  uint16_t dst1[1] = {0};  // prediction from src1 alone
+  uint16_t dst[1] = {0};  // src0 prediction, then src1 averaged into it
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int subpel_x_q4 = 3;
+  int subpel_y_q4 = 2;
+  int avg = 0;
+  int bd = 10;  // bit depth used for sample range and clipping
+
+  int w = 1;
+  int h = 1;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src0[i] = rnd.Rand16() % (1 << bd);
+    src1[i] = rnd.Rand16() % (1 << bd);
+  }
+
+  int offset = filter_size * filter_center + filter_center;
+
+  avg = 0;
+  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
+                       CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
+                       filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
+                       y_step_q4, avg, bd);
+  avg = 0;
+  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
+                       CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
+                       filter_params, subpel_x_q4, x_step_q4, subpel_y_q4,
+                       y_step_q4, avg, bd);
+
+  avg = 0;  // first call overwrites dst
+  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
+                       CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, filter_params,
+                       subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
+  avg = 1;  // second call blends into the value already in dst
+  vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
+                       CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, filter_params,
+                       subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
+
+  EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index 2e87903..4a3e67c 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -401,6 +401,7 @@
 #define MAX_ANGLE_DELTAS 3
 #define ANGLE_FAST_SEARCH 1
 #define ANGLE_SKIP_THRESH 0.10
+#define FILTER_FAST_SEARCH 1
 
 static uint8_t mode_to_angle_map[INTRA_MODES] = {
     0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index aaa7628..a3aa3cf 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -12,6 +12,8 @@
 
 #include "vp10/common/filter.h"
 
+#define USE_12_SHARP_FILTER 0
+
 DECLARE_ALIGNED(256, static const InterpKernel,
                 bilinear_filters[SUBPEL_SHIFTS]) = {
   { 0, 0, 0, 128,   0, 0, 0, 0 },
@@ -73,6 +75,29 @@
 #endif  // CONFIG_EXT_INTERP
 };
 
+#if USE_12_SHARP_FILTER
+DECLARE_ALIGNED(16, static const int16_t,
+                sub_pel_filters_12sharp[16][12]) = {  // 16 kernels x 12 taps
+  // intfilt 0.8
+  {0,   0,   0,   0,   0, 128,   0,   0,   0,   0,   0, 0},
+  {0,   1,  -1,   3,  -7, 127,   8,  -4,   2,  -1,   0, 0},
+  {0,   1,  -3,   5, -12, 124,  18,  -8,   4,  -2,   1, 0},
+  {-1,   2,  -4,   8, -17, 120,  28, -11,   6,  -3,   1, -1},
+  {-1,   2,  -4,  10, -21, 114,  38, -15,   8,  -4,   2, -1},
+  {-1,   3,  -5,  11, -23, 107,  49, -18,   9,  -5,   2, -1},
+  {-1,   3,  -6,  12, -25,  99,  60, -21,  11,  -6,   3, -1},
+  {-1,   3,  -6,  12, -25,  90,  70, -23,  12,  -6,   3, -1},
+  {-1,   3,  -6,  12, -24,  80,  80, -24,  12,  -6,   3, -1},
+  {-1,   3,  -6,  12, -23,  70,  90, -25,  12,  -6,   3, -1},
+  {-1,   3,  -6,  11, -21,  60,  99, -25,  12,  -6,   3, -1},
+  {-1,   2,  -5,   9, -18,  49, 107, -23,  11,  -5,   3, -1},
+  {-1,   2,  -4,   8, -15,  38, 114, -21,  10,  -4,   2, -1},
+  {-1,   1,  -3,   6, -11,  28, 120, -17,   8,  -4,   2, -1},
+  {0,   1,  -2,   4,  -8,  18, 124, -12,   5,  -3,   1, 0},
+  {0,   0,  -1,   2,  -4,   8, 127,  -7,   3,  -1,   1, 0},
+};
+#endif  // USE_12_SHARP_FILTER
+
 DECLARE_ALIGNED(256, static const InterpKernel,
                 sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
 #if CONFIG_EXT_INTERP
@@ -201,3 +226,23 @@
     sub_pel_filters_8smooth,  // INTRA_FILTER_8TAP_SMOOTH
 };
 #endif  // CONFIG_EXT_INTRA
+
+static const InterpFilterParams
+vp10_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {  // by INTERP_FILTER
+  {(const int16_t*)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS},
+#if USE_12_SHARP_FILTER
+  {(const int16_t*)sub_pel_filters_12sharp, 12, SUBPEL_SHIFTS},
+#else  // USE_12_SHARP_FILTER
+  {(const int16_t*)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS},
+#endif  // USE_12_SHARP_FILTER
+#if CONFIG_EXT_INTERP && SWITCHABLE_FILTERS == 4
+  {(const int16_t*)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS},
+#endif
+  {(const int16_t*)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS}
+};
+
+InterpFilterParams vp10_get_interp_filter_params(
+    const INTERP_FILTER interp_filter) {  // returns the table entry by value
+  return vp10_interp_filter_params_list[interp_filter];  // no range check
+}
diff --git a/vp10/common/filter.h b/vp10/common/filter.h
index a272db8..afebee0 100644
--- a/vp10/common/filter.h
+++ b/vp10/common/filter.h
@@ -55,6 +55,18 @@
 extern const InterpKernel *vp10_intra_filter_kernels[INTRA_FILTERS];
 #endif  // CONFIG_EXT_INTRA
 
+typedef struct InterpFilterParams {  // describes one interpolation filter
+  const int16_t* filter_ptr;  // kernels stored back to back, `tap` values each
+  uint16_t tap;  // coefficients per kernel
+  uint16_t subpel_shifts;  // presumably the kernel count (SUBPEL_SHIFTS)
+} InterpFilterParams;
+
+InterpFilterParams vp10_get_interp_filter_params(
+    const INTERP_FILTER interp_filter);  // params for the given filter type
+static INLINE const int16_t* vp10_get_interp_filter_kernel(
+    const InterpFilterParams filter_params, const int subpel) {
+  return filter_params.filter_ptr + filter_params.tap * subpel;  // row subpel
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index d868b25..3fcdb97 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -13,6 +13,7 @@
 
 #include "vp10/common/filter.h"
 #include "vp10/common/onyxc_int.h"
+#include "vp10/common/vp10_convolve.h"
 #include "vpx/vpx_integer.h"
 
 #ifdef __cplusplus
@@ -27,23 +28,34 @@
                                    int w, int h, int ref,
                                    const INTERP_FILTER interp_filter,
                                    int xs, int ys) {
-  const InterpKernel *kernel = vp10_filter_kernels[interp_filter];
+  InterpFilterParams interp_filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  if (interp_filter_params.tap == SUBPEL_TAPS) {
+    const InterpKernel *kernel = vp10_filter_kernels[interp_filter];
 #if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
-  if (kernel[0][SUBPEL_TAPS / 2 - 1] == 128) {
-    // Interpolating filter
+    if (IsInterpolatingFilter(interp_filter)) {
+      // Interpolating filter
+      sf->predict[subpel_x != 0][subpel_y != 0][ref](
+          src, src_stride, dst, dst_stride,
+          kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+    } else {
+      sf->predict_ni[subpel_x != 0][subpel_y != 0][ref](
+          src, src_stride, dst, dst_stride,
+          kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+    }
+#else
     sf->predict[subpel_x != 0][subpel_y != 0][ref](
         src, src_stride, dst, dst_stride,
         kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
-  } else {
-    sf->predict_ni[subpel_x != 0][subpel_y != 0][ref](
-        src, src_stride, dst, dst_stride,
-        kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
-  }
-#else
-  sf->predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
 #endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  } else {
+    // ref > 0 means this is the second reference frame
+    // first reference frame's prediction result is already in dst
+    // therefore we need to average the first and second results
+    int avg = ref > 0;
+    vp10_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter_params,
+                  subpel_x, xs, subpel_y, ys, avg);
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -55,23 +67,35 @@
                                         int w, int h, int ref,
                                         const INTERP_FILTER interp_filter,
                                         int xs, int ys, int bd) {
-  const InterpKernel *kernel = vp10_filter_kernels[interp_filter];
+  InterpFilterParams interp_filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+  if (interp_filter_params.tap == SUBPEL_TAPS) {
+    const InterpKernel *kernel = vp10_filter_kernels[interp_filter];
 #if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
-  if (kernel[0][SUBPEL_TAPS / 2 - 1] == 128) {
-    // Interpolating filter
+    if (IsInterpolatingFilter(interp_filter)) {
+      // Interpolating filter
+      sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+          src, src_stride, dst, dst_stride,
+          kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+    } else {
+      sf->highbd_predict_ni[subpel_x != 0][subpel_y != 0][ref](
+          src, src_stride, dst, dst_stride,
+          kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+    }
+#else
     sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
         src, src_stride, dst, dst_stride,
         kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
-  } else {
-    sf->highbd_predict_ni[subpel_x != 0][subpel_y != 0][ref](
-        src, src_stride, dst, dst_stride,
-        kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
-  }
-#else
-  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
 #endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  } else {
+    // ref > 0 means this is the second reference frame
+    // first reference frame's prediction result is already in dst
+    // therefore we need to average the first and second results
+    int avg = ref > 0;
+    vp10_highbd_convolve(src, src_stride, dst, dst_stride, w, h,
+                         interp_filter_params, subpel_x, xs, subpel_y, ys, avg,
+                         bd);
+  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c
new file mode 100644
index 0000000..e8c0c92
--- /dev/null
+++ b/vp10/common/vp10_convolve.c
@@ -0,0 +1,199 @@
+#include <assert.h>
+
+#include "vp10/common/filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#define MAX_BLOCK_WIDTH (64)
+#define MAX_BLOCK_HEIGHT (64)
+#define MAX_STEP (32)
+#define MAX_FILTER_TAP (12)
+
+static void convolve_horiz(const uint8_t *src, int src_stride, uint8_t *dst,
+                           int dst_stride, int w, int h,
+                           const InterpFilterParams filter_params,
+                           const int subpel_x_q4, int x_step_q4, int avg) {
+  int x, y;  // horizontal 1-D pass: w outputs on each of h rows
+  int filter_size = filter_params.tap;
+  src -= filter_size / 2 - 1;  // back up to the first tap of the window
+  for (y = 0; y < h; ++y) {
+    int x_q4 = subpel_x_q4;  // fixed-point source position, reset per row
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];  // integer part
+      const int16_t *x_filter =
+          vp10_get_interp_filter_kernel(filter_params, x_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
+      if (avg) {  // blend with the value already in dst (rounded mean)
+        dst[x] = ROUND_POWER_OF_TWO(
+            dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+      } else {  // overwrite dst with the filtered pixel
+        dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      }
+      x_q4 += x_step_q4;  // advance to the next output pixel
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_vert(const uint8_t *src, int src_stride, uint8_t *dst,
+                          int dst_stride, int w, int h,
+                          const InterpFilterParams filter_params,
+                          const int subpel_y_q4, int y_step_q4, int avg) {
+  int x, y;  // vertical 1-D pass, processed one column at a time
+  int filter_size = filter_params.tap;
+  src -= src_stride * (filter_size / 2 - 1);  // back up to the first tap row
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = subpel_y_q4;  // fixed-point source row, reset per column
+    for (y = 0; y < h; ++y) {
+      const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *y_filter =
+          vp10_get_interp_filter_kernel(filter_params, y_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      if (avg) {  // blend with the value already in dst (rounded mean)
+        dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+            dst[y * dst_stride] +
+                clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+            1);
+      } else {  // overwrite dst with the filtered pixel
+        dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      }
+      y_q4 += y_step_q4;  // advance to the next output row
+    }
+    ++src;  // next column
+    ++dst;
+  }
+}
+
+void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
+                   int dst_stride, int w, int h,
+                   const InterpFilterParams filter_params,
+                   const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
+                   int y_step_q4, int avg) {
+  int filter_size = filter_params.tap;  // taps per kernel
+
+  // temp's size is set to (maximum possible intermediate_height) *
+  // MAX_BLOCK_WIDTH
+  uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
+                MAX_FILTER_TAP) *
+               MAX_BLOCK_WIDTH];
+  int temp_stride = MAX_BLOCK_WIDTH;  // temp rows are laid out at max width
+
+  int intermediate_height =  // rows produced by the horizontal pass
+      (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+
+  assert(w <= MAX_BLOCK_WIDTH);  // limits assumed by temp's fixed size
+  assert(h <= MAX_BLOCK_HEIGHT);
+  assert(y_step_q4 <= MAX_STEP);
+  assert(x_step_q4 <= MAX_STEP);
+  assert(filter_params.tap <= MAX_FILTER_TAP);
+
+  convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride, temp,
+                 temp_stride, w, intermediate_height, filter_params,
+                 subpel_x_q4, x_step_q4, 0);  // avg is never applied to temp
+  convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride, dst,
+                dst_stride, w, h, filter_params, subpel_y_q4, y_step_q4, avg);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_convolve_horiz(const uint16_t *src, int src_stride,
+                                  uint16_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams filter_params,
+                                  const int subpel_x_q4, int x_step_q4, int avg,
+                                  int bd) {
+  int x, y;  // horizontal 1-D pass on 16-bit samples, clipped using bd
+  int filter_size = filter_params.tap;
+  src -= filter_size / 2 - 1;  // back up to the first tap of the window
+  for (y = 0; y < h; ++y) {
+    int x_q4 = subpel_x_q4;  // fixed-point source position, reset per row
+    for (x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *x_filter =
+          vp10_get_interp_filter_kernel(filter_params, x_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
+      if (avg)  // blend with the value already in dst (rounded mean)
+        dst[x] = ROUND_POWER_OF_TWO(
+            dst[x] +
+                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+            1);
+      else  // overwrite dst with the filtered pixel
+        dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      x_q4 += x_step_q4;  // advance to the next output pixel
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void highbd_convolve_vert(const uint16_t *src, int src_stride,
+                                 uint16_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams filter_params,
+                                 const int subpel_y_q4, int y_step_q4, int avg,
+                                 int bd) {
+  int x, y;  // vertical 1-D pass on 16-bit samples, one column at a time
+  int filter_size = filter_params.tap;
+  src -= src_stride * (filter_size / 2 - 1);  // back up to the first tap row
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = subpel_y_q4;  // fixed-point source row, reset per column
+    for (y = 0; y < h; ++y) {
+      const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *y_filter =
+          vp10_get_interp_filter_kernel(filter_params, y_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      if (avg) {  // blend with the value already in dst (rounded mean)
+        dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+            dst[y * dst_stride] +
+                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+            1);
+      } else {  // overwrite dst with the filtered pixel
+        dst[y * dst_stride] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      }
+      y_q4 += y_step_q4;  // advance to the next output row
+    }
+    ++src;  // next column
+    ++dst;
+  }
+}
+
+void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
+                          int dst_stride, int w, int h,
+                          const InterpFilterParams filter_params,
+                          const int subpel_x_q4, int x_step_q4,
+                          const int subpel_y_q4, int y_step_q4, int avg,
+                          int bd) {
+  int filter_size = filter_params.tap;  // taps per kernel
+
+  // temp's size is set to (maximum possible intermediate_height) *
+  // MAX_BLOCK_WIDTH
+  uint16_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
+                 MAX_FILTER_TAP) *
+                MAX_BLOCK_WIDTH];
+  int temp_stride = MAX_BLOCK_WIDTH;  // temp rows are laid out at max width
+
+  int intermediate_height =  // rows produced by the horizontal pass
+      (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+
+  assert(w <= MAX_BLOCK_WIDTH);  // limits assumed by temp's fixed size
+  assert(h <= MAX_BLOCK_HEIGHT);
+  assert(y_step_q4 <= MAX_STEP);
+  assert(x_step_q4 <= MAX_STEP);
+  assert(filter_params.tap <= MAX_FILTER_TAP);
+
+  highbd_convolve_horiz(
+      CONVERT_TO_SHORTPTR(src8 - src_stride * (filter_size / 2 - 1)),
+      src_stride, temp, temp_stride, w, intermediate_height, filter_params,
+      subpel_x_q4, x_step_q4, 0, bd);  // avg is never applied to temp
+  highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
+                       CONVERT_TO_SHORTPTR(dst8), dst_stride, w, h,
+                       filter_params, subpel_y_q4, y_step_q4, avg, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_convolve.h b/vp10/common/vp10_convolve.h
new file mode 100644
index 0000000..a3d6c65
--- /dev/null
+++ b/vp10/common/vp10_convolve.h
@@ -0,0 +1,31 @@
+#ifndef VP10_COMMON_VP10_CONVOLVE_H_
+#define VP10_COMMON_VP10_CONVOLVE_H_
+#include "vp10/common/filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_convolve(const uint8_t *src, int src_stride,
+                   uint8_t *dst, int dst_stride,
+                   int w, int h,
+                   const InterpFilterParams filter_params,
+                   const int subpel_x_q4, int x_step_q4,  // x phase, x step
+                   const int subpel_y_q4, int y_step_q4,  // y phase, y step
+                   int avg);  // nonzero: average the result into dst
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_convolve(const uint8_t *src8, int src_stride,
+                          uint8_t *dst8, int dst_stride,
+                          int w, int h,
+                          const InterpFilterParams filter_params,
+                          const int subpel_x_q4, int x_step_q4,  // x phase/step
+                          const int subpel_y_q4, int y_step_q4,  // y phase/step
+                          int avg, int bd);  // avg != 0: average into dst8
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_VP10_CONVOLVE_H_
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index 19364c9..a26f969 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -734,13 +734,17 @@
     int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
     int x_pad = 0, y_pad = 0;
 
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+    int filter_size = filter_params.tap;
+
     if (subpel_x ||
 #if CONFIG_EXT_INTERP
         !i_filter ||
 #endif
         (sf->x_step_q4 != SUBPEL_SHIFTS)) {
-      x0 -= VP9_INTERP_EXTEND - 1;
-      x1 += VP9_INTERP_EXTEND;
+      x0 -= filter_size / 2 - 1;
+      x1 += filter_size / 2;
       x_pad = 1;
     }
 
@@ -749,8 +753,8 @@
         !i_filter ||
 #endif
         (sf->y_step_q4 != SUBPEL_SHIFTS)) {
-      y0 -= VP9_INTERP_EXTEND - 1;
-      y1 += VP9_INTERP_EXTEND;
+      y0 -= filter_size / 2 - 1;
+      y1 += filter_size / 2;
       y_pad = 1;
     }
 
@@ -767,7 +771,8 @@
       const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
       const int b_w = x1 - x0 + 1;
       const int b_h = y1 - y0 + 1;
-      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
+      const int border_offset = y_pad * (filter_size / 2 - 1) * b_w +
+                                x_pad * (filter_size / 2 - 1);
 
       extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
                          frame_width, frame_height, border_offset,
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index f83e50d..6250d56 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -1837,7 +1837,8 @@
       p_angle = mode_to_angle_map[mbmi->mode] +
           mbmi->angle_delta[0] * ANGLE_STEP;
       for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
-        if (!pick_intra_filter(p_angle) && filter != INTRA_FILTER_LINEAR)
+        if ((FILTER_FAST_SEARCH || !pick_intra_filter(p_angle)) &&
+            filter != INTRA_FILTER_LINEAR)
           continue;
         mic->mbmi.intra_filter = filter;
         super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
@@ -1878,7 +1879,8 @@
             mbmi->angle_delta[0] * ANGLE_STEP;
         for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
           mic->mbmi.intra_filter = filter;
-          if (!pick_intra_filter(p_angle) && filter != INTRA_FILTER_LINEAR)
+          if ((FILTER_FAST_SEARCH || !pick_intra_filter(p_angle)) &&
+              filter != INTRA_FILTER_LINEAR)
             continue;
           super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
                           &s, NULL, bsize, best_rd);
@@ -1909,7 +1911,8 @@
           mbmi->angle_delta[0] * ANGLE_STEP;
       for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
         mic->mbmi.intra_filter = filter;
-        if (!pick_intra_filter(p_angle) && filter != INTRA_FILTER_LINEAR)
+        if ((FILTER_FAST_SEARCH || !pick_intra_filter(p_angle)) &&
+            filter != INTRA_FILTER_LINEAR)
           continue;
         super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
                         &s, NULL, bsize, best_rd);
@@ -1935,6 +1938,37 @@
     }
   }
 
+  if (FILTER_FAST_SEARCH && *rate_tokenonly < INT_MAX) {
+    mbmi->angle_delta[0] = best_angle_delta;
+    p_angle = mode_to_angle_map[mbmi->mode] +
+        mbmi->angle_delta[0] * ANGLE_STEP;
+    if (pick_intra_filter(p_angle)) {
+      for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
+        mic->mbmi.intra_filter = filter;
+        super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                        &s, NULL, bsize, best_rd);
+        if (this_rate_tokenonly == INT_MAX)
+          continue;
+
+        this_rate = this_rate_tokenonly + rate_overhead +
+            cpi->intra_filter_cost[intra_filter_ctx][filter];
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+        if (this_rd < best_rd) {
+          best_rd             = this_rd;
+          best_angle_delta    = mbmi->angle_delta[0];
+          best_tx_size        = mbmi->tx_size;
+          best_filter         = mbmi->intra_filter;
+          best_tx_type        = mbmi->tx_type;
+          *rate               = this_rate;
+          *rate_tokenonly     = this_rate_tokenonly;
+          *distortion         = this_distortion;
+          *skippable          = s;
+        }
+      }
+    }
+  }
+
   mbmi->tx_size = best_tx_size;
   mbmi->angle_delta[0] = best_angle_delta;
   mic->mbmi.intra_filter = best_filter;
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index fab97ea..4e89e5e 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -74,6 +74,8 @@
 VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.h
 VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.c
 VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
+VP10_COMMON_SRCS-yes += common/vp10_convolve.c
+VP10_COMMON_SRCS-yes += common/vp10_convolve.h
 VP10_COMMON_SRCS-$(CONFIG_ANS) += common/ans.h
 VP10_COMMON_SRCS-$(CONFIG_ANS) += common/divide.h
 VP10_COMMON_SRCS-$(CONFIG_ANS) += common/divide.c
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index 7502f90..ec7c91b 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -46,8 +46,8 @@
     (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
 #if CONFIG_VP9_HIGHBITDEPTH
-#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
-#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1))
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)(x)) << 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)(x)) >> 1))
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #endif  // VPX_PORTS_MEM_H_