Remove unused scaled convolve funcs from aom_dsp

BUG=aomedia:1575

Change-Id: Ic13751cb2dbaca84b90eab93e013c566f9c286a1
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 98c6241..a0ec277 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -52,27 +52,6 @@
   }
 }
 
-static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *x_filters, int x0_qn,
-                                   int x_step_qn, int w, int h) {
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];  // q8
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *const x_filter = x_filters[x_filter_idx];
-      const int sum = horz_scalar_product(src_x, x_filter);
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const InterpKernel *x_filters, int x0_q4,
@@ -93,28 +72,6 @@
   }
 }
 
-static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_qn,
-                                       int x_step_qn, int w, int h) {
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_qn = x0_qn;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
-      const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
-      assert(x_filter_idx < SUBPEL_SHIFTS);
-      const int16_t *const x_filter = x_filters[x_filter_idx];
-      const int sum = horz_scalar_product(src_x, x_filter);
-      dst[x] = ROUND_POWER_OF_TWO(
-          dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
-      x_qn += x_step_qn;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
 static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const InterpKernel *y_filters, int y0_q4,
@@ -135,28 +92,6 @@
   }
 }
 
-static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const InterpKernel *y_filters, int y0_qn,
-                                  int y_step_qn, int w, int h) {
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_qn = y0_qn;
-    for (int y = 0; y < h; ++y) {
-      const unsigned char *src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter =
-          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
-      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
 static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const InterpKernel *y_filters, int y0_q4,
@@ -180,31 +115,6 @@
   }
 }
 
-static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_qn,
-                                      int y_step_qn, int w, int h) {
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_qn = y0_qn;
-    for (int y = 0; y < h; ++y) {
-      const unsigned char *src_y =
-          &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter =
-          y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
-      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] = ROUND_POWER_OF_TWO(
-          dst[y * dst_stride] +
-              clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
-          1);
-      y_qn += y_step_qn;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
 static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                      ptrdiff_t dst_stride, const InterpKernel *const x_filters,
                      int x0_q4, int x_step_q4,
@@ -239,41 +149,6 @@
                 dst_stride, y_filters, y0_q4, y_step_q4, w, h);
 }
 
-static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_qn,
-                             int x_step_qn, const InterpKernel *const y_filters,
-                             int y0_qn, int y_step_qn, int w, int h) {
-  // TODO(afergs): Update comment here
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  const int intermediate_height =
-      (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
-  assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
-
-  convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                         temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
-                         intermediate_height);
-  convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
-                        dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
-}
-
 static const InterpKernel *get_filter_base(const int16_t *filter) {
   // NOTE: This assumes that the filter table is 256-byte aligned.
   // TODO(agrange) Modify to make independent of table alignment.
@@ -299,21 +174,6 @@
                  w, h);
 }
 
-void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int subpel_x,
-                                 int x_step_qn, const int16_t *filter_y,
-                                 int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  (void)subpel_y;
-  (void)filter_y;
-  (void)y_step_qn;
-
-  convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
-                         x_step_qn, w, h);
-}
-
 void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
@@ -329,22 +189,6 @@
                      x_step_q4, w, h);
 }
 
-void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                     uint8_t *dst, ptrdiff_t dst_stride,
-                                     const int16_t *filter_x, int subpel_x,
-                                     int x_step_qn, const int16_t *filter_y,
-                                     int subpel_y, int y_step_qn, int w,
-                                     int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  (void)subpel_y;
-  (void)filter_y;
-  (void)y_step_qn;
-
-  convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
-                             subpel_x, x_step_qn, w, h);
-}
-
 void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                           uint8_t *dst, ptrdiff_t dst_stride,
                           const int16_t *filter_x, int x_step_q4,
@@ -360,21 +204,6 @@
                 w, h);
 }
 
-void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride,
-                                const int16_t *filter_x, int subpel_x,
-                                int x_step_qn, const int16_t *filter_y,
-                                int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  (void)subpel_x;
-  (void)filter_x;
-  (void)x_step_qn;
-
-  convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
-                        y_step_qn, w, h);
-}
-
 void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
@@ -390,21 +219,6 @@
                     y_step_q4, w, h);
 }
 
-void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int subpel_x,
-                                    int x_step_qn, const int16_t *filter_y,
-                                    int subpel_y, int y_step_qn, int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  (void)subpel_x;
-  (void)filter_x;
-  (void)x_step_qn;
-
-  convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
-                            subpel_y, y_step_qn, w, h);
-}
-
 void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                      ptrdiff_t dst_stride, const int16_t *filter_x,
                      int x_step_q4, const int16_t *filter_y, int y_step_q4,
@@ -419,19 +233,6 @@
            filters_y, y0_q4, y_step_q4, w, h);
 }
 
-void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int subpel_x, int x_step_qn,
-                           const int16_t *filter_y, int subpel_y, int y_step_qn,
-                           int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-
-  convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
-                   x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
-}
-
 void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter_x,
                          int x_step_q4, const int16_t *filter_y, int y_step_q4,
@@ -447,22 +248,6 @@
                      h);
 }
 
-void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
-                               uint8_t *dst, ptrdiff_t dst_stride,
-                               const int16_t *filter_x, int subpel_x,
-                               int x_step_qn, const int16_t *filter_y,
-                               int subpel_y, int y_step_qn, int w, int h) {
-  /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
-                        x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
-  aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
-                     h);
-}
-
 void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                          ptrdiff_t dst_stride, const int16_t *filter_x,
                          int filter_x_stride, const int16_t *filter_y,
@@ -498,56 +283,6 @@
   }
 }
 
-void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                        ptrdiff_t dst_stride, const int16_t *filter_x,
-                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                        int w, int h) {
-  aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                        filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                       ptrdiff_t dst_stride, const int16_t *filter_x,
-                       int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                       int w, int h) {
-  aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                       filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                     ptrdiff_t dst_stride, const int16_t *filter_x,
-                     int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                     int w, int h) {
-  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                  filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                            uint8_t *dst, ptrdiff_t dst_stride,
-                            const int16_t *filter_x, int x_step_q4,
-                            const int16_t *filter_y, int y_step_q4, int w,
-                            int h) {
-  aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                           uint8_t *dst, ptrdiff_t dst_stride,
-                           const int16_t *filter_x, int x_step_q4,
-                           const int16_t *filter_y, int y_step_q4, int w,
-                           int h) {
-  aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
-                           x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
-                      filter_y, y_step_q4, w, h);
-}
-
 static INLINE int highbd_vert_scalar_product(const uint16_t *a,
                                              ptrdiff_t a_stride,
                                              const int16_t *b) {
@@ -556,7 +291,6 @@
   return sum;
 }
 
-// TODO(afergs): Make sure this works too
 static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const InterpKernel *x_filters, int x0_q4,
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 580cfb1..5e8bb4e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -275,19 +275,6 @@
 add_proto qw/void aom_convolve8_avg/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_avg_horiz/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_avg_vert/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_2d/,                 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_horiz/,              "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_vert/,               "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_2d/,             "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_horiz/,          "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_vert/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-add_proto qw/void aom_convolve8_horiz_scale/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert_scale/,      "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert_scale/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_scale/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_scale/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
 
 specialize qw/aom_convolve_copy       sse2      /;
 specialize qw/aom_convolve_avg        sse2      /;
@@ -297,7 +284,6 @@
 specialize qw/aom_convolve8_avg       sse2 ssse3/;
 specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
 specialize qw/aom_convolve8_avg_vert  sse2 ssse3/;
-specialize qw/aom_scaled_2d                ssse3/;
 
 add_proto qw/void aom_convolve8_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 96ae540..a9a15fd 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -342,563 +342,6 @@
 FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
                         src - src_stride * 3, add_src_, ssse3);
 
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
-                      out2, out3, out4, out5, out6, out7)                 \
-  {                                                                       \
-    const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1);                    \
-    const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3);                    \
-    const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5);                    \
-    const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7);                    \
-                                                                          \
-    const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1);               \
-    const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3);               \
-    const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3);               \
-                                                                          \
-    const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2);               \
-    const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3);               \
-    const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3);               \
-                                                                          \
-    out0 = _mm_unpacklo_epi64(tr2_0, tr2_0);                              \
-    out1 = _mm_unpackhi_epi64(tr2_0, tr2_0);                              \
-    out2 = _mm_unpacklo_epi64(tr2_1, tr2_1);                              \
-    out3 = _mm_unpackhi_epi64(tr2_1, tr2_1);                              \
-    out4 = _mm_unpacklo_epi64(tr2_2, tr2_2);                              \
-    out5 = _mm_unpackhi_epi64(tr2_2, tr2_2);                              \
-    out6 = _mm_unpacklo_epi64(tr2_3, tr2_3);                              \
-    out7 = _mm_unpackhi_epi64(tr2_3, tr2_3);                              \
-  }
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *x_filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
-  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
-  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
-  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
-  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
-  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
-  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
-  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
-  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
-  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
-  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
-  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A, B, C, D, E, F, G, H;
-
-  A = _mm_loadl_epi64((const __m128i *)src);
-  B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
-  C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
-  D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
-  E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
-  F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
-  G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
-  H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
-  TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
-  _mm_storel_epi64((__m128i *)dst, A);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
-  _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
-}
-
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  // This function processes 8x8 areas.  The intermediate height is not always
-  // a multiple of 8, so force it to be a multiple of 8 here.
-  y = h + (8 - (h & 0x7));
-
-  do {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 8) {
-      // process 8 src_x steps
-      for (z = 0; z < 8; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 8; ++i) {
-            temp[z * 8 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 8x8 filters values back to dst
-      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
-    }
-
-    src += src_stride * 8;
-    dst += dst_stride * 8;
-  } while (y -= 8);
-}
-
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  // TRANSPOSE...
-  // 00 01 02 03 04 05 06 07
-  // 10 11 12 13 14 15 16 17
-  // 20 21 22 23 24 25 26 27
-  // 30 31 32 33 34 35 36 37
-  //
-  // TO
-  //
-  // 00 10 20 30
-  // 01 11 21 31
-  // 02 12 22 32
-  // 03 13 23 33
-  // 04 14 24 34
-  // 05 15 25 35
-  // 06 16 26 36
-  // 07 17 27 37
-  //
-  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
-  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
-  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
-  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
-  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
-  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
-  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
-  // 02 03 12 13 22 23 32 33
-  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
-  // 06 07 16 17 26 27 36 37
-  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
-                                uint8_t *dst, ptrdiff_t dst_stride) {
-  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
-  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
-  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
-  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
-  // 00 10 01 11 02 12 03 13
-  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
-  // 20 30 21 31 22 32 23 33
-  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
-  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
-  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
-  B = _mm_srli_si128(A, 4);
-  C = _mm_srli_si128(A, 8);
-  D = _mm_srli_si128(A, 12);
-
-  *(int *)(dst) = _mm_cvtsi128_si32(A);
-  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
-  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
-  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
-}
-
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *x_filters, int x0_q4,
-                                    int x_step_q4, int w, int h) {
-  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
-  int x, y, z;
-  src -= SUBPEL_TAPS / 2 - 1;
-
-  for (y = 0; y < h; y += 4) {
-    int x_q4 = x0_q4;
-    for (x = 0; x < w; x += 4) {
-      // process 4 src_x steps
-      for (z = 0; z < 4; ++z) {
-        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-        if (x_q4 & SUBPEL_MASK) {
-          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
-        } else {
-          int i;
-          for (i = 0; i < 4; ++i) {
-            temp[z * 4 + i] = src_x[i * src_stride + 3];
-          }
-        }
-        x_q4 += x_step_q4;
-      }
-
-      // transpose the 4x4 filters values back to dst
-      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
-    }
-
-    src += src_stride * 4;
-    dst += dst_stride * 4;
-  }
-}
-
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
-  const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
-  const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 4 bytes
-  *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                 uint8_t *dst, const int16_t *filter) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
-  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
-  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
-  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
-  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
-  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
-  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
-  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
-  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
-  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
-  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
-  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
-  // multiply 2 adjacent elements with the filter and add the result
-  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
-  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
-  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
-  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
-  // add and saturate the results together
-  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
-  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
-  __m128i temp = _mm_adds_epi16(x0, x3);
-  temp = _mm_adds_epi16(temp, min_x2x1);
-  temp = _mm_adds_epi16(temp, max_x2x1);
-  // round and shift by 7 bit each 16 bit
-  temp = _mm_mulhrs_epi16(temp, k_256);
-  // shrink to 8 bit each 16 bits
-  temp = _mm_packus_epi16(temp, temp);
-  // save only 8 bytes convolve result
-  _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *y_filters, int y0_q4,
-                                   int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
-                                  uint8_t *dst, const int16_t *filter, int w) {
-  const __m128i k_256 = _mm_set1_epi16(1 << 8);
-  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
-  // pack and duplicate the filter values
-  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
-  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
-  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
-  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
-  int i;
-
-  for (i = 0; i < w; i += 16) {
-    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
-    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
-    const __m128i C =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
-    const __m128i D =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
-    const __m128i E =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
-    const __m128i F =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
-    const __m128i G =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
-    const __m128i H =
-        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
-    // merge the result together
-    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
-    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
-    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
-    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
-    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
-    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
-    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
-    // add and saturate the results together
-    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
-    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
-    // merge the result together
-    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
-    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
-    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
-    // merge the result together
-    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
-    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
-    // multiply 2 adjacent elements with the filter and add the result
-    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
-    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
-    // add and saturate the results together
-    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
-    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
-    // add and saturate the results together
-    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
-    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
-    // round and shift by 7 bit each 16 bit
-    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
-    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
-    // shrink to 8 bit each 16 bits, the first lane contain the first
-    // convolve result and the second lane contain the second convolve
-    // result
-    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
-    src_ptr += 16;
-    // save 16 bytes convolve result
-    _mm_store_si128((__m128i *)&dst[i], temp_hi);
-  }
-}
-
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *y_filters, int y0_q4,
-                                    int y_step_q4, int w, int h) {
-  int y;
-  int y_q4 = y0_q4;
-
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (y = 0; y < h; ++y) {
-    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-    if (y_q4 & SUBPEL_MASK) {
-      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
-                            w);
-    } else {
-      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
-    }
-    y_q4 += y_step_q4;
-  }
-}
-
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  if (w >= 8) {
-    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  } else {
-    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                            x_step_q4, w, intermediate_height);
-  }
-
-  if (w >= 16) {
-    scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-  } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  } else {
-    scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                           MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                           y_step_q4, w, h);
-  }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
-  // NOTE: This assumes that the filter table is 256-byte aligned.
-  // TODO(agrange) Modify to make independent of table alignment.
-  return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
-  return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-                         ptrdiff_t dst_stride, const int16_t *filter_x,
-                         int x_step_q4, const int16_t *filter_y, int y_step_q4,
-                         int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
 // void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
 //                          const int16_t *filter_x, int x_step_q4,
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index f6e770b..103bfb9 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -39,14 +39,9 @@
 struct ConvolveFunctions {
   ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8,
                     ConvolveFunc h8_avg, ConvolveFunc v8, ConvolveFunc v8_avg,
-                    ConvolveFunc hv8, ConvolveFunc hv8_avg, ConvolveFunc sh8,
-                    ConvolveFunc sh8_avg, ConvolveFunc sv8,
-                    ConvolveFunc sv8_avg, ConvolveFunc shv8,
-                    ConvolveFunc shv8_avg, int bd)
+                    ConvolveFunc hv8, ConvolveFunc hv8_avg, int bd)
       : copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
-        v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8),
-        sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg),
-        use_highbd_(bd) {}
+        v8_avg_(v8_avg), hv8_avg_(hv8_avg), use_highbd_(bd) {}
 
   ConvolveFunc copy_;
   ConvolveFunc avg_;
@@ -56,12 +51,6 @@
   ConvolveFunc h8_avg_;
   ConvolveFunc v8_avg_;
   ConvolveFunc hv8_avg_;
-  ConvolveFunc sh8_;       // scaled horiz
-  ConvolveFunc sv8_;       // scaled vert
-  ConvolveFunc shv8_;      // scaled horiz/vert
-  ConvolveFunc sh8_avg_;   // scaled avg horiz
-  ConvolveFunc sv8_avg_;   // scaled avg vert
-  ConvolveFunc shv8_avg_;  // scaled avg horiz/vert
   int use_highbd_;  // 0 if high bitdepth not used, else the actual bit depth.
 };
 
@@ -524,63 +513,6 @@
           << "(" << x << "," << y << ")";
 }
 
-TEST_P(ConvolveTest, CopyHoriz) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
-  ASM_REGISTER_STATE_CHECK(UUT_->sh8_(in, kInputStride, out, kOutputStride,
-                                      filter8, 16, filter8, 16, Width(),
-                                      Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, CopyVert) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
-  ASM_REGISTER_STATE_CHECK(UUT_->sv8_(in, kInputStride, out, kOutputStride,
-                                      filter8, 16, filter8, 16, Width(),
-                                      Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, Copy2D) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  DECLARE_ALIGNED(256, const int16_t,
-                  filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
-  ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
-                                       filter8, 16, filter8, 16, Width(),
-                                       Height()));
-
-  CheckGuardBlocks();
-
-  for (int y = 0; y < Height(); ++y)
-    for (int x = 0; x < Width(); ++x)
-      ASSERT_EQ(lookup(out, y * kOutputStride + x),
-                lookup(in, y * kInputStride + x))
-          << "(" << x << "," << y << ")";
-}
-
 const int kNumFilterBanks = SWITCHABLE_FILTERS;
 const int kNumFilters = 16;
 
@@ -826,37 +758,6 @@
   }
 }
 
-/* This test exercises that enough rows and columns are filtered with every
-   possible initial fractional positions and scaling steps. */
-TEST_P(ConvolveTest, CheckScalingFiltering) {
-  uint8_t *const in = input();
-  uint8_t *const out = output();
-  const InterpKernel *const eighttap =
-      (const InterpKernel *)av1_get_interp_filter_kernel(EIGHTTAP_REGULAR);
-
-  SetConstantInput(127);
-
-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
-                                           eighttap[frac], step, eighttap[frac],
-                                           step, Width(), Height()));
-
-      CheckGuardBlocks();
-
-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(lookup(in, y * kInputStride + x),
-                    lookup(out, y * kOutputStride + x))
-              << "x == " << x << ", y == " << y << ", frac == " << frac
-              << ", step == " << step;
-        }
-      }
-    }
-  }
-}
-
 TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
   const uint8_t *const in = input();
   uint8_t *const out = output();
@@ -1066,24 +967,17 @@
 const ConvolveFunctions convolve8_c(
     wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
     wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
-    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
-    wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
-    wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
-    wrap_convolve8_avg_c_8, 8);
+    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
 const ConvolveFunctions convolve10_c(
     wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
     wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
     wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
-    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
-    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
-    wrap_convolve8_avg_c_10, 10);
+    10);
 const ConvolveFunctions convolve12_c(
     wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
     wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
     wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
-    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
-    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
-    wrap_convolve8_avg_c_12, 12);
+    12);
 const ConvolveParam kArrayConvolve_c[] = {
   ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
 };
@@ -1095,25 +989,16 @@
     wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
     wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
     wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
-    wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
-    wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
-    wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
     wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
 const ConvolveFunctions convolve10_sse2(
     wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
     wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
     wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
-    wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
-    wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
-    wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
     wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
 const ConvolveFunctions convolve12_sse2(
     wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
     wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
     wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
-    wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
-    wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
-    wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
     wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
 const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2),
                                               ALL_SIZES(convolve10_sse2),
@@ -1123,12 +1008,13 @@
 #endif
 
 #if HAVE_SSSE3
-const ConvolveFunctions convolve8_ssse3(
-    aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_ssse3,
-    aom_convolve8_avg_horiz_ssse3, aom_convolve8_vert_ssse3,
-    aom_convolve8_avg_vert_ssse3, aom_convolve8_ssse3, aom_convolve8_avg_ssse3,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_ssse3, aom_scaled_avg_2d_c, 0);
+const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c, aom_convolve_avg_c,
+                                        aom_convolve8_horiz_ssse3,
+                                        aom_convolve8_avg_horiz_ssse3,
+                                        aom_convolve8_vert_ssse3,
+                                        aom_convolve8_avg_vert_ssse3,
+                                        aom_convolve8_ssse3,
+                                        aom_convolve8_avg_ssse3, 0);
 
 const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
 INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
@@ -1140,25 +1026,17 @@
     wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
     wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8,
     wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8,
-    wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8,
-    wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
-    wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+    wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, 8);
 const ConvolveFunctions convolve10_avx2(
     wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
     wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10,
     wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10,
-    wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10,
-    wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
-    wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
-    wrap_convolve8_avg_c_10, 10);
+    wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10, 10);
 const ConvolveFunctions convolve12_avx2(
     wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
     wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12,
     wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12,
-    wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12,
-    wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
-    wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
-    wrap_convolve8_avg_c_12, 12);
+    wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12, 12);
 const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2),
                                                ALL_SIZES_64(convolve10_avx2),
                                                ALL_SIZES_64(convolve12_avx2) };
@@ -1172,16 +1050,12 @@
 const ConvolveFunctions convolve8_neon(
     aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
     aom_convolve8_avg_horiz_neon, aom_convolve8_vert_neon,
-    aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+    aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon, 0);
 #else   // HAVE_NEON
 const ConvolveFunctions convolve8_neon(
     aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
     aom_convolve8_avg_horiz_neon, aom_convolve8_vert_neon,
-    aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+    aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon, 0);
 #endif  // HAVE_NEON_ASM
 
 const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES_64(convolve8_neon) };
@@ -1195,8 +1069,7 @@
     aom_convolve_copy_dspr2, aom_convolve_avg_dspr2, aom_convolve8_horiz_dspr2,
     aom_convolve8_avg_horiz_dspr2, aom_convolve8_vert_dspr2,
     aom_convolve8_avg_vert_dspr2, aom_convolve8_dspr2, aom_convolve8_avg_dspr2,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+    0);
 
 const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES_64(convolve8_dspr2) };
 INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
@@ -1208,9 +1081,7 @@
 const ConvolveFunctions convolve8_msa(
     aom_convolve_copy_msa, aom_convolve_avg_msa, aom_convolve8_horiz_msa,
     aom_convolve8_avg_horiz_msa, aom_convolve8_vert_msa,
-    aom_convolve8_avg_vert_msa, aom_convolve8_msa, aom_convolve8_avg_msa,
-    aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
-    aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+    aom_convolve8_avg_vert_msa, aom_convolve8_msa, aom_convolve8_avg_msa, 0);
 
 const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES_64(convolve8_msa) };
 INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,