Remove unused convolve_add_src functions.

BUG=aomedia:1575

Change-Id: I22535731b0cc341edbfe4e40a9fe7a3218c9f920

commit: 427b18511106fc428b726d7d33d0a1a136c7c409 [log] [tgz]
author: Urvang Joshi <urvang@google.com> Tue Mar 13 17:05:34 2018 -0700
committer: Yaowu Xu <yaowu@google.com> Wed Mar 14 04:49:44 2018 +0000
tree: 3ac781bd2b3e99ada95ae0adf21010193d9919b4
parent: df8ce4f59c423e0d36c23064f4fa83ef2d7141d0 [diff]
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 38942fe..8265100 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c

@@ -203,235 +203,6 @@
   return sum;
 }
 
-static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const InterpKernel *x_filters, int x0_q4,
-                                   int x_step_q4, int w, int h) {
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-
-      const int sum = horz_scalar_product(src_x, x_filter);
-      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                          src_x[SUBPEL_TAPS / 2 - 1]);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const InterpKernel *y_filters, int y0_q4,
-                                  int y_step_q4, int w, int h) {
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int sum = vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const InterpKernel *const x_filters, int x0_q4,
-                             int x_step_q4, const InterpKernel *const y_filters,
-                             int y0_q4, int y_step_q4, int w, int h) {
-  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
-                         temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                         intermediate_height);
-  convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
-                        dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
-                                   uint8_t *dst, ptrdiff_t dst_stride,
-                                   const int16_t *filter_x, int x_step_q4,
-                                   const int16_t *filter_y, int y_step_q4,
-                                   int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                         x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
-                                  uint8_t *dst, ptrdiff_t dst_stride,
-                                  const int16_t *filter_x, int x_step_q4,
-                                  const int16_t *filter_y, int y_step_q4, int w,
-                                  int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                        y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                             uint8_t *dst, ptrdiff_t dst_stride,
-                             const int16_t *filter_x, int x_step_q4,
-                             const int16_t *filter_y, int y_step_q4, int w,
-                             int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const InterpKernel *x_filters, int x0_q4,
-                                       int x_step_q4, int w, int h) {
-  const int bd = 8;
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                           (1 << (bd + FILTER_BITS - 1));
-      const int sum = horz_scalar_product(src_x, x_filter) + rounding;
-      dst[x] =
-          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
-                          0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const InterpKernel *y_filters, int y0_q4,
-                                      int y_step_q4, int w, int h) {
-  const int bd = 8;
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int rounding =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
-      const int sum =
-          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
-      dst[y * dst_stride] =
-          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const InterpKernel *const x_filters, int x0_q4,
-                                 int x_step_q4,
-                                 const InterpKernel *const y_filters, int y0_q4,
-                                 int y_step_q4, int w, int h) {
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                             src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
-                             x_step_q4, w, intermediate_height);
-  convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                            MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
-                            y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint16_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  (void)filter_y;
-  (void)y_step_q4;
-
-  convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                             x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
-                                      uint8_t *dst, ptrdiff_t dst_stride,
-                                      const int16_t *filter_x, int x_step_q4,
-                                      const int16_t *filter_y, int y_step_q4,
-                                      int w, int h) {
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  (void)filter_x;
-  (void)x_step_q4;
-
-  convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
-                            y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
-                                 uint8_t *dst, ptrdiff_t dst_stride,
-                                 const int16_t *filter_x, int x_step_q4,
-                                 const int16_t *filter_y, int y_step_q4, int w,
-                                 int h) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                       x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
 static INLINE int highbd_horz_scalar_product(const uint16_t *a,
                                              const int16_t *b) {
   int sum = 0;
@@ -601,207 +372,3 @@
     dst += dst_stride;
   }
 }
-
-static void highbd_convolve_add_src_horiz(const uint8_t *src8,
-                                          ptrdiff_t src_stride, uint8_t *dst8,
-                                          ptrdiff_t dst_stride,
-                                          const InterpKernel *x_filters,
-                                          int x0_q4, int x_step_q4, int w,
-                                          int h, int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int sum = highbd_horz_scalar_product(src_x, x_filter);
-      dst[x] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
-          bd);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_add_src_vert(const uint8_t *src8,
-                                         ptrdiff_t src_stride, uint8_t *dst8,
-                                         ptrdiff_t dst_stride,
-                                         const InterpKernel *y_filters,
-                                         int y0_q4, int y_step_q4, int w, int h,
-                                         int bd) {
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
-      dst[y * dst_stride] =
-          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
-                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
-                            bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const InterpKernel *const x_filters,
-                                    int x0_q4, int x_step_q4,
-                                    const InterpKernel *const y_filters,
-                                    int y0_q4, int y_step_q4, int w, int h,
-                                    int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                                src_stride, CONVERT_TO_BYTEPTR(temp),
-                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
-                                intermediate_height, bd);
-  highbd_convolve_add_src_vert(
-      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
-                          x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-static void highbd_convolve_add_src_horiz_hip(
-    const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
-    ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
-    int x_step_q4, int w, int h, int bd) {
-  const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
-  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  src -= SUBPEL_TAPS / 2 - 1;
-  for (int y = 0; y < h; ++y) {
-    int x_q4 = x0_q4;
-    for (int x = 0; x < w; ++x) {
-      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
-      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-      const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
-                           (1 << (bd + FILTER_BITS - 1));
-      const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
-      dst[x] =
-          (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
-                          0, extraprec_clamp_limit - 1);
-      x_q4 += x_step_q4;
-    }
-    src += src_stride;
-    dst += dst_stride;
-  }
-}
-
-static void highbd_convolve_add_src_vert_hip(
-    const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
-    int y_step_q4, int w, int h, int bd) {
-  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
-  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-  for (int x = 0; x < w; ++x) {
-    int y_q4 = y0_q4;
-    for (int y = 0; y < h; ++y) {
-      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
-      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-      const int rounding =
-          ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
-          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
-      const int sum =
-          highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
-      dst[y * dst_stride] = clip_pixel_highbd(
-          ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
-      y_q4 += y_step_q4;
-    }
-    ++src;
-    ++dst;
-  }
-}
-
-static void highbd_convolve_add_src_hip(
-    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
-    ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
-    int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
-    int y_step_q4, int w, int h, int bd) {
-  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
-  // 2d filtering proceeds in 2 steps:
-  //   (1) Interpolate horizontally into an intermediate buffer, temp.
-  //   (2) Interpolate temp vertically to derive the sub-pixel result.
-  // Deriving the maximum number of rows in the temp buffer (135):
-  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
-  // --Largest block size is 64x64 pixels.
-  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
-  //   original frame (in 1/16th pixel units).
-  // --Must round-up because block may be located at sub-pixel position.
-  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
-  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
-  const int intermediate_height =
-      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
-  assert(w <= MAX_SB_SIZE);
-  assert(h <= MAX_SB_SIZE);
-  assert(y_step_q4 <= 32);
-  assert(x_step_q4 <= 32);
-
-  highbd_convolve_add_src_horiz_hip(
-      src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
-      x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
-  highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
-                                   MAX_SB_SIZE, dst, dst_stride, y_filters,
-                                   y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
-                                        ptrdiff_t src_stride, uint8_t *dst,
-                                        ptrdiff_t dst_stride,
-                                        const int16_t *filter_x, int x_step_q4,
-                                        const int16_t *filter_y, int y_step_q4,
-                                        int w, int h, int bd) {
-  const InterpKernel *const filters_x = get_filter_base(filter_x);
-  const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
-  const InterpKernel *const filters_y = get_filter_base(filter_y);
-  const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
-  highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
-                              x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
-                              h, bd);
-}

diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 222f4b9..89fa1bf 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake

@@ -358,22 +358,6 @@
   endif ()
 endif ()
 
-set(AOM_DSP_COMMON_INTRIN_SSE2
-    ${AOM_DSP_COMMON_INTRIN_SSE2}
-    "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c")
-
-set(AOM_DSP_COMMON_INTRIN_AVX2
-    ${AOM_DSP_COMMON_INTRIN_AVX2}
-    "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_avx2.c")
-
-set(AOM_DSP_COMMON_INTRIN_SSSE3
-   ${AOM_DSP_COMMON_INTRIN_SSSE3}
-   "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
-
-set(AOM_DSP_COMMON_INTRIN_AVX2
-    ${AOM_DSP_COMMON_INTRIN_AVX2}
-    "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c")
-
 # Creates aom_dsp build targets. Must not be called until after libaom target
 # has been created.
 function (setup_aom_dsp_targets)

diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6f274c6..a87b8cb 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl

@@ -279,19 +279,6 @@
 specialize qw/aom_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
 specialize qw/aom_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
 
-add_proto qw/void aom_convolve8_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_vert/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_hip/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_vert_hip/,  "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-specialize qw/aom_convolve8_add_src ssse3/;
-specialize qw/aom_convolve8_add_src_horiz ssse3/;
-specialize qw/aom_convolve8_add_src_vert ssse3/;
-specialize qw/aom_convolve8_add_src_hip sse2/;
-specialize qw/aom_convolve8_add_src_hip avx2/;
-
 # TODO(any): These need to be extended to up to 128x128 block sizes
 if (!(aom_config("CONFIG_AV1") eq "yes")) {
   specialize qw/aom_convolve_copy       neon dspr2 msa/;
@@ -316,12 +303,6 @@
 add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
 specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
 
-add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-
-specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
-specialize qw/aom_highbd_convolve8_add_src_hip ssse3 avx2/;
-
 #
 # Loopfilter
 #

diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 2c51cb9..ab25d9c 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c

@@ -91,21 +91,5 @@
 //                                int w, int h, int bd);
 HIGH_FUN_CONV_2D(, sse2);
 
-// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
-// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
-void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                       uint8_t *dst, ptrdiff_t dst_stride,
-                                       const int16_t *filter_x, int x_step_q4,
-                                       const int16_t *filter_y, int y_step_q4,
-                                       int w, int h, int bd) {
-  assert(x_step_q4 == 16);
-  assert(y_step_q4 == 16);
-  ((int16_t *)filter_x)[3] += 128;
-  ((int16_t *)filter_y)[3] += 128;
-  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
-                            x_step_q4, filter_y, y_step_q4, w, h, bd);
-  ((int16_t *)filter_x)[3] -= 128;
-  ((int16_t *)filter_y)[3] -= 128;
-}
 #endif  // ARCH_X86_64
 #endif  // HAVE_SSE2

diff --git a/aom_dsp/x86/aom_convolve_hip_avx2.c b/aom_dsp/x86/aom_convolve_hip_avx2.c
deleted file mode 100644
index b122341..0000000
--- a/aom_dsp/x86/aom_convolve_hip_avx2.c
+++ /dev/null

@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
-// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
-// on the left.
-// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
-// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
-void aom_convolve8_add_src_hip_avx2(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h) {  // 8-bit two-pass (horizontal, then vertical) 8-tap filter; the "add_src" term is folded into tap 3
-  const int bd = 8;  // bit depth used in the rounding and clamp constants below
-  assert(x_step_q4 == 16 && y_step_q4 == 16);  // only a step of 16 is accepted
-  assert(!(w & 7));  // width must be a multiple of 8
-  (void)x_step_q4;  // the steps are referenced only by the assert above
-  (void)y_step_q4;
-
-  DECLARE_ALIGNED(32, uint16_t,
-                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);  // horizontal-pass output, indexed with a row stride of MAX_SB_SIZE
-  int intermediate_height = h + SUBPEL_TAPS - 1;  // the vertical pass reads SUBPEL_TAPS rows per output row
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;  // back up center_tap rows and columns so the taps straddle the output pixel
-
-  const __m128i zero_128 = _mm_setzero_si128();
-  const __m256i zero_256 = _mm256_setzero_si256();
-
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);  // only 16-bit lane 3 is non-zero; it is added to tap 3 of each kernel
-
-  const __m256i clamp_low = zero_256;
-  const __m256i clamp_high = _mm256_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1);
-
-  /* Horizontal filter */
-  {
-    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
-    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
-
-    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
-    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
-    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
-    const __m256i round_const =  // added to every 32-bit sum before the right shift below
-        _mm256_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +  // half of the divisor 1 << (FILTER_BITS - EXTRAPREC_BITS)
-                          (1 << (bd + FILTER_BITS - 1)));  // extra bias; presumably keeps intermediates non-negative before the clamp — TODO confirm
-
-    for (int i = 0; i < intermediate_height; ++i) {
-      for (int j = 0; j < w; j += 16) {  // NOTE(review): steps by 16 although the assert only requires w % 8 == 0 — confirm w % 16 == 8 is safe
-        const uint8_t *data_ij = src_ptr + i * src_stride + j;
-
-        // Load 8-bit src data
-        const __m128i data_0 = xx_loadu_128(data_ij + 0);  // eight overlapping loads, each one byte further along the row
-        const __m128i data_1 = xx_loadu_128(data_ij + 1);
-        const __m128i data_2 = xx_loadu_128(data_ij + 2);
-        const __m128i data_3 = xx_loadu_128(data_ij + 3);
-        const __m128i data_4 = xx_loadu_128(data_ij + 4);
-        const __m128i data_5 = xx_loadu_128(data_ij + 5);
-        const __m128i data_6 = xx_loadu_128(data_ij + 6);
-        const __m128i data_7 = xx_loadu_128(data_ij + 7);
-
-        // (Zero-)Extend 8-bit data to 16-bit data
-        const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
-        const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
-        const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
-        const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
-        const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
-        const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
-        const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
-        const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
-
-        // Multiply src data by filter coeffs and sum pairs
-        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
-        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
-        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
-        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
-        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
-        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
-        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
-        // Calculate scalar product for even- and odd-indices separately,
-        // increasing to 32-bit precision
-        const __m256i res_even_sum = _mm256_add_epi32(
-            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
-        const __m256i res_odd_sum = _mm256_add_epi32(
-            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
-
-        const __m256i res_even =
-            _mm256_srai_epi32(_mm256_add_epi32(res_even_sum, round_const),
-                              FILTER_BITS - EXTRAPREC_BITS);  // shift is reduced by EXTRAPREC_BITS, so that many extra fractional bits are kept
-        const __m256i res_odd =
-            _mm256_srai_epi32(_mm256_add_epi32(res_odd_sum, round_const),
-                              FILTER_BITS - EXTRAPREC_BITS);
-
-        // Reduce to 16-bit precision and pack even- and odd-index results
-        // back into one register. The _mm256_packs_epi32 intrinsic returns
-        // a register with the pixels ordered as follows:
-        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
-        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
-        const __m256i res_clamped =
-            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);  // clamp to [0, EXTRAPREC_CLAMP_LIMIT(bd) - 1]
-
-        // Store in a temporary array
-        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);  // kept in the permuted order noted above; the vertical pass undoes it
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
-    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
-
-    // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
-    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
-    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
-    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
-    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
-    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
-    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
-    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
-    // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
-    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
-    // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
-    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
-    // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
-    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
-    // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
-    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
-    const __m256i round_const =  // added to every 32-bit sum before the final right shift
-        _mm256_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -  // half of the divisor 1 << (FILTER_BITS + EXTRAPREC_BITS)
-                          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));  // presumably cancels the bias added in the horizontal pass — TODO confirm
-
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 16) {  // 16 columns per iteration, matching the horizontal pass
-        const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
-
-        // Load 16-bit data from the output of the horizontal filter in
-        // which the pixels are ordered as follows:
-        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
-        const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);  // eight consecutive rows of temp, one per vertical tap
-        const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
-        const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
-        const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
-        const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
-        const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
-        const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
-        const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
-
-        // Filter the even-indices, increasing to 32-bit precision
-        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);  // interleave two rows so madd pairs each row with its own tap
-        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
-        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
-        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
-
-        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
-        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
-        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
-        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-
-        const __m256i res_even = _mm256_add_epi32(
-            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
-        // Filter the odd-indices, increasing to 32-bit precision
-        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
-        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
-        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
-        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
-
-        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
-        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
-        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
-        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
-        const __m256i res_odd = _mm256_add_epi32(
-            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
-        // Pixels are currently in the following order:
-        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
-        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
-        //
-        // Rearrange the pixels into the following order:
-        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
-        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
-        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
-        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
-        const __m256i res_lo_round =
-            _mm256_srai_epi32(_mm256_add_epi32(res_lo, round_const),
-                              FILTER_BITS + EXTRAPREC_BITS);  // also removes the EXTRAPREC_BITS kept by the horizontal pass
-        const __m256i res_hi_round =
-            _mm256_srai_epi32(_mm256_add_epi32(res_hi, round_const),
-                              FILTER_BITS + EXTRAPREC_BITS);
-
-        // Reduce to 16-bit precision and pack into the correct order:
-        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
-        const __m256i res_16bit =
-            _mm256_packs_epi32(res_lo_round, res_hi_round);
-
-        // Reduce to 8-bit precision. This messes up the order:
-        // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
-        // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
-        const __m256i res_8bit =
-            _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);  // unsigned saturation clamps each result to [0, 255]
-
-        // Swap the two central 64-bit values to get the order:
-        // [ - - - - - - - - - - - - - - - - ]
-        // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
-        const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);  // 0xd8 selects qwords 0, 2, 1, 3
-
-        // Store the lower 128-bit lane in the dst array
-        xx_storeu_128(dst + i * dst_stride + j,
-                      _mm256_castsi256_si128(res_8bit2));
-      }
-    }
-  }
-}

diff --git a/aom_dsp/x86/aom_convolve_hip_sse2.c b/aom_dsp/x86/aom_convolve_hip_sse2.c
deleted file mode 100644
index f666a0e..0000000
--- a/aom_dsp/x86/aom_convolve_hip_sse2.c
+++ /dev/null

@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
-                                    uint8_t *dst, ptrdiff_t dst_stride,
-                                    const int16_t *filter_x, int x_step_q4,
-                                    const int16_t *filter_y, int y_step_q4,
-                                    int w, int h) {  // 8-bit two-pass (horizontal, then vertical) 8-tap filter; the "add_src" term is folded into tap 3
-  const int bd = 8;  // bit depth used in the rounding and clamp constants below
-  assert(x_step_q4 == 16 && y_step_q4 == 16);  // only a step of 16 is accepted
-  assert(!(w & 7));  // width must be a multiple of 8, the number of columns handled per iteration
-  (void)x_step_q4;  // the steps are referenced only by the assert above
-  (void)y_step_q4;
-
-  DECLARE_ALIGNED(16, uint16_t,
-                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);  // horizontal-pass output, indexed with a row stride of MAX_SB_SIZE
-  int intermediate_height = h + SUBPEL_TAPS - 1;  // the vertical pass reads SUBPEL_TAPS rows per output row
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;  // back up center_tap rows and columns so the taps straddle the output pixel
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);  // only 16-bit lane 3 is non-zero; it is added to tap 3 of each kernel
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =  // added to every 32-bit sum before the right shift below
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +  // half of the divisor 1 << (FILTER_BITS - EXTRAPREC_BITS)
-                       (1 << (bd + FILTER_BITS - 1)));  // extra bias; presumably keeps intermediates non-negative before the clamp — TODO confirm
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);  // one 16-byte load supplies every tap for 8 output columns
-
-        // Filter even-index pixels
-        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);  // zero-extend 8 bytes to 16 bits; the byte shifts below pick the later taps
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);  // shift is reduced by EXTRAPREC_BITS, so that many extra fractional bits are kept
-
-        // Filter odd-index pixels
-        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero),
-                            _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));  // clamp to [0, EXTRAPREC_CLAMP_LIMIT(bd) - 1]
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);  // kept in the permuted column order; the vertical pass undoes it
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =  // added to every 32-bit sum before the final right shift
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -  // half of the divisor 1 << (FILTER_BITS + EXTRAPREC_BITS)
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));  // presumably cancels the bias added in the horizontal pass — TODO confirm
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];  // rows are read by direct __m128i dereference; presumably relies on 16-byte alignment of each row segment — confirm
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));  // interleave two rows so madd pairs each row with its own tap
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);  // also removes the EXTRAPREC_BITS kept by the horizontal pass
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
-        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);  // unsigned saturation clamps each result to [0, 255]
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storel_epi64(p, res_8bit);  // writes only the low 8 bytes, i.e. the 8 pixels of this iteration
-      }
-    }
-  }
-}

diff --git a/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c b/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c
deleted file mode 100644
index 7a78d7b..0000000
--- a/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c
+++ /dev/null

@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
-// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
-// on the left.
-// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
-// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
-void aom_highbd_convolve8_add_src_hip_avx2(
-    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {  // high-bitdepth two-pass (horizontal, then vertical) 8-tap filter; the "add_src" term is folded into tap 3
-  assert(x_step_q4 == 16 && y_step_q4 == 16);  // only a step of 16 is accepted
-  assert(!(w & 7));  // width must be a multiple of 8
-  (void)x_step_q4;  // the steps are referenced only by the assert above
-  (void)y_step_q4;
-
-  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);  // pixels are accessed as uint16_t from here on; macro defined elsewhere
-  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
-
-  DECLARE_ALIGNED(32, uint16_t,
-                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);  // horizontal-pass output, indexed with a row stride of MAX_SB_SIZE
-  int intermediate_height = h + SUBPEL_TAPS - 1;  // the vertical pass reads SUBPEL_TAPS rows per output row
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;  // back up center_tap rows and columns so the taps straddle the output pixel
-
-  const __m128i zero_128 = _mm_setzero_si128();
-  const __m256i zero_256 = _mm256_setzero_si256();
-
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);  // only 16-bit lane 3 is non-zero; it is added to tap 3 of each kernel
-
-  const __m256i clamp_low = zero_256;
-
-  /* Horizontal filter */
-  {
-    const __m256i clamp_high_ep =
-        _mm256_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1);  // upper bound for the extra-precision intermediate values
-
-    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
-    const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
-
-    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
-    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
-    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
-    const __m256i round_const =  // added to every 32-bit sum before the right shift below
-        _mm256_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +  // half of the divisor 1 << (FILTER_BITS - EXTRAPREC_BITS)
-                          (1 << (bd + FILTER_BITS - 1)));  // extra bias; presumably keeps intermediates non-negative before the clamp — TODO confirm
-
-    for (int i = 0; i < intermediate_height; ++i) {
-      for (int j = 0; j < w; j += 16) {  // NOTE(review): steps by 16 although the assert only requires w % 8 == 0 — confirm w % 16 == 8 is safe
-        const uint16_t *src_ij = src_ptr + i * src_stride + j;
-
-        // Load 16-bit src data
-        const __m256i src_0 = yy_loadu_256(src_ij + 0);  // eight overlapping loads, each one pixel further along the row
-        const __m256i src_1 = yy_loadu_256(src_ij + 1);
-        const __m256i src_2 = yy_loadu_256(src_ij + 2);
-        const __m256i src_3 = yy_loadu_256(src_ij + 3);
-        const __m256i src_4 = yy_loadu_256(src_ij + 4);
-        const __m256i src_5 = yy_loadu_256(src_ij + 5);
-        const __m256i src_6 = yy_loadu_256(src_ij + 6);
-        const __m256i src_7 = yy_loadu_256(src_ij + 7);
-
-        // Multiply src data by filter coeffs and sum pairs
-        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
-        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
-        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
-        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
-        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
-        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
-        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
-        // Calculate scalar product for even- and odd-indices separately,
-        // increasing to 32-bit precision
-        const __m256i res_even_sum = _mm256_add_epi32(
-            _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
-        const __m256i res_even =
-            _mm256_srai_epi32(_mm256_add_epi32(res_even_sum, round_const),
-                              FILTER_BITS - EXTRAPREC_BITS);  // shift is reduced by EXTRAPREC_BITS, so that many extra fractional bits are kept
-
-        const __m256i res_odd_sum = _mm256_add_epi32(
-            _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
-        const __m256i res_odd =
-            _mm256_srai_epi32(_mm256_add_epi32(res_odd_sum, round_const),
-                              FILTER_BITS - EXTRAPREC_BITS);
-
-        // Reduce to 16-bit precision and pack even- and odd-index results
-        // back into one register. The _mm256_packs_epi32 intrinsic returns
-        // a register with the pixels ordered as follows:
-        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
-        const __m256i res = _mm256_packs_epi32(res_even, res_odd);
-        const __m256i res_clamped =
-            _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);  // clamp to [0, EXTRAPREC_CLAMP_LIMIT(bd) - 1]
-
-        // Store in a temporary array
-        yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);  // kept in the permuted order noted above; the vertical pass undoes it
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);  // largest pixel value representable at bit depth bd
-
-    // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
-    const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);  // in this block f0..f7 denote the taps of filter_y
-
-    // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
-    const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
-    const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
-    // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
-    const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
-    // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
-    const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
-    // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
-    const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
-    // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
-    const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
-    const __m256i round_const =  // added to every 32-bit sum before the final right shift
-        _mm256_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -  // half of the divisor 1 << (FILTER_BITS + EXTRAPREC_BITS)
-                          (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));  // presumably cancels the bias added in the horizontal pass — TODO confirm
-
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; j += 16) {  // 16 columns per iteration, matching the horizontal pass
-        const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
-
-        // Load 16-bit data from the output of the horizontal filter in
-        // which the pixels are ordered as follows:
-        // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
-        const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);  // eight consecutive rows of temp, one per vertical tap
-        const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
-        const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
-        const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
-        const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
-        const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
-        const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
-        const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
-
-        // Filter the even-indices, increasing to 32-bit precision
-        const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);  // interleave two rows so madd pairs each row with its own tap
-        const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
-        const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
-        const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
-
-        const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
-        const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
-        const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
-        const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-
-        const __m256i res_even = _mm256_add_epi32(
-            _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
-        // Filter the odd-indices, increasing to 32-bit precision
-        const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
-        const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
-        const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
-        const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
-
-        const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
-        const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
-        const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
-        const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
-        const __m256i res_odd = _mm256_add_epi32(
-            _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
-        // Pixels are currently in the following order:
-        // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
-        // res_odd order:  [ 15 13 11 9 ] [ 7 5 3 1 ]
-        //
-        // Rearrange the pixels into the following order:
-        // res_lo order: [ 11 10  9  8 ] [ 3 2 1 0 ]
-        // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
-        const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
-        const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
-        const __m256i res_lo_round =
-            _mm256_srai_epi32(_mm256_add_epi32(res_lo, round_const),
-                              FILTER_BITS + EXTRAPREC_BITS);  // also removes the EXTRAPREC_BITS kept by the horizontal pass
-        const __m256i res_hi_round =
-            _mm256_srai_epi32(_mm256_add_epi32(res_hi, round_const),
-                              FILTER_BITS + EXTRAPREC_BITS);
-
-        // Reduce to 16-bit precision and pack into the correct order:
-        // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
-        const __m256i res_16bit =
-            _mm256_packs_epi32(res_lo_round, res_hi_round);
-        const __m256i res_16bit_clamped = _mm256_min_epi16(
-            _mm256_max_epi16(res_16bit, clamp_low), clamp_high);  // clamp to the valid pixel range [0, (1 << bd) - 1]
-
-        // Store in the dst array
-        yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
-      }
-    }
-  }
-}

diff --git a/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
deleted file mode 100644
index 5bf6743..0000000
--- a/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
+++ /dev/null

@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-void aom_highbd_convolve8_add_src_hip_ssse3(
-    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
-    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
-    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
-  assert(x_step_q4 == 16 && y_step_q4 == 16);
-  assert(!(w & 7));
-  (void)x_step_q4;
-  (void)y_step_q4;
-
-  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
-  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
-
-  DECLARE_ALIGNED(16, uint16_t,
-                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
-  int intermediate_height = h + SUBPEL_TAPS - 1;
-  int i, j;
-  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
-  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
-  const __m128i zero = _mm_setzero_si128();
-  // Add an offset to account for the "add_src" part of the convolve function.
-  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
-  /* Horizontal filter */
-  {
-    const __m128i coeffs_x =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
-                       (1 << (bd + FILTER_BITS - 1)));
-
-    for (i = 0; i < intermediate_height; ++i) {
-      for (j = 0; j < w; j += 8) {
-        const __m128i data =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-        const __m128i data2 =
-            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
-        // Filter even-index pixels
-        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
-        const __m128i res_2 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
-        const __m128i res_4 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
-        const __m128i res_6 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
-        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
-                                  FILTER_BITS - EXTRAPREC_BITS);
-
-        // Filter odd-index pixels
-        const __m128i res_1 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
-        const __m128i res_3 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
-        const __m128i res_5 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
-        const __m128i res_7 =
-            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
-        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
-                                 FILTER_BITS - EXTRAPREC_BITS);
-
-        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
-        const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1);
-        __m128i res = _mm_packs_epi32(res_even, res_odd);
-        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
-        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
-      }
-    }
-  }
-
-  /* Vertical filter */
-  {
-    const __m128i coeffs_y =
-        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
-    const __m128i round_const =
-        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
-                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
-    for (i = 0; i < h; ++i) {
-      for (j = 0; j < w; j += 8) {
-        // Filter even-index pixels
-        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
-        const __m128i src_0 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_2 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_4 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_6 =
-            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
-                                               _mm_add_epi32(res_4, res_6));
-
-        // Filter odd-index pixels
-        const __m128i src_1 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
-        const __m128i src_3 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
-        const __m128i src_5 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
-        const __m128i src_7 =
-            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
-                               *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
-
-        // Rearrange pixels back into the order 0 ... 7
-        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
-        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
-        const __m128i res_lo_round = _mm_srai_epi32(
-            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
-        const __m128i res_hi_round = _mm_srai_epi32(
-            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
-        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
-        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
-
-        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
-        _mm_storeu_si128(p, res_16bit);
-      }
-    }
-  }
-}

diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 7dd1f1d..4ea9f05 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c

@@ -285,12 +285,6 @@
 filter8_1dfunction aom_filter_block1d8_h8_ssse3;
 filter8_1dfunction aom_filter_block1d4_v8_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
 
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -312,15 +306,9 @@
 FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
 FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
 
-FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
-                        ssse3);
-FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
-                        src - src_stride * 3, add_src_, ssse3);
-
 // void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
 //                          const int16_t *filter_x, int x_step_q4,
 //                          const int16_t *filter_y, int y_step_q4,
 //                          int w, int h);
 FUN_CONV_2D(, ssse3);
-FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);

diff --git a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 6dc2310..3ca7921 100644
--- a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm

@@ -378,10 +378,6 @@
 SUBPIX_HFILTER8  h8
 SUBPIX_HFILTER4  h8
 
-SUBPIX_HFILTER16 h8_add_src
-SUBPIX_HFILTER8  h8_add_src
-SUBPIX_HFILTER4  h8_add_src
-
 ;-------------------------------------------------------------------------------
 
 ; TODO(Linfeng): Detect cpu type and choose the code with better performance.
@@ -872,9 +868,3 @@
 SUBPIX_VFILTER16     v8
 SUBPIX_VFILTER       v8, 8
 SUBPIX_VFILTER       v8, 4
-
-%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON)
-SUBPIX_VFILTER16 v8_add_src
-SUBPIX_VFILTER   v8_add_src, 8
-SUBPIX_VFILTER   v8_add_src, 4
-%endif

diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index f6812e8..9200e40 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h

@@ -115,68 +115,6 @@
     }                                                                        \
   }
 
-// convolve_add_src is only used by the Wiener filter, which will never
-// end up calling the bilinear functions (it uses a symmetric filter, so
-// the possible numbers of taps are 1,3,5,7)
-#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
-                                opt)                                        \
-  void aom_convolve8_##name##_##opt(                                        \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    (void)filter_x;                                                         \
-    (void)x_step_q4;                                                        \
-    (void)filter_y;                                                         \
-    (void)y_step_q4;                                                        \
-    assert((-128 <= filter[3]) && (filter[3] <= 127));                      \
-    assert(step_q4 == 16);                                                  \
-    while (w >= 16) {                                                       \
-      aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,  \
-                                               dst_stride, h, filter);      \
-      src += 16;                                                            \
-      dst += 16;                                                            \
-      w -= 16;                                                              \
-    }                                                                       \
-    while (w >= 8) {                                                        \
-      aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 8;                                                             \
-      dst += 8;                                                             \
-      w -= 8;                                                               \
-    }                                                                       \
-    while (w >= 4) {                                                        \
-      aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,   \
-                                              dst_stride, h, filter);       \
-      src += 4;                                                             \
-      dst += 4;                                                             \
-      w -= 4;                                                               \
-    }                                                                       \
-    if (w) {                                                                \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,  \
-                               x_step_q4, filter_y, y_step_q4, w, h);       \
-    }                                                                       \
-  }
-
-#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt)                           \
-  void aom_convolve8_##type##opt(                                           \
-      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
-      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
-      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);  \
-    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                  \
-    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                  \
-    assert(w <= MAX_SB_SIZE);                                               \
-    assert(h <= MAX_SB_SIZE);                                               \
-    assert(x_step_q4 == 16);                                                \
-    assert(y_step_q4 == 16);                                                \
-    aom_convolve8_##htype##horiz_##opt(                                     \
-        src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x,    \
-        x_step_q4, filter_y, y_step_q4, w, h + 7);                          \
-    aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
-                                     dst, dst_stride, filter_x, x_step_q4,  \
-                                     filter_y, y_step_q4, w, h);            \
-  }
-
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
commit	427b18511106fc428b726d7d33d0a1a136c7c409	[log] [tgz]
author	Urvang Joshi <urvang@google.com>	Tue Mar 13 17:05:34 2018 -0700
committer	Yaowu Xu <yaowu@google.com>	Wed Mar 14 04:49:44 2018 +0000
tree	3ac781bd2b3e99ada95ae0adf21010193d9919b4
parent	df8ce4f59c423e0d36c23064f4fa83ef2d7141d0 [diff]