Remove unused convolve_add_src functions.
BUG=aomedia:1575
Change-Id: I22535731b0cc341edbfe4e40a9fe7a3218c9f920
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 38942fe..8265100 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -203,235 +203,6 @@
return sum;
}
-static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (int x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
-
- const int sum = horz_scalar_product(src_x, x_filter);
- dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
- src_x[SUBPEL_TAPS / 2 - 1]);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (int x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (int y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- const int sum = vert_scalar_product(src_y, src_stride, y_filter);
- dst[y * dst_stride] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
- src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4, const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
-
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
- temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
- intermediate_height);
- convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
- dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
- y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
-static void convolve_add_src_horiz_hip(const uint8_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
- const int bd = 8;
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (int x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
- (1 << (bd + FILTER_BITS - 1));
- const int sum = horz_scalar_product(src_x, x_filter) + rounding;
- dst[x] =
- (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
- 0, EXTRAPREC_CLAMP_LIMIT(bd) - 1);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void convolve_add_src_vert_hip(const uint16_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- const int bd = 8;
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (int x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (int y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- const int rounding =
- ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
- (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
- const int sum =
- highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
- dst[y * dst_stride] =
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS));
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void convolve_add_src_hip(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4,
- const InterpKernel *const y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
-
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- convolve_add_src_horiz_hip(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
- x_step_q4, w, intermediate_height);
- convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
- MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
- y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_horiz_hip_c(const uint8_t *src, ptrdiff_t src_stride,
- uint16_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- (void)filter_y;
- (void)y_step_q4;
-
- convolve_add_src_horiz_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_vert_hip_c(const uint16_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- (void)filter_x;
- (void)x_step_q4;
-
- convolve_add_src_vert_hip(src, src_stride, dst, dst_stride, filters_y, y0_q4,
- y_step_q4, w, h);
-}
-
-void aom_convolve8_add_src_hip_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
static INLINE int highbd_horz_scalar_product(const uint16_t *a,
const int16_t *b) {
int sum = 0;
@@ -601,207 +372,3 @@
dst += dst_stride;
}
}
-
-static void highbd_convolve_add_src_horiz(const uint8_t *src8,
- ptrdiff_t src_stride, uint8_t *dst8,
- ptrdiff_t dst_stride,
- const InterpKernel *x_filters,
- int x0_q4, int x_step_q4, int w,
- int h, int bd) {
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (int x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- const int sum = highbd_horz_scalar_product(src_x, x_filter);
- dst[x] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
- bd);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_add_src_vert(const uint8_t *src8,
- ptrdiff_t src_stride, uint8_t *dst8,
- ptrdiff_t dst_stride,
- const InterpKernel *y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (int x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (int y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- const int sum = highbd_vert_scalar_product(src_y, src_stride, y_filter);
- dst[y * dst_stride] =
- clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
- src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
- bd);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters,
- int x0_q4, int x_step_q4,
- const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4, int w, int h,
- int bd) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, CONVERT_TO_BYTEPTR(temp),
- MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
- intermediate_height, bd);
- highbd_convolve_add_src_vert(
- CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
- MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
-}
-
-static void highbd_convolve_add_src_horiz_hip(
- const uint8_t *src8, ptrdiff_t src_stride, uint16_t *dst,
- ptrdiff_t dst_stride, const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h, int bd) {
- const int extraprec_clamp_limit = EXTRAPREC_CLAMP_LIMIT(bd);
- uint16_t *src = CONVERT_TO_SHORTPTR(src8);
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_q4 = x0_q4;
- for (int x = 0; x < w; ++x) {
- const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- const int rounding = ((int)src_x[SUBPEL_TAPS / 2 - 1] << FILTER_BITS) +
- (1 << (bd + FILTER_BITS - 1));
- const int sum = highbd_horz_scalar_product(src_x, x_filter) + rounding;
- dst[x] =
- (uint16_t)clamp(ROUND_POWER_OF_TWO(sum, FILTER_BITS - EXTRAPREC_BITS),
- 0, extraprec_clamp_limit - 1);
- x_q4 += x_step_q4;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-static void highbd_convolve_add_src_vert_hip(
- const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst8,
- ptrdiff_t dst_stride, const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h, int bd) {
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (int x = 0; x < w; ++x) {
- int y_q4 = y0_q4;
- for (int y = 0; y < h; ++y) {
- const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- const int rounding =
- ((int)src_y[(SUBPEL_TAPS / 2 - 1) * src_stride] << FILTER_BITS) -
- (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1));
- const int sum =
- highbd_vert_scalar_product(src_y, src_stride, y_filter) + rounding;
- dst[y * dst_stride] = clip_pixel_highbd(
- ROUND_POWER_OF_TWO(sum, FILTER_BITS + EXTRAPREC_BITS), bd);
- y_q4 += y_step_q4;
- }
- ++src;
- ++dst;
- }
-}
-
-static void highbd_convolve_add_src_hip(
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4, const InterpKernel *const y_filters, int y0_q4,
- int y_step_q4, int w, int h, int bd) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- highbd_convolve_add_src_horiz_hip(
- src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, MAX_SB_SIZE,
- x_filters, x0_q4, x_step_q4, w, intermediate_height, bd);
- highbd_convolve_add_src_vert_hip(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
- MAX_SB_SIZE, dst, dst_stride, y_filters,
- y0_q4, y_step_q4, w, h, bd);
-}
-
-void aom_highbd_convolve8_add_src_hip_c(const uint8_t *src,
- ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- highbd_convolve_add_src_hip(src, src_stride, dst, dst_stride, filters_x,
- x0_q4, x_step_q4, filters_y, y0_q4, y_step_q4, w,
- h, bd);
-}
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 222f4b9..89fa1bf 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -358,22 +358,6 @@
endif ()
endif ()
-set(AOM_DSP_COMMON_INTRIN_SSE2
- ${AOM_DSP_COMMON_INTRIN_SSE2}
- "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_sse2.c")
-
-set(AOM_DSP_COMMON_INTRIN_AVX2
- ${AOM_DSP_COMMON_INTRIN_AVX2}
- "${AOM_ROOT}/aom_dsp/x86/aom_convolve_hip_avx2.c")
-
-set(AOM_DSP_COMMON_INTRIN_SSSE3
- ${AOM_DSP_COMMON_INTRIN_SSSE3}
- "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c")
-
-set(AOM_DSP_COMMON_INTRIN_AVX2
- ${AOM_DSP_COMMON_INTRIN_AVX2}
- "${AOM_ROOT}/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c")
-
# Creates aom_dsp build targets. Must not be called until after libaom target
# has been created.
function (setup_aom_dsp_targets)
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 6f274c6..a87b8cb 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -279,19 +279,6 @@
specialize qw/aom_convolve8_horiz sse2 ssse3/, "$avx2_ssse3";
specialize qw/aom_convolve8_vert sse2 ssse3/, "$avx2_ssse3";
-add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_horiz_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_add_src_vert_hip/, "const uint16_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-specialize qw/aom_convolve8_add_src ssse3/;
-specialize qw/aom_convolve8_add_src_horiz ssse3/;
-specialize qw/aom_convolve8_add_src_vert ssse3/;
-specialize qw/aom_convolve8_add_src_hip sse2/;
-specialize qw/aom_convolve8_add_src_hip avx2/;
-
# TODO(any): These need to be extended to up to 128x128 block sizes
if (!(aom_config("CONFIG_AV1") eq "yes")) {
specialize qw/aom_convolve_copy neon dspr2 msa/;
@@ -316,12 +303,6 @@
add_proto qw/void aom_highbd_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
specialize qw/aom_highbd_convolve8_vert avx2/, "$sse2_x86_64";
-add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-add_proto qw/void aom_highbd_convolve8_add_src_hip/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
-
-specialize qw/aom_highbd_convolve8_add_src/, "$sse2_x86_64";
-specialize qw/aom_highbd_convolve8_add_src_hip ssse3 avx2/;
-
#
# Loopfilter
#
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 2c51cb9..ab25d9c 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -91,21 +91,5 @@
// int w, int h, int bd);
HIGH_FUN_CONV_2D(, sse2);
-// The SSE2 highbd convolve functions can deal with coefficients up to 32767.
-// So redirect highbd_convolve8_add_src to regular highbd_convolve8.
-void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h, int bd) {
- assert(x_step_q4 == 16);
- assert(y_step_q4 == 16);
- ((int16_t *)filter_x)[3] += 128;
- ((int16_t *)filter_y)[3] += 128;
- aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h, bd);
- ((int16_t *)filter_x)[3] -= 128;
- ((int16_t *)filter_y)[3] -= 128;
-}
#endif // ARCH_X86_64
#endif // HAVE_SSE2
diff --git a/aom_dsp/x86/aom_convolve_hip_avx2.c b/aom_dsp/x86/aom_convolve_hip_avx2.c
deleted file mode 100644
index b122341..0000000
--- a/aom_dsp/x86/aom_convolve_hip_avx2.c
+++ /dev/null
@@ -1,262 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
-// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
-// on the left.
-// A row of, say, 8-bit pixels with values p0, p1, p2, ..., p30, p31 will be
-// loaded and stored as [ p31 ... p17 p16 ][ p15 ... p1 p0 ].
-void aom_convolve8_add_src_hip_avx2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const int bd = 8;
- assert(x_step_q4 == 16 && y_step_q4 == 16);
- assert(!(w & 7));
- (void)x_step_q4;
- (void)y_step_q4;
-
- DECLARE_ALIGNED(32, uint16_t,
- temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
- const int center_tap = ((SUBPEL_TAPS - 1) / 2);
- const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
- const __m128i zero_128 = _mm_setzero_si128();
- const __m256i zero_256 = _mm256_setzero_si256();
-
- // Add an offset to account for the "add_src" part of the convolve function.
- const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
-
- const __m256i clamp_low = zero_256;
- const __m256i clamp_high = _mm256_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1);
-
- /* Horizontal filter */
- {
- // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
- const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
-
- // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
- const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
- const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
- (1 << (bd + FILTER_BITS - 1)));
-
- for (int i = 0; i < intermediate_height; ++i) {
- for (int j = 0; j < w; j += 16) {
- const uint8_t *data_ij = src_ptr + i * src_stride + j;
-
- // Load 8-bit src data
- const __m128i data_0 = xx_loadu_128(data_ij + 0);
- const __m128i data_1 = xx_loadu_128(data_ij + 1);
- const __m128i data_2 = xx_loadu_128(data_ij + 2);
- const __m128i data_3 = xx_loadu_128(data_ij + 3);
- const __m128i data_4 = xx_loadu_128(data_ij + 4);
- const __m128i data_5 = xx_loadu_128(data_ij + 5);
- const __m128i data_6 = xx_loadu_128(data_ij + 6);
- const __m128i data_7 = xx_loadu_128(data_ij + 7);
-
- // (Zero-)Extend 8-bit data to 16-bit data
- const __m256i src_0 = _mm256_cvtepu8_epi16(data_0);
- const __m256i src_1 = _mm256_cvtepu8_epi16(data_1);
- const __m256i src_2 = _mm256_cvtepu8_epi16(data_2);
- const __m256i src_3 = _mm256_cvtepu8_epi16(data_3);
- const __m256i src_4 = _mm256_cvtepu8_epi16(data_4);
- const __m256i src_5 = _mm256_cvtepu8_epi16(data_5);
- const __m256i src_6 = _mm256_cvtepu8_epi16(data_6);
- const __m256i src_7 = _mm256_cvtepu8_epi16(data_7);
-
- // Multiply src data by filter coeffs and sum pairs
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
- // Calculate scalar product for even- and odd-indices separately,
- // increasing to 32-bit precision
- const __m256i res_even_sum = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
- const __m256i res_odd_sum = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
-
- const __m256i res_even =
- _mm256_srai_epi32(_mm256_add_epi32(res_even_sum, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
- const __m256i res_odd =
- _mm256_srai_epi32(_mm256_add_epi32(res_odd_sum, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- // Reduce to 16-bit precision and pack even- and odd-index results
- // back into one register. The _mm256_packs_epi32 intrinsic returns
- // a register with the pixels ordered as follows:
- // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
- const __m256i res = _mm256_packs_epi32(res_even, res_odd);
- const __m256i res_clamped =
- _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high);
-
- // Store in a temporary array
- yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
- }
- }
- }
-
- /* Vertical filter */
- {
- // coeffs [ g7 g6 g5 g4 g3 g2 g1 g0 ]
- const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
-
- // coeffs [ g3 g2 g3 g2 g1 g0 g1 g0 ]
- const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs [ g7 g6 g7 g6 g5 g4 g5 g4 ]
- const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ]
- const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ]
- const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ]
- const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
- // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ]
- const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
- // coeffs [ g1 g0 g1 g0 g1 g0 g1 g0 ][ g1 g0 g1 g0 g1 g0 g1 g0 ]
- const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
- // coeffs [ g3 g2 g3 g2 g3 g2 g3 g2 ][ g3 g2 g3 g2 g3 g2 g3 g2 ]
- const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
- // coeffs [ g5 g4 g5 g4 g5 g4 g5 g4 ][ g5 g4 g5 g4 g5 g4 g5 g4 ]
- const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
- // coeffs [ g7 g6 g7 g6 g7 g6 g7 g6 ][ g7 g6 g7 g6 g7 g6 g7 g6 ]
- const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
- (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const uint16_t *data_ij = temp + i * MAX_SB_SIZE + j;
-
- // Load 16-bit data from the output of the horizontal filter in
- // which the pixels are ordered as follows:
- // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
- const __m256i data_0 = yy_loadu_256(data_ij + 0 * MAX_SB_SIZE);
- const __m256i data_1 = yy_loadu_256(data_ij + 1 * MAX_SB_SIZE);
- const __m256i data_2 = yy_loadu_256(data_ij + 2 * MAX_SB_SIZE);
- const __m256i data_3 = yy_loadu_256(data_ij + 3 * MAX_SB_SIZE);
- const __m256i data_4 = yy_loadu_256(data_ij + 4 * MAX_SB_SIZE);
- const __m256i data_5 = yy_loadu_256(data_ij + 5 * MAX_SB_SIZE);
- const __m256i data_6 = yy_loadu_256(data_ij + 6 * MAX_SB_SIZE);
- const __m256i data_7 = yy_loadu_256(data_ij + 7 * MAX_SB_SIZE);
-
- // Filter the even-indices, increasing to 32-bit precision
- const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
- const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
- const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
- const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
-
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-
- const __m256i res_even = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
- // Filter the odd-indices, increasing to 32-bit precision
- const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
- const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
- const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
- const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
-
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
- const __m256i res_odd = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
- // Pixels are currently in the following order:
- // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
- // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
- //
- // Rearrange the pixels into the following order:
- // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
- // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
- const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
- const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_lo_round =
- _mm256_srai_epi32(_mm256_add_epi32(res_lo, round_const),
- FILTER_BITS + EXTRAPREC_BITS);
- const __m256i res_hi_round =
- _mm256_srai_epi32(_mm256_add_epi32(res_hi, round_const),
- FILTER_BITS + EXTRAPREC_BITS);
-
- // Reduce to 16-bit precision and pack into the correct order:
- // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
- const __m256i res_16bit =
- _mm256_packs_epi32(res_lo_round, res_hi_round);
-
- // Reduce to 8-bit precision. This messes up the order:
- // [ - - - - - - - - 15 14 13 12 11 10 9 8 ]
- // [ - - - - - - - - 7 6 5 4 3 2 1 0 ]
- const __m256i res_8bit =
- _mm256_packus_epi16(res_16bit, zero_256 /* don't care value */);
-
- // Swap the two central 32-bit values to get the order:
- // [ - - - - - - - - - - - - - - - - ]
- // [ 15 14 13 12 11 10 9 8 7 6 5 4 3 2 1 0 ]
- const __m256i res_8bit2 = _mm256_permute4x64_epi64(res_8bit, 0xd8);
-
- // Store the lower 128-bit lane in the dst array
- xx_storeu_128(dst + i * dst_stride + j,
- _mm256_castsi256_si128(res_8bit2));
- }
- }
- }
-}
diff --git a/aom_dsp/x86/aom_convolve_hip_sse2.c b/aom_dsp/x86/aom_convolve_hip_sse2.c
deleted file mode 100644
index f666a0e..0000000
--- a/aom_dsp/x86/aom_convolve_hip_sse2.c
+++ /dev/null
@@ -1,196 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <emmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const int bd = 8;
- assert(x_step_q4 == 16 && y_step_q4 == 16);
- assert(!(w & 7));
- (void)x_step_q4;
- (void)y_step_q4;
-
- DECLARE_ALIGNED(16, uint16_t,
- temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
- int i, j;
- const int center_tap = ((SUBPEL_TAPS - 1) / 2);
- const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
- const __m128i zero = _mm_setzero_si128();
- // Add an offset to account for the "add_src" part of the convolve function.
- const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
- /* Horizontal filter */
- {
- const __m128i coeffs_x =
- _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
- (1 << (bd + FILTER_BITS - 1)));
-
- for (i = 0; i < intermediate_height; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
-
- // Filter even-index pixels
- const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- // Filter odd-index pixels
- const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_min_epi16(_mm_max_epi16(res, zero),
- _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));
- _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const __m128i coeffs_y =
- _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
- (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
- *(__m128i *)(data + 1 * MAX_SB_SIZE));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
- *(__m128i *)(data + 3 * MAX_SB_SIZE));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
- *(__m128i *)(data + 5 * MAX_SB_SIZE));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
- *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
- *(__m128i *)(data + 1 * MAX_SB_SIZE));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
- *(__m128i *)(data + 3 * MAX_SB_SIZE));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
- *(__m128i *)(data + 5 * MAX_SB_SIZE));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
- *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
- const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
-
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- _mm_storel_epi64(p, res_8bit);
- }
- }
- }
-}
diff --git a/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c b/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c
deleted file mode 100644
index 7a78d7b..0000000
--- a/aom_dsp/x86/aom_highbd_convolve_hip_avx2.c
+++ /dev/null
@@ -1,252 +0,0 @@
-/*
- * Copyright (c) 2018, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <immintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-#include "aom_dsp/x86/synonyms.h"
-#include "aom_dsp/x86/synonyms_avx2.h"
-
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-// 128-bit xmmwords are written as [ ... ] with the MSB on the left.
-// 256-bit ymmwords are written as two xmmwords, [ ... ][ ... ] with the MSB
-// on the left.
-// A row of, say, 16-bit pixels with values p0, p1, p2, ..., p14, p15 will be
-// loaded and stored as [ p15 ... p9 p8 ][ p7 ... p1 p0 ].
-void aom_highbd_convolve8_add_src_hip_avx2(
- const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
- assert(x_step_q4 == 16 && y_step_q4 == 16);
- assert(!(w & 7));
- (void)x_step_q4;
- (void)y_step_q4;
-
- const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
-
- DECLARE_ALIGNED(32, uint16_t,
- temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
- const int center_tap = ((SUBPEL_TAPS - 1) / 2);
- const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
- const __m128i zero_128 = _mm_setzero_si128();
- const __m256i zero_256 = _mm256_setzero_si256();
-
- // Add an offset to account for the "add_src" part of the convolve function.
- const __m128i offset = _mm_insert_epi16(zero_128, 1 << FILTER_BITS, 3);
-
- const __m256i clamp_low = zero_256;
-
- /* Horizontal filter */
- {
- const __m256i clamp_high_ep =
- _mm256_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1);
-
- // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
- const __m128i coeffs_x = _mm_add_epi16(xx_loadu_128(filter_x), offset);
-
- // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
- const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
- const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
- (1 << (bd + FILTER_BITS - 1)));
-
- for (int i = 0; i < intermediate_height; ++i) {
- for (int j = 0; j < w; j += 16) {
- const uint16_t *src_ij = src_ptr + i * src_stride + j;
-
- // Load 16-bit src data
- const __m256i src_0 = yy_loadu_256(src_ij + 0);
- const __m256i src_1 = yy_loadu_256(src_ij + 1);
- const __m256i src_2 = yy_loadu_256(src_ij + 2);
- const __m256i src_3 = yy_loadu_256(src_ij + 3);
- const __m256i src_4 = yy_loadu_256(src_ij + 4);
- const __m256i src_5 = yy_loadu_256(src_ij + 5);
- const __m256i src_6 = yy_loadu_256(src_ij + 6);
- const __m256i src_7 = yy_loadu_256(src_ij + 7);
-
- // Multiply src data by filter coeffs and sum pairs
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
- // Calculate scalar product for even- and odd-indices separately,
- // increasing to 32-bit precision
- const __m256i res_even_sum = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_4), _mm256_add_epi32(res_2, res_6));
- const __m256i res_even =
- _mm256_srai_epi32(_mm256_add_epi32(res_even_sum, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- const __m256i res_odd_sum = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_5), _mm256_add_epi32(res_3, res_7));
- const __m256i res_odd =
- _mm256_srai_epi32(_mm256_add_epi32(res_odd_sum, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- // Reduce to 16-bit precision and pack even- and odd-index results
- // back into one register. The _mm256_packs_epi32 intrinsic returns
- // a register with the pixels ordered as follows:
- // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
- const __m256i res = _mm256_packs_epi32(res_even, res_odd);
- const __m256i res_clamped =
- _mm256_min_epi16(_mm256_max_epi16(res, clamp_low), clamp_high_ep);
-
- // Store in a temporary array
- yy_storeu_256(temp + i * MAX_SB_SIZE + j, res_clamped);
- }
- }
- }
-
- /* Vertical filter */
- {
- const __m256i clamp_high = _mm256_set1_epi16((1 << bd) - 1);
-
- // coeffs [ f7 f6 f5 f4 f3 f2 f1 f0 ]
- const __m128i coeffs_y = _mm_add_epi16(xx_loadu_128(filter_y), offset);
-
- // coeffs [ f3 f2 f3 f2 f1 f0 f1 f0 ]
- const __m128i coeffs_0123 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs [ f7 f6 f7 f6 f5 f4 f5 f4 ]
- const __m128i coeffs_4567 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m128i coeffs_01_128 = _mm_unpacklo_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m128i coeffs_23_128 = _mm_unpackhi_epi64(coeffs_0123, coeffs_0123);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m128i coeffs_45_128 = _mm_unpacklo_epi64(coeffs_4567, coeffs_4567);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m128i coeffs_67_128 = _mm_unpackhi_epi64(coeffs_4567, coeffs_4567);
-
- // coeffs [ f1 f0 f1 f0 f1 f0 f1 f0 ][ f1 f0 f1 f0 f1 f0 f1 f0 ]
- const __m256i coeffs_01 = yy_set_m128i(coeffs_01_128, coeffs_01_128);
- // coeffs [ f3 f2 f3 f2 f3 f2 f3 f2 ][ f3 f2 f3 f2 f3 f2 f3 f2 ]
- const __m256i coeffs_23 = yy_set_m128i(coeffs_23_128, coeffs_23_128);
- // coeffs [ f5 f4 f5 f4 f5 f4 f5 f4 ][ f5 f4 f5 f4 f5 f4 f5 f4 ]
- const __m256i coeffs_45 = yy_set_m128i(coeffs_45_128, coeffs_45_128);
- // coeffs [ f7 f6 f7 f6 f7 f6 f7 f6 ][ f7 f6 f7 f6 f7 f6 f7 f6 ]
- const __m256i coeffs_67 = yy_set_m128i(coeffs_67_128, coeffs_67_128);
-
- const __m256i round_const =
- _mm256_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
- (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
- for (int i = 0; i < h; ++i) {
- for (int j = 0; j < w; j += 16) {
- const uint16_t *temp_ij = temp + i * MAX_SB_SIZE + j;
-
- // Load 16-bit data from the output of the horizontal filter in
- // which the pixels are ordered as follows:
- // [ 15 13 11 9 14 12 10 8 ] [ 7 5 3 1 6 4 2 0 ]
- const __m256i data_0 = yy_loadu_256(temp_ij + 0 * MAX_SB_SIZE);
- const __m256i data_1 = yy_loadu_256(temp_ij + 1 * MAX_SB_SIZE);
- const __m256i data_2 = yy_loadu_256(temp_ij + 2 * MAX_SB_SIZE);
- const __m256i data_3 = yy_loadu_256(temp_ij + 3 * MAX_SB_SIZE);
- const __m256i data_4 = yy_loadu_256(temp_ij + 4 * MAX_SB_SIZE);
- const __m256i data_5 = yy_loadu_256(temp_ij + 5 * MAX_SB_SIZE);
- const __m256i data_6 = yy_loadu_256(temp_ij + 6 * MAX_SB_SIZE);
- const __m256i data_7 = yy_loadu_256(temp_ij + 7 * MAX_SB_SIZE);
-
- // Filter the even-indices, increasing to 32-bit precision
- const __m256i src_0 = _mm256_unpacklo_epi16(data_0, data_1);
- const __m256i src_2 = _mm256_unpacklo_epi16(data_2, data_3);
- const __m256i src_4 = _mm256_unpacklo_epi16(data_4, data_5);
- const __m256i src_6 = _mm256_unpacklo_epi16(data_6, data_7);
-
- const __m256i res_0 = _mm256_madd_epi16(src_0, coeffs_01);
- const __m256i res_2 = _mm256_madd_epi16(src_2, coeffs_23);
- const __m256i res_4 = _mm256_madd_epi16(src_4, coeffs_45);
- const __m256i res_6 = _mm256_madd_epi16(src_6, coeffs_67);
-
- const __m256i res_even = _mm256_add_epi32(
- _mm256_add_epi32(res_0, res_2), _mm256_add_epi32(res_4, res_6));
-
- // Filter the odd-indices, increasing to 32-bit precision
- const __m256i src_1 = _mm256_unpackhi_epi16(data_0, data_1);
- const __m256i src_3 = _mm256_unpackhi_epi16(data_2, data_3);
- const __m256i src_5 = _mm256_unpackhi_epi16(data_4, data_5);
- const __m256i src_7 = _mm256_unpackhi_epi16(data_6, data_7);
-
- const __m256i res_1 = _mm256_madd_epi16(src_1, coeffs_01);
- const __m256i res_3 = _mm256_madd_epi16(src_3, coeffs_23);
- const __m256i res_5 = _mm256_madd_epi16(src_5, coeffs_45);
- const __m256i res_7 = _mm256_madd_epi16(src_7, coeffs_67);
-
- const __m256i res_odd = _mm256_add_epi32(
- _mm256_add_epi32(res_1, res_3), _mm256_add_epi32(res_5, res_7));
-
- // Pixels are currently in the following order:
- // res_even order: [ 14 12 10 8 ] [ 6 4 2 0 ]
- // res_odd order: [ 15 13 11 9 ] [ 7 5 3 1 ]
- //
- // Rearrange the pixels into the following order:
- // res_lo order: [ 11 10 9 8 ] [ 3 2 1 0 ]
- // res_hi order: [ 15 14 13 12 ] [ 7 6 5 4 ]
- const __m256i res_lo = _mm256_unpacklo_epi32(res_even, res_odd);
- const __m256i res_hi = _mm256_unpackhi_epi32(res_even, res_odd);
-
- const __m256i res_lo_round =
- _mm256_srai_epi32(_mm256_add_epi32(res_lo, round_const),
- FILTER_BITS + EXTRAPREC_BITS);
- const __m256i res_hi_round =
- _mm256_srai_epi32(_mm256_add_epi32(res_hi, round_const),
- FILTER_BITS + EXTRAPREC_BITS);
-
- // Reduce to 16-bit precision and pack into the correct order:
- // [ 15 14 13 12 11 10 9 8 ][ 7 6 5 4 3 2 1 0 ]
- const __m256i res_16bit =
- _mm256_packs_epi32(res_lo_round, res_hi_round);
- const __m256i res_16bit_clamped = _mm256_min_epi16(
- _mm256_max_epi16(res_16bit, clamp_low), clamp_high);
-
- // Store in the dst array
- yy_storeu_256(dst + i * dst_stride + j, res_16bit_clamped);
- }
- }
- }
-}
diff --git a/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c b/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
deleted file mode 100644
index 5bf6743..0000000
--- a/aom_dsp/x86/aom_highbd_convolve_hip_ssse3.c
+++ /dev/null
@@ -1,204 +0,0 @@
-/*
- * Copyright (c) 2016, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <tmmintrin.h>
-#include <assert.h>
-
-#include "./aom_dsp_rtcd.h"
-#include "aom_dsp/aom_convolve.h"
-#include "aom_dsp/aom_dsp_common.h"
-#include "aom_dsp/aom_filter.h"
-
-#if EXTRAPREC_BITS > 2
-#error "Highbd high-prec convolve filter only supports EXTRAPREC_BITS <= 2"
-#error "(need to use 32-bit intermediates for EXTRAPREC_BITS > 2)"
-#endif
-
-void aom_highbd_convolve8_add_src_hip_ssse3(
- const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
- assert(x_step_q4 == 16 && y_step_q4 == 16);
- assert(!(w & 7));
- (void)x_step_q4;
- (void)y_step_q4;
-
- const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);
-
- DECLARE_ALIGNED(16, uint16_t,
- temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
- int intermediate_height = h + SUBPEL_TAPS - 1;
- int i, j;
- const int center_tap = ((SUBPEL_TAPS - 1) / 2);
- const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;
-
- const __m128i zero = _mm_setzero_si128();
- // Add an offset to account for the "add_src" part of the convolve function.
- const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);
-
- /* Horizontal filter */
- {
- const __m128i coeffs_x =
- _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
- (1 << (bd + FILTER_BITS - 1)));
-
- for (i = 0; i < intermediate_height; ++i) {
- for (j = 0; j < w; j += 8) {
- const __m128i data =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
- const __m128i data2 =
- _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);
-
- // Filter even-index pixels
- const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
- const __m128i res_2 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
- const __m128i res_4 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
- const __m128i res_6 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);
-
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
- res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- // Filter odd-index pixels
- const __m128i res_1 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
- const __m128i res_3 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
- const __m128i res_5 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
- const __m128i res_7 =
- _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);
-
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
- res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
- FILTER_BITS - EXTRAPREC_BITS);
-
- // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
- const __m128i maxval = _mm_set1_epi16((EXTRAPREC_CLAMP_LIMIT(bd)) - 1);
- __m128i res = _mm_packs_epi32(res_even, res_odd);
- res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
- _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
- }
- }
- }
-
- /* Vertical filter */
- {
- const __m128i coeffs_y =
- _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
-
- const __m128i round_const =
- _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
- (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));
-
- for (i = 0; i < h; ++i) {
- for (j = 0; j < w; j += 8) {
- // Filter even-index pixels
- const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
- const __m128i src_0 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
- *(__m128i *)(data + 1 * MAX_SB_SIZE));
- const __m128i src_2 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
- *(__m128i *)(data + 3 * MAX_SB_SIZE));
- const __m128i src_4 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
- *(__m128i *)(data + 5 * MAX_SB_SIZE));
- const __m128i src_6 =
- _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
- *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
-
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
- _mm_add_epi32(res_4, res_6));
-
- // Filter odd-index pixels
- const __m128i src_1 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
- *(__m128i *)(data + 1 * MAX_SB_SIZE));
- const __m128i src_3 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
- *(__m128i *)(data + 3 * MAX_SB_SIZE));
- const __m128i src_5 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
- *(__m128i *)(data + 5 * MAX_SB_SIZE));
- const __m128i src_7 =
- _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
- *(__m128i *)(data + 7 * MAX_SB_SIZE));
-
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
-
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
-
- // Rearrange pixels back into the order 0 ... 7
- const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
- const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
-
- const __m128i res_lo_round = _mm_srai_epi32(
- _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
- const __m128i res_hi_round = _mm_srai_epi32(
- _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);
-
- const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
- __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
- res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);
-
- __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- _mm_storeu_si128(p, res_16bit);
- }
- }
- }
-}
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 7dd1f1d..4ea9f05 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -285,12 +285,6 @@
filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;
-filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
-filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -312,15 +306,9 @@
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
- ssse3);
-FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
- src - src_stride * 3, add_src_, ssse3);
-
// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
// const int16_t *filter_y, int y_step_q4,
// int w, int h);
FUN_CONV_2D(, ssse3);
-FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
diff --git a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 6dc2310..3ca7921 100644
--- a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -378,10 +378,6 @@
SUBPIX_HFILTER8 h8
SUBPIX_HFILTER4 h8
-SUBPIX_HFILTER16 h8_add_src
-SUBPIX_HFILTER8 h8_add_src
-SUBPIX_HFILTER4 h8_add_src
-
;-------------------------------------------------------------------------------
; TODO(Linfeng): Detect cpu type and choose the code with better performance.
@@ -872,9 +868,3 @@
SUBPIX_VFILTER16 v8
SUBPIX_VFILTER v8, 8
SUBPIX_VFILTER v8, 4
-
-%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON)
-SUBPIX_VFILTER16 v8_add_src
-SUBPIX_VFILTER v8_add_src, 8
-SUBPIX_VFILTER v8_add_src, 4
-%endif
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index f6812e8..9200e40 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -115,68 +115,6 @@
} \
}
-// convolve_add_src is only used by the Wiener filter, which will never
-// end up calling the bilinear functions (it uses a symmetric filter, so
-// the possible numbers of taps are 1,3,5,7)
-#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
- opt) \
- void aom_convolve8_##name##_##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h) { \
- (void)filter_x; \
- (void)x_step_q4; \
- (void)filter_y; \
- (void)y_step_q4; \
- assert((-128 <= filter[3]) && (filter[3] <= 127)); \
- assert(step_q4 == 16); \
- while (w >= 16) { \
- aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst, \
- dst_stride, h, filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- if (w) { \
- aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h); \
- } \
- }
-
-#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt) \
- void aom_convolve8_##type##opt( \
- const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
- ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, int w, int h) { \
- DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
- assert((-128 <= filter_x[3]) && (filter_x[3] <= 127)); \
- assert((-128 <= filter_y[3]) && (filter_y[3] <= 127)); \
- assert(w <= MAX_SB_SIZE); \
- assert(h <= MAX_SB_SIZE); \
- assert(x_step_q4 == 16); \
- assert(y_step_q4 == 16); \
- aom_convolve8_##htype##horiz_##opt( \
- src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x, \
- x_step_q4, filter_y, y_step_q4, w, h + 7); \
- aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
- dst, dst_stride, filter_x, x_step_q4, \
- filter_y, y_step_q4, w, h); \
- }
-
typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
const ptrdiff_t src_pitch,
uint16_t *output_ptr,