Remove unused scaled convolve funcs from aom_dsp
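
The convolve_{horiz,vert,avg_horiz,avg_vert}_scale_c helpers and the
aom_convolve8_*_scale_c and aom_scaled_*_c entry points in
aom_dsp/aom_convolve.c are no longer referenced, so drop them along
with their rtcd prototypes, the aom_scaled_2d ssse3 implementation,
and the corresponding convolve_test coverage.

Note for any out-of-tree caller: the deleted aom_scaled_*_c wrappers
simply forwarded to the matching aom_convolve8_*_c functions, so the
remaining entry points can be substituted directly. A sketch of a
hypothetical call site (same arguments, q4 step sizes):

  /* Previously:
   *   aom_scaled_2d_c(src, src_stride, dst, dst_stride,
   *                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
   * Identical output, since aom_scaled_2d_c was a pass-through: */
  aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                  filter_y, y_step_q4, w, h);
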
BUG=aomedia:1575
Change-Id: Ic13751cb2dbaca84b90eab93e013c566f9c286a1
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index 98c6241..a0ec277 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -52,27 +52,6 @@
}
}
-static void convolve_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_qn,
- int x_step_qn, int w, int h) {
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_qn = x0_qn;
- for (int x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS]; // q8
- const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(x_filter_idx < SUBPEL_SHIFTS);
- const int16_t *const x_filter = x_filters[x_filter_idx];
- const int sum = horz_scalar_product(src_x, x_filter);
- dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- x_qn += x_step_qn;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *x_filters, int x0_q4,
@@ -93,28 +72,6 @@
}
}
-static void convolve_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_qn,
- int x_step_qn, int w, int h) {
- src -= SUBPEL_TAPS / 2 - 1;
- for (int y = 0; y < h; ++y) {
- int x_qn = x0_qn;
- for (int x = 0; x < w; ++x) {
- const uint8_t *const src_x = &src[x_qn >> SCALE_SUBPEL_BITS];
- const int x_filter_idx = (x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS;
- assert(x_filter_idx < SUBPEL_SHIFTS);
- const int16_t *const x_filter = x_filters[x_filter_idx];
- const int sum = horz_scalar_product(src_x, x_filter);
- dst[x] = ROUND_POWER_OF_TWO(
- dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
- x_qn += x_step_qn;
- }
- src += src_stride;
- dst += dst_stride;
- }
-}
-
static void convolve_vert(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *y_filters, int y0_q4,
@@ -135,28 +92,6 @@
}
}
-static void convolve_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_qn,
- int y_step_qn, int w, int h) {
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (int x = 0; x < w; ++x) {
- int y_qn = y0_qn;
- for (int y = 0; y < h; ++y) {
- const unsigned char *src_y =
- &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter =
- y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
- const int sum = vert_scalar_product(src_y, src_stride, y_filter);
- dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
- y_qn += y_step_qn;
- }
- ++src;
- ++dst;
- }
-}
-
static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *y_filters, int y0_q4,
@@ -180,31 +115,6 @@
}
}
-static void convolve_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_qn,
- int y_step_qn, int w, int h) {
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
-
- for (int x = 0; x < w; ++x) {
- int y_qn = y0_qn;
- for (int y = 0; y < h; ++y) {
- const unsigned char *src_y =
- &src[(y_qn >> SCALE_SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter =
- y_filters[(y_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS];
- const int sum = vert_scalar_product(src_y, src_stride, y_filter);
- dst[y * dst_stride] = ROUND_POWER_OF_TWO(
- dst[y * dst_stride] +
- clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
- 1);
- y_qn += y_step_qn;
- }
- ++src;
- ++dst;
- }
-}
-
static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *const x_filters,
int x0_q4, int x_step_q4,
@@ -239,41 +149,6 @@
dst_stride, y_filters, y0_q4, y_step_q4, w, h);
}
-static void convolve_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_qn,
- int x_step_qn, const InterpKernel *const y_filters,
- int y0_qn, int y_step_qn, int w, int h) {
- // TODO(afergs): Update comment here
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_qn = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
- const int intermediate_height =
- (((h - 1) * y_step_qn + y0_qn) >> SCALE_SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
-
- assert(y_step_qn <= SCALE_SUBPEL_BITS * 2);
- assert(x_step_qn <= SCALE_SUBPEL_BITS * 2);
-
- convolve_horiz_scale_c(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
- temp, MAX_SB_SIZE, x_filters, x0_qn, x_step_qn, w,
- intermediate_height);
- convolve_vert_scale_c(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
- dst, dst_stride, y_filters, y0_qn, y_step_qn, w, h);
-}
-
static const InterpKernel *get_filter_base(const int16_t *filter) {
// NOTE: This assumes that the filter table is 256-byte aligned.
// TODO(agrange) Modify to make independent of table alignment.
@@ -299,21 +174,6 @@
w, h);
}
-void aom_convolve8_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int subpel_x,
- int x_step_qn, const int16_t *filter_y,
- int subpel_y, int y_step_qn, int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
-
- (void)subpel_y;
- (void)filter_y;
- (void)y_step_qn;
-
- convolve_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
- x_step_qn, w, h);
-}
-
void aom_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -329,22 +189,6 @@
x_step_q4, w, h);
}
-void aom_convolve8_avg_horiz_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int subpel_x,
- int x_step_qn, const int16_t *filter_y,
- int subpel_y, int y_step_qn, int w,
- int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
-
- (void)subpel_y;
- (void)filter_y;
- (void)y_step_qn;
-
- convolve_avg_horiz_scale_c(src, src_stride, dst, dst_stride, filters_x,
- subpel_x, x_step_qn, w, h);
-}
-
void aom_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -360,21 +204,6 @@
w, h);
}
-void aom_convolve8_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int subpel_x,
- int x_step_qn, const int16_t *filter_y,
- int subpel_y, int y_step_qn, int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
-
- (void)subpel_x;
- (void)filter_x;
- (void)x_step_qn;
-
- convolve_vert_scale_c(src, src_stride, dst, dst_stride, filters_y, subpel_y,
- y_step_qn, w, h);
-}
-
void aom_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
@@ -390,21 +219,6 @@
y_step_q4, w, h);
}
-void aom_convolve8_avg_vert_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int subpel_x,
- int x_step_qn, const int16_t *filter_y,
- int subpel_y, int y_step_qn, int w, int h) {
- const InterpKernel *const filters_y = get_filter_base(filter_y);
-
- (void)subpel_x;
- (void)filter_x;
- (void)x_step_qn;
-
- convolve_avg_vert_scale_c(src, src_stride, dst, dst_stride, filters_y,
- subpel_y, y_step_qn, w, h);
-}
-
void aom_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
@@ -419,19 +233,6 @@
filters_y, y0_q4, y_step_q4, w, h);
}
-void aom_convolve8_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int subpel_x, int x_step_qn,
- const int16_t *filter_y, int subpel_y, int y_step_qn,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
-
- convolve_scale_c(src, src_stride, dst, dst_stride, filters_x, subpel_x,
- x_step_qn, filters_y, subpel_y, y_step_qn, w, h);
-}
-
void aom_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int x_step_q4, const int16_t *filter_y, int y_step_q4,
@@ -447,22 +248,6 @@
h);
}
-void aom_convolve8_avg_scale_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int subpel_x,
- int x_step_qn, const int16_t *filter_y,
- int subpel_y, int y_step_qn, int w, int h) {
- /* Fixed size intermediate buffer places limits on parameters. */
- DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
-
- aom_convolve8_scale_c(src, src_stride, temp, MAX_SB_SIZE, filter_x, subpel_x,
- x_step_qn, filter_y, subpel_y, y_step_qn, w, h);
- aom_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride, NULL, 0, NULL, 0, w,
- h);
-}
-
void aom_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const int16_t *filter_x,
int filter_x_stride, const int16_t *filter_y,
@@ -498,56 +283,6 @@
}
}
-void aom_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- aom_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- aom_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- aom_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- aom_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4, int w,
- int h) {
- aom_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x,
- x_step_q4, filter_y, y_step_q4, w, h);
-}
-
-void aom_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- aom_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
- filter_y, y_step_q4, w, h);
-}
-
static INLINE int highbd_vert_scalar_product(const uint16_t *a,
ptrdiff_t a_stride,
const int16_t *b) {
@@ -556,7 +291,6 @@
return sum;
}
-// TODO(afergs): Make sure this works too
static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *x_filters, int x0_q4,
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 580cfb1..5e8bb4e 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -275,19 +275,6 @@
add_proto qw/void aom_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-
-add_proto qw/void aom_convolve8_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_vert_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_horiz_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_vert_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
-add_proto qw/void aom_convolve8_avg_scale/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int subpel_x, int x_step_q4, const int16_t *filter_y, int subpel_y, int y_step_q4, int w, int h";
specialize qw/aom_convolve_copy sse2 /;
specialize qw/aom_convolve_avg sse2 /;
@@ -297,7 +284,6 @@
specialize qw/aom_convolve8_avg sse2 ssse3/;
specialize qw/aom_convolve8_avg_horiz sse2 ssse3/;
specialize qw/aom_convolve8_avg_vert sse2 ssse3/;
-specialize qw/aom_scaled_2d ssse3/;
add_proto qw/void aom_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 96ae540..a9a15fd 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -342,563 +342,6 @@
FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
src - src_stride * 3, add_src_, ssse3);
-#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
- out2, out3, out4, out5, out6, out7) \
- { \
- const __m128i tr0_0 = _mm_unpacklo_epi8(in0, in1); \
- const __m128i tr0_1 = _mm_unpacklo_epi8(in2, in3); \
- const __m128i tr0_2 = _mm_unpacklo_epi8(in4, in5); \
- const __m128i tr0_3 = _mm_unpacklo_epi8(in6, in7); \
- \
- const __m128i tr1_0 = _mm_unpacklo_epi16(tr0_0, tr0_1); \
- const __m128i tr1_1 = _mm_unpackhi_epi16(tr0_0, tr0_1); \
- const __m128i tr1_2 = _mm_unpacklo_epi16(tr0_2, tr0_3); \
- const __m128i tr1_3 = _mm_unpackhi_epi16(tr0_2, tr0_3); \
- \
- const __m128i tr2_0 = _mm_unpacklo_epi32(tr1_0, tr1_2); \
- const __m128i tr2_1 = _mm_unpackhi_epi32(tr1_0, tr1_2); \
- const __m128i tr2_2 = _mm_unpacklo_epi32(tr1_1, tr1_3); \
- const __m128i tr2_3 = _mm_unpackhi_epi32(tr1_1, tr1_3); \
- \
- out0 = _mm_unpacklo_epi64(tr2_0, tr2_0); \
- out1 = _mm_unpackhi_epi64(tr2_0, tr2_0); \
- out2 = _mm_unpacklo_epi64(tr2_1, tr2_1); \
- out3 = _mm_unpackhi_epi64(tr2_1, tr2_1); \
- out4 = _mm_unpacklo_epi64(tr2_2, tr2_2); \
- out5 = _mm_unpackhi_epi64(tr2_2, tr2_2); \
- out6 = _mm_unpacklo_epi64(tr2_3, tr2_3); \
- out7 = _mm_unpackhi_epi64(tr2_3, tr2_3); \
- }
-
-static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *x_filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
- const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
- const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
- const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
- const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
- // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
- const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
- // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
- const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
- const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
- const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
- const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
- const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
- const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void transpose8x8_to_dst(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride) {
- __m128i A, B, C, D, E, F, G, H;
-
- A = _mm_loadl_epi64((const __m128i *)src);
- B = _mm_loadl_epi64((const __m128i *)(src + src_stride));
- C = _mm_loadl_epi64((const __m128i *)(src + src_stride * 2));
- D = _mm_loadl_epi64((const __m128i *)(src + src_stride * 3));
- E = _mm_loadl_epi64((const __m128i *)(src + src_stride * 4));
- F = _mm_loadl_epi64((const __m128i *)(src + src_stride * 5));
- G = _mm_loadl_epi64((const __m128i *)(src + src_stride * 6));
- H = _mm_loadl_epi64((const __m128i *)(src + src_stride * 7));
-
- TRANSPOSE_8X8(A, B, C, D, E, F, G, H, A, B, C, D, E, F, G, H);
-
- _mm_storel_epi64((__m128i *)dst, A);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 1), B);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 2), C);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 3), D);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 4), E);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 5), F);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 6), G);
- _mm_storel_epi64((__m128i *)(dst + dst_stride * 7), H);
-}
-
-static void scaledconvolve_horiz_w8(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
-
- // This function processes 8x8 areas. The intermediate height is not always
- // a multiple of 8, so force it to be a multiple of 8 here.
- y = h + (8 - (h & 0x7));
-
- do {
- int x_q4 = x0_q4;
- for (x = 0; x < w; x += 8) {
- // process 8 src_x steps
- for (z = 0; z < 8; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- if (x_q4 & SUBPEL_MASK) {
- filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
- } else {
- int i;
- for (i = 0; i < 8; ++i) {
- temp[z * 8 + i] = src_x[i * src_stride + 3];
- }
- }
- x_q4 += x_step_q4;
- }
-
- // transpose the 8x8 filters values back to dst
- transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
- }
-
- src += src_stride * 8;
- dst += dst_stride * 8;
- } while (y -= 8);
-}
-
-static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- // TRANSPOSE...
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- //
- // TO
- //
- // 00 10 20 30
- // 01 11 21 31
- // 02 12 22 32
- // 03 13 23 33
- // 04 14 24 34
- // 05 15 25 35
- // 06 16 26 36
- // 07 17 27 37
- //
- // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
- const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
- // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
- const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
- // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
- const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
- const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 02 03 12 13 22 23 32 33
- const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
- // 06 07 16 17 26 27 36 37
- const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 4 bytes
- *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride) {
- __m128i A = _mm_cvtsi32_si128(*(const int *)src);
- __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
- __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
- __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
- // 00 10 01 11 02 12 03 13
- const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
- // 20 30 21 31 22 32 23 33
- const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- A = _mm_unpacklo_epi16(tr0_0, tr0_1);
- B = _mm_srli_si128(A, 4);
- C = _mm_srli_si128(A, 8);
- D = _mm_srli_si128(A, 12);
-
- *(int *)(dst) = _mm_cvtsi128_si32(A);
- *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
- *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
- *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
-}
-
-static void scaledconvolve_horiz_w4(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *x_filters, int x0_q4,
- int x_step_q4, int w, int h) {
- DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
- int x, y, z;
- src -= SUBPEL_TAPS / 2 - 1;
-
- for (y = 0; y < h; y += 4) {
- int x_q4 = x0_q4;
- for (x = 0; x < w; x += 4) {
- // process 4 src_x steps
- for (z = 0; z < 4; ++z) {
- const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
- const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
- if (x_q4 & SUBPEL_MASK) {
- filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
- } else {
- int i;
- for (i = 0; i < 4; ++i) {
- temp[z * 4 + i] = src_x[i * src_stride + 3];
- }
- }
- x_q4 += x_step_q4;
- }
-
- // transpose the 4x4 filters values back to dst
- transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
- }
-
- src += src_stride * 4;
- dst += dst_stride * 4;
- }
-}
-
-static void filter_vert_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_cvtsi32_si128(*(const int *)src_ptr);
- const __m128i B = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch));
- const __m128i C = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 3));
- const __m128i E = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 4));
- const __m128i F = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 5));
- const __m128i G = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 6));
- const __m128i H = _mm_cvtsi32_si128(*(const int *)(src_ptr + src_pitch * 7));
- const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
- const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
- const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
- const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 4 bytes
- *(int *)dst = _mm_cvtsi128_si32(temp);
-}
-
-static void scaledconvolve_vert_w4(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
-
- if (y_q4 & SUBPEL_MASK) {
- filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
- } else {
- memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
- }
-
- y_q4 += y_step_q4;
- }
-}
-
-static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
- const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
- const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
- const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
- const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
- const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
- const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
- const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
- const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
- const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
- const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
- const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
- const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
- const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
- // add and saturate the results together
- const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
- const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
- __m128i temp = _mm_adds_epi16(x0, x3);
- temp = _mm_adds_epi16(temp, min_x2x1);
- temp = _mm_adds_epi16(temp, max_x2x1);
- // round and shift by 7 bit each 16 bit
- temp = _mm_mulhrs_epi16(temp, k_256);
- // shrink to 8 bit each 16 bits
- temp = _mm_packus_epi16(temp, temp);
- // save only 8 bytes convolve result
- _mm_storel_epi64((__m128i *)dst, temp);
-}
-
-static void scaledconvolve_vert_w8(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- if (y_q4 & SUBPEL_MASK) {
- filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
- } else {
- memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
- }
- y_q4 += y_step_q4;
- }
-}
-
-static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
- uint8_t *dst, const int16_t *filter, int w) {
- const __m128i k_256 = _mm_set1_epi16(1 << 8);
- const __m128i f_values = _mm_load_si128((const __m128i *)filter);
- // pack and duplicate the filter values
- const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
- const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
- const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
- const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
- int i;
-
- for (i = 0; i < w; i += 16) {
- const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
- const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
- const __m128i C =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
- const __m128i D =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
- const __m128i E =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
- const __m128i F =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
- const __m128i G =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
- const __m128i H =
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
- // merge the result together
- const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
- const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
- const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
- const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
- const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
- const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
- const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
- // add and saturate the results together
- const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
- const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
- // merge the result together
- const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
- const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
- const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
- // merge the result together
- const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
- const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
- // multiply 2 adjacent elements with the filter and add the result
- const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
- const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
- // add and saturate the results together
- __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
- __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));
-
- // add and saturate the results together
- temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
- temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
- // round and shift by 7 bit each 16 bit
- temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
- temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
- // shrink to 8 bit each 16 bits, the first lane contain the first
- // convolve result and the second lane contain the second convolve
- // result
- temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
- src_ptr += 16;
- // save 16 bytes convolve result
- _mm_store_si128((__m128i *)&dst[i], temp_hi);
- }
-}
-
-static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *y_filters, int y0_q4,
- int y_step_q4, int w, int h) {
- int y;
- int y_q4 = y0_q4;
-
- src -= src_stride * (SUBPEL_TAPS / 2 - 1);
- for (y = 0; y < h; ++y) {
- const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
- const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
- if (y_q4 & SUBPEL_MASK) {
- filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
- w);
- } else {
- memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
- }
- y_q4 += y_step_q4;
- }
-}
-
-static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride,
- uint8_t *dst, ptrdiff_t dst_stride,
- const InterpKernel *const x_filters, int x0_q4,
- int x_step_q4, const InterpKernel *const y_filters,
- int y0_q4, int y_step_q4, int w, int h) {
- // Note: Fixed size intermediate buffer, temp, places limits on parameters.
- // 2d filtering proceeds in 2 steps:
- // (1) Interpolate horizontally into an intermediate buffer, temp.
- // (2) Interpolate temp vertically to derive the sub-pixel result.
- // Deriving the maximum number of rows in the temp buffer (135):
- // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
- // --Largest block size is 64x64 pixels.
- // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
- // original frame (in 1/16th pixel units).
- // --Must round-up because block may be located at sub-pixel position.
- // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
- // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
- // --Require an additional 8 rows for the horiz_w8 transpose tail.
- DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
- const int intermediate_height =
- (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
-
- assert(w <= MAX_SB_SIZE);
- assert(h <= MAX_SB_SIZE);
- assert(y_step_q4 <= 32);
- assert(x_step_q4 <= 32);
-
- if (w >= 8) {
- scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
- x_step_q4, w, intermediate_height);
- } else {
- scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
- src_stride, temp, MAX_SB_SIZE, x_filters, x0_q4,
- x_step_q4, w, intermediate_height);
- }
-
- if (w >= 16) {
- scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
- MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
- y_step_q4, w, h);
- } else if (w == 8) {
- scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
- MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
- y_step_q4, w, h);
- } else {
- scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
- MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4,
- y_step_q4, w, h);
- }
-}
-
-static const InterpKernel *get_filter_base(const int16_t *filter) {
- // NOTE: This assumes that the filter table is 256-byte aligned.
- // TODO(agrange) Modify to make independent of table alignment.
- return (const InterpKernel *)(((intptr_t)filter) & ~((intptr_t)0xFF));
-}
-
-static int get_filter_offset(const int16_t *f, const InterpKernel *base) {
- return (int)((const InterpKernel *)(intptr_t)f - base);
-}
-
-void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
- ptrdiff_t dst_stride, const int16_t *filter_x,
- int x_step_q4, const int16_t *filter_y, int y_step_q4,
- int w, int h) {
- const InterpKernel *const filters_x = get_filter_base(filter_x);
- const int x0_q4 = get_filter_offset(filter_x, filters_x);
-
- const InterpKernel *const filters_y = get_filter_base(filter_y);
- const int y0_q4 = get_filter_offset(filter_y, filters_y);
-
- scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4,
- x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
-}
-
// void aom_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
// uint8_t *dst, ptrdiff_t dst_stride,
// const int16_t *filter_x, int x_step_q4,
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index f6e770b..103bfb9 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -39,14 +39,9 @@
struct ConvolveFunctions {
ConvolveFunctions(ConvolveFunc copy, ConvolveFunc avg, ConvolveFunc h8,
ConvolveFunc h8_avg, ConvolveFunc v8, ConvolveFunc v8_avg,
- ConvolveFunc hv8, ConvolveFunc hv8_avg, ConvolveFunc sh8,
- ConvolveFunc sh8_avg, ConvolveFunc sv8,
- ConvolveFunc sv8_avg, ConvolveFunc shv8,
- ConvolveFunc shv8_avg, int bd)
+ ConvolveFunc hv8, ConvolveFunc hv8_avg, int bd)
: copy_(copy), avg_(avg), h8_(h8), v8_(v8), hv8_(hv8), h8_avg_(h8_avg),
- v8_avg_(v8_avg), hv8_avg_(hv8_avg), sh8_(sh8), sv8_(sv8), shv8_(shv8),
- sh8_avg_(sh8_avg), sv8_avg_(sv8_avg), shv8_avg_(shv8_avg),
- use_highbd_(bd) {}
+ v8_avg_(v8_avg), hv8_avg_(hv8_avg), use_highbd_(bd) {}
ConvolveFunc copy_;
ConvolveFunc avg_;
@@ -56,12 +51,6 @@
ConvolveFunc h8_avg_;
ConvolveFunc v8_avg_;
ConvolveFunc hv8_avg_;
- ConvolveFunc sh8_; // scaled horiz
- ConvolveFunc sv8_; // scaled vert
- ConvolveFunc shv8_; // scaled horiz/vert
- ConvolveFunc sh8_avg_; // scaled avg horiz
- ConvolveFunc sv8_avg_; // scaled avg vert
- ConvolveFunc shv8_avg_; // scaled avg horiz/vert
int use_highbd_; // 0 if high bitdepth not used, else the actual bit depth.
};
@@ -524,63 +513,6 @@
<< "(" << x << "," << y << ")";
}
-TEST_P(ConvolveTest, CopyHoriz) {
- uint8_t *const in = input();
- uint8_t *const out = output();
- DECLARE_ALIGNED(256, const int16_t,
- filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
- ASM_REGISTER_STATE_CHECK(UUT_->sh8_(in, kInputStride, out, kOutputStride,
- filter8, 16, filter8, 16, Width(),
- Height()));
-
- CheckGuardBlocks();
-
- for (int y = 0; y < Height(); ++y)
- for (int x = 0; x < Width(); ++x)
- ASSERT_EQ(lookup(out, y * kOutputStride + x),
- lookup(in, y * kInputStride + x))
- << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, CopyVert) {
- uint8_t *const in = input();
- uint8_t *const out = output();
- DECLARE_ALIGNED(256, const int16_t,
- filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
- ASM_REGISTER_STATE_CHECK(UUT_->sv8_(in, kInputStride, out, kOutputStride,
- filter8, 16, filter8, 16, Width(),
- Height()));
-
- CheckGuardBlocks();
-
- for (int y = 0; y < Height(); ++y)
- for (int x = 0; x < Width(); ++x)
- ASSERT_EQ(lookup(out, y * kOutputStride + x),
- lookup(in, y * kInputStride + x))
- << "(" << x << "," << y << ")";
-}
-
-TEST_P(ConvolveTest, Copy2D) {
- uint8_t *const in = input();
- uint8_t *const out = output();
- DECLARE_ALIGNED(256, const int16_t,
- filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 };
-
- ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
- filter8, 16, filter8, 16, Width(),
- Height()));
-
- CheckGuardBlocks();
-
- for (int y = 0; y < Height(); ++y)
- for (int x = 0; x < Width(); ++x)
- ASSERT_EQ(lookup(out, y * kOutputStride + x),
- lookup(in, y * kInputStride + x))
- << "(" << x << "," << y << ")";
-}
-
const int kNumFilterBanks = SWITCHABLE_FILTERS;
const int kNumFilters = 16;
@@ -826,37 +758,6 @@
}
}
-/* This test exercises that enough rows and columns are filtered with every
- possible initial fractional positions and scaling steps. */
-TEST_P(ConvolveTest, CheckScalingFiltering) {
- uint8_t *const in = input();
- uint8_t *const out = output();
- const InterpKernel *const eighttap =
- (const InterpKernel *)av1_get_interp_filter_kernel(EIGHTTAP_REGULAR);
-
- SetConstantInput(127);
-
- for (int frac = 0; frac < 16; ++frac) {
- for (int step = 1; step <= 32; ++step) {
- /* Test the horizontal and vertical filters in combination. */
- ASM_REGISTER_STATE_CHECK(UUT_->shv8_(in, kInputStride, out, kOutputStride,
- eighttap[frac], step, eighttap[frac],
- step, Width(), Height()));
-
- CheckGuardBlocks();
-
- for (int y = 0; y < Height(); ++y) {
- for (int x = 0; x < Width(); ++x) {
- ASSERT_EQ(lookup(in, y * kInputStride + x),
- lookup(out, y * kOutputStride + x))
- << "x == " << x << ", y == " << y << ", frac == " << frac
- << ", step == " << step;
- }
- }
- }
- }
-}
-
TEST_P(ConvolveTest, DISABLED_Copy_Speed) {
const uint8_t *const in = input();
uint8_t *const out = output();
@@ -1066,24 +967,17 @@
const ConvolveFunctions convolve8_c(
wrap_convolve_copy_c_8, wrap_convolve_avg_c_8, wrap_convolve8_horiz_c_8,
wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
- wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8,
- wrap_convolve8_horiz_c_8, wrap_convolve8_avg_horiz_c_8,
- wrap_convolve8_vert_c_8, wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8,
- wrap_convolve8_avg_c_8, 8);
+ wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
const ConvolveFunctions convolve10_c(
wrap_convolve_copy_c_10, wrap_convolve_avg_c_10, wrap_convolve8_horiz_c_10,
wrap_convolve8_avg_horiz_c_10, wrap_convolve8_vert_c_10,
wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10, wrap_convolve8_avg_c_10,
- wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
- wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
- wrap_convolve8_avg_c_10, 10);
+ 10);
const ConvolveFunctions convolve12_c(
wrap_convolve_copy_c_12, wrap_convolve_avg_c_12, wrap_convolve8_horiz_c_12,
wrap_convolve8_avg_horiz_c_12, wrap_convolve8_vert_c_12,
wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12, wrap_convolve8_avg_c_12,
- wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
- wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
- wrap_convolve8_avg_c_12, 12);
+ 12);
const ConvolveParam kArrayConvolve_c[] = {
ALL_SIZES(convolve8_c), ALL_SIZES(convolve10_c), ALL_SIZES(convolve12_c)
};
@@ -1095,25 +989,16 @@
wrap_convolve_copy_sse2_8, wrap_convolve_avg_sse2_8,
wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
- wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8,
- wrap_convolve8_horiz_sse2_8, wrap_convolve8_avg_horiz_sse2_8,
- wrap_convolve8_vert_sse2_8, wrap_convolve8_avg_vert_sse2_8,
wrap_convolve8_sse2_8, wrap_convolve8_avg_sse2_8, 8);
const ConvolveFunctions convolve10_sse2(
wrap_convolve_copy_sse2_10, wrap_convolve_avg_sse2_10,
wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
- wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10,
- wrap_convolve8_horiz_sse2_10, wrap_convolve8_avg_horiz_sse2_10,
- wrap_convolve8_vert_sse2_10, wrap_convolve8_avg_vert_sse2_10,
wrap_convolve8_sse2_10, wrap_convolve8_avg_sse2_10, 10);
const ConvolveFunctions convolve12_sse2(
wrap_convolve_copy_sse2_12, wrap_convolve_avg_sse2_12,
wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
- wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12,
- wrap_convolve8_horiz_sse2_12, wrap_convolve8_avg_horiz_sse2_12,
- wrap_convolve8_vert_sse2_12, wrap_convolve8_avg_vert_sse2_12,
wrap_convolve8_sse2_12, wrap_convolve8_avg_sse2_12, 12);
const ConvolveParam kArrayConvolve_sse2[] = { ALL_SIZES(convolve8_sse2),
ALL_SIZES(convolve10_sse2),
@@ -1123,12 +1008,13 @@
#endif
#if HAVE_SSSE3
-const ConvolveFunctions convolve8_ssse3(
- aom_convolve_copy_c, aom_convolve_avg_c, aom_convolve8_horiz_ssse3,
- aom_convolve8_avg_horiz_ssse3, aom_convolve8_vert_ssse3,
- aom_convolve8_avg_vert_ssse3, aom_convolve8_ssse3, aom_convolve8_avg_ssse3,
- aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
- aom_scaled_avg_vert_c, aom_scaled_2d_ssse3, aom_scaled_avg_2d_c, 0);
+const ConvolveFunctions convolve8_ssse3(aom_convolve_copy_c, aom_convolve_avg_c,
+ aom_convolve8_horiz_ssse3,
+ aom_convolve8_avg_horiz_ssse3,
+ aom_convolve8_vert_ssse3,
+ aom_convolve8_avg_vert_ssse3,
+ aom_convolve8_ssse3,
+ aom_convolve8_avg_ssse3, 0);
const ConvolveParam kArrayConvolve8_ssse3[] = { ALL_SIZES(convolve8_ssse3) };
INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest,
@@ -1140,25 +1026,17 @@
wrap_convolve_copy_avx2_8, wrap_convolve_avg_avx2_8,
wrap_convolve8_horiz_avx2_8, wrap_convolve8_avg_horiz_avx2_8,
wrap_convolve8_vert_avx2_8, wrap_convolve8_avg_vert_avx2_8,
- wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, wrap_convolve8_horiz_c_8,
- wrap_convolve8_avg_horiz_c_8, wrap_convolve8_vert_c_8,
- wrap_convolve8_avg_vert_c_8, wrap_convolve8_c_8, wrap_convolve8_avg_c_8, 8);
+ wrap_convolve8_avx2_8, wrap_convolve8_avg_avx2_8, 8);
const ConvolveFunctions convolve10_avx2(
wrap_convolve_copy_avx2_10, wrap_convolve_avg_avx2_10,
wrap_convolve8_horiz_avx2_10, wrap_convolve8_avg_horiz_avx2_10,
wrap_convolve8_vert_avx2_10, wrap_convolve8_avg_vert_avx2_10,
- wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10,
- wrap_convolve8_horiz_c_10, wrap_convolve8_avg_horiz_c_10,
- wrap_convolve8_vert_c_10, wrap_convolve8_avg_vert_c_10, wrap_convolve8_c_10,
- wrap_convolve8_avg_c_10, 10);
+ wrap_convolve8_avx2_10, wrap_convolve8_avg_avx2_10, 10);
const ConvolveFunctions convolve12_avx2(
wrap_convolve_copy_avx2_12, wrap_convolve_avg_avx2_12,
wrap_convolve8_horiz_avx2_12, wrap_convolve8_avg_horiz_avx2_12,
wrap_convolve8_vert_avx2_12, wrap_convolve8_avg_vert_avx2_12,
- wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12,
- wrap_convolve8_horiz_c_12, wrap_convolve8_avg_horiz_c_12,
- wrap_convolve8_vert_c_12, wrap_convolve8_avg_vert_c_12, wrap_convolve8_c_12,
- wrap_convolve8_avg_c_12, 12);
+ wrap_convolve8_avx2_12, wrap_convolve8_avg_avx2_12, 12);
const ConvolveParam kArrayConvolve8_avx2[] = { ALL_SIZES_64(convolve8_avx2),
ALL_SIZES_64(convolve10_avx2),
ALL_SIZES_64(convolve12_avx2) };
@@ -1172,16 +1050,12 @@
const ConvolveFunctions convolve8_neon(
aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
aom_convolve8_avg_horiz_neon, aom_convolve8_vert_neon,
- aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon,
- aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
- aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+ aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon, 0);
#else // HAVE_NEON
const ConvolveFunctions convolve8_neon(
aom_convolve_copy_neon, aom_convolve_avg_neon, aom_convolve8_horiz_neon,
aom_convolve8_avg_horiz_neon, aom_convolve8_vert_neon,
- aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon,
- aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
- aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+ aom_convolve8_avg_vert_neon, aom_convolve8_neon, aom_convolve8_avg_neon, 0);
#endif // HAVE_NEON_ASM
const ConvolveParam kArrayConvolve8_neon[] = { ALL_SIZES_64(convolve8_neon) };
@@ -1195,8 +1069,7 @@
aom_convolve_copy_dspr2, aom_convolve_avg_dspr2, aom_convolve8_horiz_dspr2,
aom_convolve8_avg_horiz_dspr2, aom_convolve8_vert_dspr2,
aom_convolve8_avg_vert_dspr2, aom_convolve8_dspr2, aom_convolve8_avg_dspr2,
- aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
- aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+ 0);
const ConvolveParam kArrayConvolve8_dspr2[] = { ALL_SIZES_64(convolve8_dspr2) };
INSTANTIATE_TEST_CASE_P(DSPR2, ConvolveTest,
@@ -1208,9 +1081,7 @@
const ConvolveFunctions convolve8_msa(
aom_convolve_copy_msa, aom_convolve_avg_msa, aom_convolve8_horiz_msa,
aom_convolve8_avg_horiz_msa, aom_convolve8_vert_msa,
- aom_convolve8_avg_vert_msa, aom_convolve8_msa, aom_convolve8_avg_msa,
- aom_scaled_horiz_c, aom_scaled_avg_horiz_c, aom_scaled_vert_c,
- aom_scaled_avg_vert_c, aom_scaled_2d_c, aom_scaled_avg_2d_c, 0);
+ aom_convolve8_avg_vert_msa, aom_convolve8_msa, aom_convolve8_avg_msa, 0);
const ConvolveParam kArrayConvolve8_msa[] = { ALL_SIZES_64(convolve8_msa) };
INSTANTIATE_TEST_CASE_P(MSA, ConvolveTest,