Add SSE2 variant for MULTITAP_SHARP2 2D-convolve
This CL adds an SSE2 intrinsics implementation of av1_convolve_2d_sr_c
for the MULTITAP_SHARP2 case, and updates AV1Convolve2DTest to exercise
the new intrinsic and measure module-level gains.

Module-level speedup is ~11.5x w.r.t. the C implementation.
Change-Id: Ie32de0d73ea7cf2cdeb8382afca419b8e069ae13
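
For reference, a minimal sketch of the SIMD shape the new 12-tap kernel
uses (see the convolve_2d_sse2.c hunk below): the 12-tap dot product is
split into six _mm_madd_epi16 operations, one per coefficient pair,
evaluated once for the even-index and once for the odd-index output
phase. This helper is illustrative, not the patch verbatim; coeffs[] is
assumed to be in the pair-broadcast layout that prepare_coeffs_12tap()
produces in the hunk below.

    #include <emmintrin.h>
    #include <stdint.h>

    // Partial sums for the four even-index outputs of an 8-pixel group
    // under a 12-tap horizontal filter. coeffs[k] holds the pair
    // (c[2k], c[2k+1]) repeated across all lanes.
    static __m128i madd_12tap_even(const uint8_t *src,
                                   const __m128i coeffs[6]) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i lo = _mm_loadu_si128((const __m128i *)src);
      const __m128i hi = _mm_loadu_si128((const __m128i *)(src + 4));
      // Taps (0,1) and (2,3) come from the first load.
      const __m128i r0 =
          _mm_madd_epi16(_mm_unpacklo_epi8(lo, zero), coeffs[0]);
      const __m128i r1 = _mm_madd_epi16(
          _mm_unpacklo_epi8(_mm_srli_si128(lo, 2), zero), coeffs[1]);
      // Taps (4,5) through (10,11) come from the second, offset load.
      const __m128i r2 =
          _mm_madd_epi16(_mm_unpacklo_epi8(hi, zero), coeffs[2]);
      const __m128i r3 = _mm_madd_epi16(
          _mm_unpacklo_epi8(_mm_srli_si128(hi, 2), zero), coeffs[3]);
      const __m128i r4 = _mm_madd_epi16(
          _mm_unpacklo_epi8(_mm_srli_si128(hi, 4), zero), coeffs[4]);
      const __m128i r5 = _mm_madd_epi16(
          _mm_unpacklo_epi8(_mm_srli_si128(hi, 6), zero), coeffs[5]);
      // Four 32-bit sums, one per even-index output pixel; rounding and
      // packing happen afterwards, as in the real kernel.
      return _mm_add_epi32(
          _mm_add_epi32(_mm_add_epi32(r0, r1), _mm_add_epi32(r2, r3)),
          _mm_add_epi32(r4, r5));
    }
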
diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h
index 9e3b73e..9312e9e 100644
--- a/aom_dsp/x86/convolve_common_intrin.h
+++ b/aom_dsp/x86/convolve_common_intrin.h
@@ -110,4 +110,11 @@
const InterpFilterParams *filter_params_x,
int subpel_x_qn, ConvolveParams *conv_params);
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params);
+
#endif // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
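
A usage note on the prototype above, matching the wrappers added later
in this CL: the 12-tap kernel processes eight output pixels per inner
iteration and assumes taps > 8, so the wrappers send w < 8 blocks to
the C function and everything with 8 or fewer taps down the
pre-existing SIMD path. A sketch of that dispatch shape (the final
branch is elided since it is unchanged code):

    #include "aom_dsp/x86/convolve_common_intrin.h"
    #include "av1/common/convolve.h"

    static void convolve_2d_sr_dispatch_sketch(
        const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride,
        int w, int h, const InterpFilterParams *filter_params_x,
        const InterpFilterParams *filter_params_y, const int subpel_x_qn,
        const int subpel_y_qn, ConvolveParams *conv_params) {
      if (filter_params_x->taps > 8) {
        if (w < 8) {
          // Narrow blocks: the C path handles any width.
          av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y,
                               subpel_x_qn, subpel_y_qn, conv_params);
        } else {
          av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride,
                                        w, h, filter_params_x,
                                        filter_params_y, subpel_x_qn,
                                        subpel_y_qn, conv_params);
        }
      } else {
        // <= 8-tap filters keep taking the existing SIMD path (not shown).
      }
    }
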
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 61523da..530d129 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -947,569 +947,469 @@
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
- int im_dst_stride;
- int width, height;
-#if defined(__aarch64__)
- uint8x8_t t0;
- uint8x8_t t1, t2, t3, t4, t5, t6, t7;
- const uint8_t *s;
-#endif
-
- DECLARE_ALIGNED(16, int16_t,
- im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
-
- const int bd = 8;
- const int im_h = h + filter_params_y->taps - 1;
- const int im_stride = MAX_SB_SIZE;
- const int vert_offset = filter_params_y->taps / 2 - 1;
- const int horiz_offset = filter_params_x->taps / 2 - 1;
-
- const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
- int16_t *dst_ptr;
-
- dst_ptr = im_block;
- im_dst_stride = im_stride;
- height = im_h;
- width = w;
-
- const int16_t round_bits =
- FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
- const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
-
- int16_t x_filter_tmp[8];
- int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
- // filter coeffs are even, so downshifting by 1 to reduce intermediate
- // precision requirements.
- filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
- vst1q_s16(&x_filter_tmp[0], filter_x_coef);
-
- assert(conv_params->round_0 > 0);
-
- if (w <= 4) {
- const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
- const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
-
-#if defined(__aarch64__)
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
- do {
- assert(height >= 4);
- s = src_ptr;
- __builtin_prefetch(s + 0 * src_stride);
- __builtin_prefetch(s + 1 * src_stride);
- __builtin_prefetch(s + 2 * src_stride);
- __builtin_prefetch(s + 3 * src_stride);
-
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
- s += 7;
-
- load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
- d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- horiz_const, shift_round_0);
- d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- horiz_const, shift_round_0);
- d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- horiz_const, shift_round_0);
- d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
- horiz_const, shift_round_0);
-
- transpose_s16_4x4d(&d0, &d1, &d2, &d3);
- if (w == 4) {
- vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
- vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
- vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
- vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
- } else if (w == 2) {
- vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
- vreinterpret_u32_s16(d0), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
- vreinterpret_u32_s16(d1), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
- vreinterpret_u32_s16(d2), 0);
- vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
- vreinterpret_u32_s16(d3), 0);
- }
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * im_dst_stride;
- height -= 4;
- } while (height >= 4);
-
- if (height) {
- assert(height < 4);
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
- }
-#else
- horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
-#endif
-
+ if (filter_params_x->taps > 8) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
} else {
- const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
- const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+ int im_dst_stride;
+ int width, height;
+#if defined(__aarch64__)
+ uint8x8_t t0;
+ uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+ const uint8_t *s;
+#endif
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+ const int bd = 8;
+ const int im_h = h + filter_params_y->taps - 1;
+ const int im_stride = MAX_SB_SIZE;
+ const int vert_offset = filter_params_y->taps / 2 - 1;
+ const int horiz_offset = filter_params_x->taps / 2 - 1;
+
+ const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+ int16_t *dst_ptr;
+
+ dst_ptr = im_block;
+ im_dst_stride = im_stride;
+ height = im_h;
+ width = w;
+
+ const int16_t round_bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+ int16_t x_filter_tmp[8];
+ int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+ // filter coeffs are even, so downshifting by 1 to reduce intermediate
+ // precision requirements.
+ filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+ vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+ assert(conv_params->round_0 > 0);
+
+ if (w <= 4) {
+ const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
#if defined(__aarch64__)
- int16_t *d_tmp;
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
- int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
- do {
- assert(height >= 8);
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
- __builtin_prefetch(src_ptr + 4 * src_stride);
- __builtin_prefetch(src_ptr + 5 * src_stride);
- __builtin_prefetch(src_ptr + 6 * src_stride);
- __builtin_prefetch(src_ptr + 7 * src_stride);
-
- load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
- width = w;
- s = src_ptr + 7;
- d_tmp = dst_ptr;
-
- __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
- __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
-
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
do {
- load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ assert(height >= 4);
+ s = src_ptr;
+ __builtin_prefetch(s + 0 * src_stride);
+ __builtin_prefetch(s + 1 * src_stride);
+ __builtin_prefetch(s + 2 * src_stride);
+ __builtin_prefetch(s + 3 * src_stride);
- transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
- s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
- s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
- s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
- s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
- s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
- s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
- s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
- s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
- res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
- horiz_const, shift_round_0);
- res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
- horiz_const, shift_round_0);
- res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
- horiz_const, shift_round_0);
- res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
- horiz_const, shift_round_0);
- res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
- horiz_const, shift_round_0);
- res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
- x_filter_tmp, horiz_const, shift_round_0);
- res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
- x_filter_tmp, horiz_const, shift_round_0);
- res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
- x_filter_tmp, horiz_const, shift_round_0);
-
- transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
- &res7);
-
- store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
- res6, res7);
-
- s0 = s8;
- s1 = s9;
- s2 = s10;
- s3 = s11;
- s4 = s12;
- s5 = s13;
- s6 = s14;
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 8 * src_stride;
- dst_ptr += 8 * im_dst_stride;
- height -= 8;
- } while (height >= 8);
-
- if (height >= 4) {
- assert(height < 8);
- int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- reg10, reg11, reg12, reg13, reg14;
- int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
- int16x8_t out0, out1, out2, out3;
-
- __builtin_prefetch(src_ptr + 0 * src_stride);
- __builtin_prefetch(src_ptr + 1 * src_stride);
- __builtin_prefetch(src_ptr + 2 * src_stride);
- __builtin_prefetch(src_ptr + 3 * src_stride);
-
- load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
- transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
- reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
- __builtin_prefetch(dst_ptr + 0 * dst_stride);
- __builtin_prefetch(dst_ptr + 1 * dst_stride);
- __builtin_prefetch(dst_ptr + 2 * dst_stride);
- __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
- s = src_ptr + 7;
- d_tmp = dst_ptr;
- width = w;
-
- do {
load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
transpose_u8_8x4(&t0, &t1, &t2, &t3);
- reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
- reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
- reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
- d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
- x_filter_tmp);
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ s += 7;
- d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
- x_filter_tmp);
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
- d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
- x_filter_tmp);
+ s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
- d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
- x_filter_tmp);
+ d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+ horiz_const, shift_round_0);
- d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
- x_filter_tmp);
+ transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+ if (w == 4) {
+ vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+ vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+ vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+ vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+ } else if (w == 2) {
+ vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+ vreinterpret_u32_s16(d0), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+ vreinterpret_u32_s16(d1), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+ vreinterpret_u32_s16(d2), 0);
+ vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+ vreinterpret_u32_s16(d3), 0);
+ }
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * im_dst_stride;
+ height -= 4;
+ } while (height >= 4);
- d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
- x_filter_tmp);
+ if (height) {
+ assert(height < 4);
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride,
+ w, height, x_filter_tmp, horiz_const,
+ shift_round_0);
+ }
+#else
+ horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+ height, x_filter_tmp, horiz_const,
+ shift_round_0);
+#endif
- d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
- x_filter_tmp);
+ } else {
+ const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+ const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
- d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
- x_filter_tmp);
+#if defined(__aarch64__)
+ int16_t *d_tmp;
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+ int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+ do {
+ assert(height >= 8);
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+ __builtin_prefetch(src_ptr + 4 * src_stride);
+ __builtin_prefetch(src_ptr + 5 * src_stride);
+ __builtin_prefetch(src_ptr + 6 * src_stride);
+ __builtin_prefetch(src_ptr + 7 * src_stride);
- transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
- &out2, &out3);
+ load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+ &t7);
- out0 = vaddq_s16(out0, horiz_const);
- out0 = vqrshlq_s16(out0, shift_round_0);
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- out1 = vaddq_s16(out1, horiz_const);
- out1 = vqrshlq_s16(out1, shift_round_0);
+ s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
- out2 = vaddq_s16(out2, horiz_const);
- out2 = vqrshlq_s16(out2, shift_round_0);
+ width = w;
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
- out3 = vaddq_s16(out3, horiz_const);
- out3 = vqrshlq_s16(out3, shift_round_0);
+ __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+ __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
- store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+ do {
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- reg0 = reg8;
- reg1 = reg9;
- reg2 = reg10;
- reg3 = reg11;
- reg4 = reg12;
- reg5 = reg13;
- reg6 = reg14;
- s += 8;
- d_tmp += 8;
- width -= 8;
- } while (width > 0);
- src_ptr += 4 * src_stride;
- dst_ptr += 4 * im_dst_stride;
- height -= 4;
- }
+ transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
- if (height) {
- assert(height < 4);
+ s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+ horiz_const, shift_round_0);
+ res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+ x_filter_tmp, horiz_const, shift_round_0);
+ res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+ x_filter_tmp, horiz_const, shift_round_0);
+
+ transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+ &res7);
+
+ store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4,
+ res5, res6, res7);
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 8 * src_stride;
+ dst_ptr += 8 * im_dst_stride;
+ height -= 8;
+ } while (height >= 8);
+
+ if (height >= 4) {
+ assert(height < 8);
+ int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+ reg10, reg11, reg12, reg13, reg14;
+ int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+ int16x8_t out0, out1, out2, out3;
+
+ __builtin_prefetch(src_ptr + 0 * src_stride);
+ __builtin_prefetch(src_ptr + 1 * src_stride);
+ __builtin_prefetch(src_ptr + 2 * src_stride);
+ __builtin_prefetch(src_ptr + 3 * src_stride);
+
+ load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+ __builtin_prefetch(dst_ptr + 0 * dst_stride);
+ __builtin_prefetch(dst_ptr + 1 * dst_stride);
+ __builtin_prefetch(dst_ptr + 2 * dst_stride);
+ __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+ s = src_ptr + 7;
+ d_tmp = dst_ptr;
+ width = w;
+
+ do {
+ load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+ transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+ reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+ reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+ reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+ reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+ reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+ d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+ x_filter_tmp);
+
+ d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
+ x_filter_tmp);
+
+ d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+ x_filter_tmp);
+
+ d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
+ x_filter_tmp);
+
+ d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
+ x_filter_tmp);
+
+ d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
+ x_filter_tmp);
+
+ d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+ x_filter_tmp);
+
+ d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+ reg14, x_filter_tmp);
+
+ transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0,
+ &out1, &out2, &out3);
+
+ out0 = vaddq_s16(out0, horiz_const);
+ out0 = vqrshlq_s16(out0, shift_round_0);
+
+ out1 = vaddq_s16(out1, horiz_const);
+ out1 = vqrshlq_s16(out1, shift_round_0);
+
+ out2 = vaddq_s16(out2, horiz_const);
+ out2 = vqrshlq_s16(out2, shift_round_0);
+
+ out3 = vaddq_s16(out3, horiz_const);
+ out3 = vqrshlq_s16(out3, shift_round_0);
+
+ store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+
+ reg0 = reg8;
+ reg1 = reg9;
+ reg2 = reg10;
+ reg3 = reg11;
+ reg4 = reg12;
+ reg5 = reg13;
+ reg6 = reg14;
+ s += 8;
+ d_tmp += 8;
+ width -= 8;
+ } while (width > 0);
+ src_ptr += 4 * src_stride;
+ dst_ptr += 4 * im_dst_stride;
+ height -= 4;
+ }
+
+ if (height) {
+ assert(height < 4);
+ horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+ height, x_filter_tmp, horiz_const,
+ shift_round_0);
+ }
+#else
+
horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
height, x_filter_tmp, horiz_const,
shift_round_0);
+#endif
}
-#else
- horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
- height, x_filter_tmp, horiz_const,
- shift_round_0);
-#endif
- }
+ // vertical
+ {
+ uint8_t *dst_u8_ptr, *d_u8;
+ int16_t *v_src_ptr, *v_s;
- // vertical
- {
- uint8_t *dst_u8_ptr, *d_u8;
- int16_t *v_src_ptr, *v_s;
+ const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+ (1 << (offset_bits - conv_params->round_1 - 1));
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
- const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
- (1 << (offset_bits - conv_params->round_1 - 1));
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+ const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+ const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
- const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
- const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
- const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+ src_stride = im_stride;
+ v_src_ptr = im_block;
+ dst_u8_ptr = dst;
- src_stride = im_stride;
- v_src_ptr = im_block;
- dst_u8_ptr = dst;
+ height = h;
+ width = w;
- height = h;
- width = w;
-
- if (width <= 4) {
- int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint16x4_t d0;
- uint16x8_t dd0;
- uint8x8_t d01;
+ if (width <= 4) {
+ int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint16x4_t d0;
+ uint16x8_t dd0;
+ uint8x8_t d01;
#if defined(__aarch64__)
- int16x4_t s8, s9, s10;
- uint16x4_t d1, d2, d3;
- uint16x8_t dd1;
- uint8x8_t d23;
+ int16x4_t s8, s9, s10;
+ uint16x4_t d1, d2, d3;
+ uint16x8_t dd1;
+ uint8x8_t d23;
#endif
- d_u8 = dst_u8_ptr;
- v_s = v_src_ptr;
-
- __builtin_prefetch(v_s + 0 * im_stride);
- __builtin_prefetch(v_s + 1 * im_stride);
- __builtin_prefetch(v_s + 2 * im_stride);
- __builtin_prefetch(v_s + 3 * im_stride);
- __builtin_prefetch(v_s + 4 * im_stride);
- __builtin_prefetch(v_s + 5 * im_stride);
- __builtin_prefetch(v_s + 6 * im_stride);
- __builtin_prefetch(v_s + 7 * im_stride);
-
- load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- v_s += (7 * im_stride);
-
- do {
-#if defined(__aarch64__)
- load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
- v_s += (im_stride << 2);
-
- __builtin_prefetch(d_u8 + 0 * dst_stride);
- __builtin_prefetch(d_u8 + 1 * dst_stride);
- __builtin_prefetch(d_u8 + 2 * dst_stride);
- __builtin_prefetch(d_u8 + 3 * dst_stride);
-
- d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec);
- d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec);
- d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec);
- d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec);
-
- dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
- dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
-
- d01 = vqmovn_u16(dd0);
- d23 = vqmovn_u16(dd1);
-
- if ((w == 4) && (h != 2)) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 1); // 10 11 12 13
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
- 0); // 20 21 22 23
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
- 1); // 30 31 32 33
- d_u8 += dst_stride;
- } else if ((w == 2) && (h != 2)) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 2); // 10 11
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
- 0); // 20 21
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
- 2); // 30 31
- d_u8 += dst_stride;
- } else if ((w == 4) && (h == 2)) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 1); // 10 11 12 13
- d_u8 += dst_stride;
- } else if ((w == 2) && (h == 2)) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 2); // 10 11
- d_u8 += dst_stride;
- }
-
- s0 = s4;
- s1 = s5;
- s2 = s6;
- s3 = s7;
- s4 = s8;
- s5 = s9;
- s6 = s10;
- height -= 4;
-#else
- s7 = vld1_s16(v_s);
- v_s += im_stride;
-
- __builtin_prefetch(d_u8 + 0 * dst_stride);
-
- d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
- round_shift_vec, offset_const,
- sub_const_vec);
-
- dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
- d01 = vqmovn_u16(dd0);
-
- if (w == 4) {
- vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
- 0); // 00 01 02 03
- d_u8 += dst_stride;
-
- } else if (w == 2) {
- vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
- 0); // 00 01
- d_u8 += dst_stride;
- }
-
- s0 = s1;
- s1 = s2;
- s2 = s3;
- s3 = s4;
- s4 = s5;
- s5 = s6;
- s6 = s7;
- height -= 1;
-#endif
- } while (height > 0);
- } else {
- // if width is a multiple of 8 & height is a multiple of 4
- int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
- uint8x8_t res0;
-#if defined(__aarch64__)
- int16x8_t s8, s9, s10;
- uint8x8_t res1, res2, res3;
-#endif
-
- do {
- __builtin_prefetch(v_src_ptr + 0 * im_stride);
- __builtin_prefetch(v_src_ptr + 1 * im_stride);
- __builtin_prefetch(v_src_ptr + 2 * im_stride);
- __builtin_prefetch(v_src_ptr + 3 * im_stride);
- __builtin_prefetch(v_src_ptr + 4 * im_stride);
- __builtin_prefetch(v_src_ptr + 5 * im_stride);
- __builtin_prefetch(v_src_ptr + 6 * im_stride);
- __builtin_prefetch(v_src_ptr + 7 * im_stride);
-
- v_s = v_src_ptr;
- load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
- v_s += (7 * im_stride);
-
d_u8 = dst_u8_ptr;
- height = h;
+ v_s = v_src_ptr;
+
+ __builtin_prefetch(v_s + 0 * im_stride);
+ __builtin_prefetch(v_s + 1 * im_stride);
+ __builtin_prefetch(v_s + 2 * im_stride);
+ __builtin_prefetch(v_s + 3 * im_stride);
+ __builtin_prefetch(v_s + 4 * im_stride);
+ __builtin_prefetch(v_s + 5 * im_stride);
+ __builtin_prefetch(v_s + 6 * im_stride);
+ __builtin_prefetch(v_s + 7 * im_stride);
+
+ load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
do {
#if defined(__aarch64__)
- load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
v_s += (im_stride << 2);
- __builtin_prefetch(d_u8 + 4 * dst_stride);
- __builtin_prefetch(d_u8 + 5 * dst_stride);
- __builtin_prefetch(d_u8 + 6 * dst_stride);
- __builtin_prefetch(d_u8 + 7 * dst_stride);
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+ __builtin_prefetch(d_u8 + 1 * dst_stride);
+ __builtin_prefetch(d_u8 + 2 * dst_stride);
+ __builtin_prefetch(d_u8 + 3 * dst_stride);
- res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
- res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
+ d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
- if (h != 2) {
- vst1_u8(d_u8, res0);
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
+ dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
+
+ d01 = vqmovn_u16(dd0);
+ d23 = vqmovn_u16(dd1);
+
+ if ((w == 4) && (h != 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
d_u8 += dst_stride;
- vst1_u8(d_u8, res1);
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
d_u8 += dst_stride;
- vst1_u8(d_u8, res2);
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 0); // 20 21 22 23
d_u8 += dst_stride;
- vst1_u8(d_u8, res3);
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+ 1); // 30 31 32 33
d_u8 += dst_stride;
- } else {
- vst1_u8(d_u8, res0);
+ } else if ((w == 2) && (h != 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
d_u8 += dst_stride;
- vst1_u8(d_u8, res1);
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 0); // 20 21
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+ 2); // 30 31
+ d_u8 += dst_stride;
+ } else if ((w == 4) && (h == 2)) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 1); // 10 11 12 13
+ d_u8 += dst_stride;
+ } else if ((w == 2) && (h == 2)) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 2); // 10 11
d_u8 += dst_stride;
}
+
s0 = s4;
s1 = s5;
s2 = s6;
@@ -1519,17 +1419,28 @@
s6 = s10;
height -= 4;
#else
- s7 = vld1q_s16(v_s);
+ s7 = vld1_s16(v_s);
v_s += im_stride;
__builtin_prefetch(d_u8 + 0 * dst_stride);
- res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
- y_filter, round_shift_vec, offset_const,
- sub_const_vec, vec_round_bits);
+ d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+ round_shift_vec, offset_const,
+ sub_const_vec);
- vst1_u8(d_u8, res0);
- d_u8 += dst_stride;
+ dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+ d01 = vqmovn_u16(dd0);
+
+ if (w == 4) {
+ vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+ 0); // 00 01 02 03
+ d_u8 += dst_stride;
+
+ } else if (w == 2) {
+ vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+ 0); // 00 01
+ d_u8 += dst_stride;
+ }
s0 = s1;
s1 = s2;
@@ -1541,10 +1452,106 @@
height -= 1;
#endif
} while (height > 0);
- v_src_ptr += 8;
- dst_u8_ptr += 8;
- w -= 8;
- } while (w > 0);
+ } else {
+ // if width is a multiple of 8 & height is a multiple of 4
+ int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+ uint8x8_t res0;
+#if defined(__aarch64__)
+ int16x8_t s8, s9, s10;
+ uint8x8_t res1, res2, res3;
+#endif
+
+ do {
+ __builtin_prefetch(v_src_ptr + 0 * im_stride);
+ __builtin_prefetch(v_src_ptr + 1 * im_stride);
+ __builtin_prefetch(v_src_ptr + 2 * im_stride);
+ __builtin_prefetch(v_src_ptr + 3 * im_stride);
+ __builtin_prefetch(v_src_ptr + 4 * im_stride);
+ __builtin_prefetch(v_src_ptr + 5 * im_stride);
+ __builtin_prefetch(v_src_ptr + 6 * im_stride);
+ __builtin_prefetch(v_src_ptr + 7 * im_stride);
+
+ v_s = v_src_ptr;
+ load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+ v_s += (7 * im_stride);
+
+ d_u8 = dst_u8_ptr;
+ height = h;
+
+ do {
+#if defined(__aarch64__)
+ load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+ v_s += (im_stride << 2);
+
+ __builtin_prefetch(d_u8 + 4 * dst_stride);
+ __builtin_prefetch(d_u8 + 5 * dst_stride);
+ __builtin_prefetch(d_u8 + 6 * dst_stride);
+ __builtin_prefetch(d_u8 + 7 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(
+ s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_shift_vec,
+ offset_const, sub_const_vec, vec_round_bits);
+ res1 = convolve8_vert_8x4_s32(
+ s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round_shift_vec,
+ offset_const, sub_const_vec, vec_round_bits);
+ res2 = convolve8_vert_8x4_s32(
+ s2, s3, s4, s5, s6, s7, s8, s9, y_filter, round_shift_vec,
+ offset_const, sub_const_vec, vec_round_bits);
+ res3 = convolve8_vert_8x4_s32(
+ s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round_shift_vec,
+ offset_const, sub_const_vec, vec_round_bits);
+
+ if (h != 2) {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res2);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res3);
+ d_u8 += dst_stride;
+ } else {
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+ vst1_u8(d_u8, res1);
+ d_u8 += dst_stride;
+ }
+ s0 = s4;
+ s1 = s5;
+ s2 = s6;
+ s3 = s7;
+ s4 = s8;
+ s5 = s9;
+ s6 = s10;
+ height -= 4;
+#else
+ s7 = vld1q_s16(v_s);
+ v_s += im_stride;
+
+ __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+ res0 = convolve8_vert_8x4_s32(
+ s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_shift_vec,
+ offset_const, sub_const_vec, vec_round_bits);
+
+ vst1_u8(d_u8, res0);
+ d_u8 += dst_stride;
+
+ s0 = s1;
+ s1 = s2;
+ s2 = s3;
+ s3 = s4;
+ s4 = s5;
+ s5 = s6;
+ s6 = s7;
+ height -= 1;
+#endif
+ } while (height > 0);
+ v_src_ptr += 8;
+ dst_u8_ptr += 8;
+ w -= 8;
+ } while (w > 0);
+ }
}
}
}
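
An aside on the "filter coeffs are even" comment carried through the
NEON hunk above: every AV1 8-tap coefficient is even, so halving the
kernel halves the accumulated sum exactly, and the later shift drops
from round_0 to round_0 - 1. A scalar toy check of that identity; the
tap values below are made up (chosen even and summing to 128), not a
real AV1 kernel:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const int16_t c[8] = { -2, 6, -12, 124, 18, -8, 4, -2 };  // even taps
      const uint8_t s[8] = { 10, 20, 30, 40, 50, 60, 70, 80 };
      int32_t full = 0, half = 0;
      for (int k = 0; k < 8; ++k) {
        full += s[k] * c[k];
        half += s[k] * (c[k] >> 1);  // mirrors vshrq_n_s16(filter_x_coef, 1)
      }
      assert(2 * half == full);  // exact: halving even taps loses nothing
      return 0;
    }
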
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 391c063..7f01e36 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -548,16 +548,8 @@
subpel_y_qn);
} else {
assert(need_x && need_y);
-
- if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
- av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_qn,
- subpel_y_qn, conv_params);
- } else {
- av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
- filter_params_x, filter_params_y, subpel_x_qn,
- subpel_y_qn, conv_params);
- }
+ av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+ filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
}
}
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 56196aa..ded5ce5 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -25,7 +25,7 @@
extern "C" {
#endif
-#define MAX_FILTER_TAP 8
+#define MAX_FILTER_TAP 12
typedef enum ATTRIBUTE_PACKED {
EIGHTTAP_REGULAR,
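
The bump from 8 to 12 matters for intermediate-buffer sizing: the
two-pass convolve produces im_h = h + taps - 1 rows, so a 12-tap
vertical filter needs four more scratch rows than an 8-tap one. An
illustrative sketch of that arithmetic (the constants mirror the tree;
the helper name is ad hoc):

    enum { kMaxSbSize = 128, kMaxFilterTap = 12 };

    // Rows the horizontal pass must produce so the vertical pass can
    // emit h output rows with a taps-tap kernel.
    static int im_rows(int h, int taps) { return h + taps - 1; }

    // Worst case: im_rows(kMaxSbSize, kMaxFilterTap) == 139, which is
    // why im_block in the SSE2 hunk below is declared with
    // (MAX_SB_SIZE + MAX_FILTER_TAP - 1) rows.
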
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index 211f258..f78a7d0 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -26,87 +26,100 @@
const InterpFilterParams *filter_params_y,
const int subpel_x_qn, const int subpel_y_qn,
ConvolveParams *conv_params) {
- const int bd = 8;
- int im_stride = 8, i;
- DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
- const int bits =
- FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
- const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+ if (filter_params_x->taps > 8) {
+ if (w < 8) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ } else {
+ av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_qn, subpel_y_qn, conv_params);
+ }
+ } else {
+ const int bd = 8;
+ int im_stride = 8, i;
+ DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
- assert(conv_params->round_0 > 0);
+ assert(conv_params->round_0 > 0);
- const __m256i round_const_h = _mm256_set1_epi16(
- ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
- const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+ const __m256i round_const_h =
+ _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
+ (1 << (bd + FILTER_BITS - 2)));
+ const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
- const __m256i sum_round_v = _mm256_set1_epi32(
- (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
- const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+ const __m256i sum_round_v = _mm256_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
- const __m256i round_const_v = _mm256_set1_epi32(
- ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
- ((1 << (offset_bits - conv_params->round_1)) >> 1));
- const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+ const __m256i round_const_v = _mm256_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
- __m256i filt[4], coeffs_h[4], coeffs_v[4];
+ __m256i filt[4], coeffs_h[4], coeffs_v[4];
- filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
- filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+ filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+ filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
- prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
- prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
-
- const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
- int horiz_tap = SUBPEL_TAPS;
- int vert_tap = SUBPEL_TAPS;
-
- if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
- horiz_tap = 4;
- else if (!(filter_x[0] | filter_x[7]))
- horiz_tap = 6;
-
- if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
- vert_tap = 4;
- else if (!(filter_y[0] | filter_y[7]))
- vert_tap = 6;
-
- if (horiz_tap == 6)
- prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
- else
prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-
- if (vert_tap == 6)
- prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
- else
prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
- int im_h = h + vert_tap - 1;
- const int fo_vert = vert_tap / 2 - 1;
- const int fo_horiz = horiz_tap / 2 - 1;
- const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+ const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
- filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
- filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+ int horiz_tap = SUBPEL_TAPS;
+ int vert_tap = SUBPEL_TAPS;
- for (int j = 0; j < w; j += 8) {
- if (horiz_tap == 4) {
- CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
- } else if (horiz_tap == 6) {
- CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
- } else {
- CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
- }
+ if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
+ horiz_tap = 4;
+ else if (!(filter_x[0] | filter_x[7]))
+ horiz_tap = 6;
- if (vert_tap == 4) {
- CONVOLVE_SR_VERTICAL_FILTER_4TAP
- } else if (vert_tap == 6) {
- CONVOLVE_SR_VERTICAL_FILTER_6TAP
- } else {
- CONVOLVE_SR_VERTICAL_FILTER_8TAP
+ if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
+ vert_tap = 4;
+ else if (!(filter_y[0] | filter_y[7]))
+ vert_tap = 6;
+
+ if (horiz_tap == 6)
+ prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+ else
+ prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+
+ if (vert_tap == 6)
+ prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
+ else
+ prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+ int im_h = h + vert_tap - 1;
+ const int fo_vert = vert_tap / 2 - 1;
+ const int fo_horiz = horiz_tap / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+ filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+ for (int j = 0; j < w; j += 8) {
+ if (horiz_tap == 4) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
+ } else if (horiz_tap == 6) {
+ CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
+ } else {
+ CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
+ }
+
+ if (vert_tap == 4) {
+ CONVOLVE_SR_VERTICAL_FILTER_4TAP
+ } else if (vert_tap == 6) {
+ CONVOLVE_SR_VERTICAL_FILTER_6TAP
+ } else {
+ CONVOLVE_SR_VERTICAL_FILTER_8TAP
+ }
}
}
}
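
One pattern worth noting from the AVX2 hunk above: an 8-tap kernel is
demoted to a 6- or 4-tap one when its outer coefficients are zero,
which is what selects the cheaper CONVOLVE_SR_*_4TAP/6TAP paths. A
standalone restatement of that check (the helper name is hypothetical):

    #include <stdint.h>

    // Returns 4, 6 or 8 depending on which outer taps of an 8-tap
    // kernel are zero, mirroring the filter_x/filter_y tests above.
    static int effective_taps(const int16_t f[8]) {
      if (!(f[0] | f[1] | f[6] | f[7])) return 4;
      if (!(f[0] | f[7])) return 6;
      return 8;
    }
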
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index 1db9853..ca88bd7 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -16,20 +16,21 @@
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
#include "av1/common/convolve.h"
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
- int dst_stride, int w, int h,
- const InterpFilterParams *filter_params_x,
- const InterpFilterParams *filter_params_y,
- const int subpel_x_qn, const int subpel_y_qn,
- ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
const int bd = 8;
DECLARE_ALIGNED(16, int16_t,
im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
int im_h = h + filter_params_y->taps - 1;
- int im_stride = MAX_SB_SIZE;
+ int im_stride = w;
int i, j;
const int fo_vert = filter_params_y->taps / 2 - 1;
const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -41,26 +42,11 @@
const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
assert(conv_params->round_0 > 0);
+ __m128i coeffs[6];
/* Horizontal filter */
{
- const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_x, subpel_x_qn & SUBPEL_MASK);
- const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
const __m128i round_const = _mm_set1_epi32(
(1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
@@ -70,34 +56,54 @@
for (j = 0; j < w; j += 8) {
const __m128i data =
_mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+ const __m128i data_2 =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]);
// Filter even-index pixels
const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i src_6 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+ const __m128i src_8 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero);
+ const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+ const __m128i src_10 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero);
+ const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
- __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
- _mm_add_epi32(res_2, res_6));
+ const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
res_even =
_mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
// Filter odd-index pixels
const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+ const __m128i src_5 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+ const __m128i src_7 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+ const __m128i src_9 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero);
+ const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+ const __m128i src_11 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero);
+ const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
- __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
- _mm_add_epi32(res_3, res_7));
+ const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
res_odd =
_mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
@@ -110,23 +116,7 @@
/* Vertical filter */
{
- const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
- filter_params_y, subpel_y_qn & SUBPEL_MASK);
- const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
- // coeffs 0 1 0 1 2 3 2 3
- const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
- // coeffs 4 5 4 5 6 7 6 7
- const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
- // coeffs 0 1 0 1 0 1 0 1
- const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
- // coeffs 2 3 2 3 2 3 2 3
- const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
- // coeffs 4 5 4 5 4 5 4 5
- const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
- // coeffs 6 7 6 7 6 7 6 7
- const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+ prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
const __m128i sum_round =
_mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
@@ -153,14 +143,24 @@
const __m128i src_6 =
_mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
*(__m128i *)(data + 7 * im_stride));
+ const __m128i src_8 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride),
+ *(__m128i *)(data + 9 * im_stride));
+ const __m128i src_10 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride),
+ *(__m128i *)(data + 11 * im_stride));
- const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
- const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
- const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
- const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+ const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+ const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
- const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
_mm_add_epi32(res_4, res_6));
+ __m128i res_even =
+ _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
// Filter odd-index pixels
const __m128i src_1 =
@@ -175,14 +175,23 @@
const __m128i src_7 =
_mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
*(__m128i *)(data + 7 * im_stride));
+ const __m128i src_9 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride),
+ *(__m128i *)(data + 9 * im_stride));
+ const __m128i src_11 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride),
+ *(__m128i *)(data + 11 * im_stride));
- const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
- const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
- const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
- const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+ const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+ const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
- const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
- _mm_add_epi32(res_5, res_7));
+ const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
// Rearrange pixels back into the order 0 ... 7
const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
@@ -204,12 +213,223 @@
// Accumulate values into the destination buffer
__m128i *const p = (__m128i *)&dst[i * dst_stride + j];
- if (w == 2) {
- *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
- } else if (w == 4) {
- *(uint32_t *)p = _mm_cvtsi128_si32(res);
- } else {
- _mm_storel_epi64(p, res);
+ _mm_storel_epi64(p, res);
+ }
+ }
+ }
+}
+
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+ int dst_stride, int w, int h,
+ const InterpFilterParams *filter_params_x,
+ const InterpFilterParams *filter_params_y,
+ const int subpel_x_qn, const int subpel_y_qn,
+ ConvolveParams *conv_params) {
+ if (filter_params_x->taps > 8) {
+ if (w < 8) {
+ av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y, subpel_x_qn,
+ subpel_y_qn, conv_params);
+ } else {
+ av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+ filter_params_x, filter_params_y,
+ subpel_x_qn, subpel_y_qn, conv_params);
+ }
+ } else {
+ const int bd = 8;
+
+ DECLARE_ALIGNED(16, int16_t,
+ im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+ int im_h = h + filter_params_y->taps - 1;
+ int im_stride = MAX_SB_SIZE;
+ int i, j;
+ const int fo_vert = filter_params_y->taps / 2 - 1;
+ const int fo_horiz = filter_params_x->taps / 2 - 1;
+ const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+ const __m128i zero = _mm_setzero_si128();
+ const int bits =
+ FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+ const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+ assert(conv_params->round_0 > 0);
+
+ /* Horizontal filter */
+ {
+ const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_x, subpel_x_qn & SUBPEL_MASK);
+ const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+ for (i = 0; i < im_h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const __m128i data =
+ _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+ // Filter even-index pixels
+ const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i src_2 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i src_4 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i src_6 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+ _mm_add_epi32(res_2, res_6));
+ res_even =
+ _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i src_3 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i src_5 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i src_7 =
+ _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+ _mm_add_epi32(res_3, res_7));
+ res_odd =
+ _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+ // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+ __m128i res = _mm_packs_epi32(res_even, res_odd);
+ _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
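+ // Each intermediate row is thus stored with even columns in its low
+ // 64 bits and odd columns in its high 64 bits; the vertical pass
+ // relies on this layout and undoes it after filtering.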
+ }
+ }
+ }
+
+ /* Vertical filter */
+ {
+ const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+ filter_params_y, subpel_y_qn & SUBPEL_MASK);
+ const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+ // coeffs 0 1 0 1 2 3 2 3
+ const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+ // coeffs 4 5 4 5 6 7 6 7
+ const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+ // coeffs 0 1 0 1 0 1 0 1
+ const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+ // coeffs 2 3 2 3 2 3 2 3
+ const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+ // coeffs 4 5 4 5 4 5 4 5
+ const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+ // coeffs 6 7 6 7 6 7 6 7
+ const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
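+ // Same coefficient-pair layout as the horizontal pass, but here each
+ // _mm_madd_epi16 combines one column sample from two adjacent rows
+ // (taps k and k+1) of the intermediate buffer.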
+
+ const __m128i sum_round = _mm_set1_epi32(
+ (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+ const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+ const __m128i round_const = _mm_set1_epi32(
+ ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+ ((1 << (offset_bits - conv_params->round_1)) >> 1));
+ const __m128i round_shift = _mm_cvtsi32_si128(bits);
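+ // round_const folds the round-to-nearest offset for the final `bits`
+ // shift together with removal of the two biases added earlier
+ // (sum_round above and the horizontal pass's round_const).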
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ // Filter even-index pixels
+ const int16_t *data = &im_block[i * im_stride + j];
+ const __m128i src_0 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_2 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_4 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_6 =
+ _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+ const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+ const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+ const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+ const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+ _mm_add_epi32(res_4, res_6));
+
+ // Filter odd-index pixels
+ const __m128i src_1 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+ *(__m128i *)(data + 1 * im_stride));
+ const __m128i src_3 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+ *(__m128i *)(data + 3 * im_stride));
+ const __m128i src_5 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+ *(__m128i *)(data + 5 * im_stride));
+ const __m128i src_7 =
+ _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+ *(__m128i *)(data + 7 * im_stride));
+
+ const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+ const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+ const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+ const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+ const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+ _mm_add_epi32(res_5, res_7));
+
+ // Rearrange pixels back into the order 0 ... 7
+ const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+ const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+ __m128i res_lo_round =
+ _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+ __m128i res_hi_round =
+ _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+ res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+ round_shift);
+ res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+ round_shift);
+
+ const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+ const __m128i res = _mm_packus_epi16(res16, res16);
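+ // packs_epi32 narrows to 16 bits with signed saturation, then
+ // packus_epi16 clamps to the unsigned 8-bit pixel range.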
+
+ // Store the result into the destination buffer
+ __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+ if (w == 2) {
+ *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
+ } else if (w == 4) {
+ *(uint32_t *)p = _mm_cvtsi128_si32(res);
+ } else {
+ _mm_storel_epi64(p, res);
+ }
}
}
}
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 3ae4504..b764e4d 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -763,6 +763,32 @@
}
}
+ public:
+ void SpeedTest() {
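+ // Time every horizontal/vertical interpolation-filter pairing.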
+ for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+ for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+ TestConvolveSpeed(static_cast<InterpFilter>(h_f),
+ static_cast<InterpFilter>(v_f), 10000);
+ }
+ }
+ }
+
+ void RunTest12Tap() {
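+ // Sweep all 16x16 subpel phase combinations with the 12-tap
+ // MULTITAP_SHARP2 filter in both directions.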
+ for (int sub_x = 0; sub_x < 16; ++sub_x) {
+ for (int sub_y = 0; sub_y < 16; ++sub_y) {
+ TestConvolve(static_cast<InterpFilter>(MULTITAP_SHARP2),
+ static_cast<InterpFilter>(MULTITAP_SHARP2), sub_x, sub_y);
+ }
+ }
+ }
+
+ void SpeedTest12Tap() {
+ TestConvolveSpeed(static_cast<InterpFilter>(MULTITAP_SHARP2),
+ static_cast<InterpFilter>(MULTITAP_SHARP2), 10000);
+ }
+
private:
void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
const int sub_x, const int sub_y) {
@@ -775,9 +801,9 @@
const uint8_t *input = FirstRandomInput8(GetParam());
DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
- av1_convolve_2d_sr(input, width, reference, kOutputStride, width, height,
- filter_params_x, filter_params_y, sub_x, sub_y,
- &conv_params1);
+ av1_convolve_2d_sr_c(input, width, reference, kOutputStride, width, height,
+ filter_params_x, filter_params_y, sub_x, sub_y,
+ &conv_params1);
DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
@@ -785,10 +811,51 @@
&conv_params2);
AssertOutputBufferEq(reference, test, width, height);
}
+
+ private:
+ void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f,
+ int num_iters) {
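+ // Runs num_iters iterations of the C reference and of the function
+ // under test, then prints both elapsed times and the speedup ratio.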
+ const int width = GetParam().Block().Width();
+ const int height = GetParam().Block().Height();
+ const InterpFilterParams *filter_params_x =
+ av1_get_interp_filter_params_with_block_size(h_f, width);
+ const InterpFilterParams *filter_params_y =
+ av1_get_interp_filter_params_with_block_size(v_f, height);
+ const uint8_t *input = FirstRandomInput8(GetParam());
+ DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+ ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+ aom_usec_timer timer;
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ av1_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+ height, filter_params_x, filter_params_y, 0, 0,
+ &conv_params1);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+ ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+ aom_usec_timer_start(&timer);
+ for (int i = 0; i < num_iters; ++i) {
+ GetParam().TestFunction()(input, width, test, kOutputStride, width,
+ height, filter_params_x, filter_params_y, 0, 0,
+ &conv_params2);
+ }
+ aom_usec_timer_mark(&timer);
+ const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ printf("%d - %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", h_f, v_f, width, height,
+ time1, time2, time1 / time2);
+ }
};
TEST_P(AV1Convolve2DTest, RunTest) { RunTest(); }
+TEST_P(AV1Convolve2DTest, RunTest12Tap) { RunTest12Tap(); }
+
+TEST_P(AV1Convolve2DTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+TEST_P(AV1Convolve2DTest, DISABLED_SpeedTest12Tap) { SpeedTest12Tap(); }
+
INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DTest,
BuildLowbdParams(av1_convolve_2d_sr_c));