Add Armv8.0 Neon horiz 2x1 scale spec. impl for convolve_2d_scale

AV1 has a limit on the scale ratio: the reference resolution cannot
be more than 2 times the source resolution in any dimension. Given
that the algorithm uses higher precision (1/1024-pel) for the step
size (section 7.11.3.4 of [1]), the horizontal scaling function can
be easily optimised for this specific case. The index of the source
pixel to be interpolated is calculated using the
(subpel_qn + x * step) >> 10 equation (a division by 1024), which
can be simplified if step is a multiple of 1024.

Add an implementation that specialises on x_step_qn equal to 2048,
which gives an uplift of around 33% when 2x1 scaling is applied.
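
For illustration only (plain scalar C, not part of the patch; 1024 is
1 << SCALE_SUBPEL_BITS and 2048 is the 2x1 x_step_qn), the index
simplification can be checked as follows:

  #include <assert.h>
  #include <stdio.h>

  int main(void) {
    const int step = 2 * 1024;  // x_step_qn == 2048, i.e. 2x1 scaling
    for (int subpel_qn = 0; subpel_qn < 1024; ++subpel_qn) {
      for (int x = 0; x < 64; ++x) {
        // Generic source index and filter phase computation.
        const int src_idx = (subpel_qn + x * step) >> 10;
        const int phase = (subpel_qn + x * step) & 1023;
        // Simplified forms used by the specialised path: a fixed
        // stride of 2 source pixels and a single filter per row.
        assert(src_idx == 2 * x);
        assert(phase == subpel_qn);
      }
    }
    printf("2x1 index simplification holds\n");
    return 0;
  }
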
[1] https://aomediacodec.github.io/av1-spec/av1-spec.pdf

Change-Id: I9127ca4e6b4188a4dabe4cfd416efe4d762b2e9f
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index b5deb9c..fa571a6 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -642,6 +642,28 @@
vst1_s16(s, s3);
}
+static INLINE void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
+ const int16x4_t s0, const int16x4_t s1,
+ const int16x4_t s2, const int16x4_t s3,
+ const int16x4_t s4, const int16x4_t s5,
+ const int16x4_t s6, const int16x4_t s7) {
+ vst1_s16(s, s0);
+ s += dst_stride;
+ vst1_s16(s, s1);
+ s += dst_stride;
+ vst1_s16(s, s2);
+ s += dst_stride;
+ vst1_s16(s, s3);
+ s += dst_stride;
+ vst1_s16(s, s4);
+ s += dst_stride;
+ vst1_s16(s, s5);
+ s += dst_stride;
+ vst1_s16(s, s6);
+ s += dst_stride;
+ vst1_s16(s, s7);
+}
+
static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
const int16x8_t s0, const int16x8_t s1,
const int16x8_t s2, const int16x8_t s3) {
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index 8027018..9fc4fb0 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -325,6 +325,41 @@
*a3 = vreinterpret_u8_u16(c1.val[1]);
}
+static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
+ uint8x16_t *a1,
+ uint8x16_t *a2,
+ uint8x16_t *a3) {
+ // Swap 8 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
+ // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
+ // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
+ // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
+ // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
+ // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
+ // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315
+
+ const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
+ const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);
+
+ // Swap 16 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34 08 18 28 38 012 112 212 312
+ // c0.val[1]: 02 12 22 32 06 16 26 36 010 110 210 310 014 114 214 314
+ // c1.val[0]: 01 11 21 31 05 15 25 35 09 19 29 39 013 113 213 313
+ // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315
+
+ const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+ vreinterpretq_u16_u8(b1.val[0]));
+ const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+ vreinterpretq_u16_u8(b1.val[1]));
+
+ *a0 = vreinterpretq_u8_u16(c0.val[0]);
+ *a1 = vreinterpretq_u8_u16(c1.val[0]);
+ *a2 = vreinterpretq_u8_u16(c0.val[1]);
+ *a3 = vreinterpretq_u8_u16(c1.val[1]);
+}
+
static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
uint8x8_t *a1) {
// Swap 16 bit elements. Goes from:
@@ -885,6 +920,40 @@
out[7] = d3.val[1];
}
+static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1,
+ int16x8_t *a2,
+ int16x8_t *a3) {
+ // Swap 16 bit elements. Goes from:
+ // a0: 00 01 02 03 04 05 06 07
+ // a1: 10 11 12 13 14 15 16 17
+ // a2: 20 21 22 23 24 25 26 27
+ // a3: 30 31 32 33 34 35 36 37
+ // to:
+ // b0.val[0]: 00 10 02 12 04 14 06 16
+ // b0.val[1]: 01 11 03 13 05 15 07 17
+ // b1.val[0]: 20 30 22 32 24 34 26 36
+ // b1.val[1]: 21 31 23 33 25 35 27 37
+
+ const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+ const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+
+ // Swap 32 bit elements resulting in:
+ // c0.val[0]: 00 10 20 30 04 14 24 34
+ // c0.val[1]: 02 12 22 32 06 16 26 36
+ // c1.val[0]: 01 11 21 31 05 15 25 35
+ // c1.val[1]: 03 13 23 33 07 17 27 37
+
+ const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+ vreinterpretq_s32_s16(b1.val[0]));
+ const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+ vreinterpretq_s32_s16(b1.val[1]));
+
+ *a0 = vreinterpretq_s16_s32(c0.val[0]);
+ *a1 = vreinterpretq_s16_s32(c1.val[0]);
+ *a2 = vreinterpretq_s16_s32(c0.val[1]);
+ *a3 = vreinterpretq_s16_s32(c1.val[1]);
+}
+
static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
uint16x4_t *a1,
uint16x4_t *a2,
diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c
index 88d126e..114232d 100644
--- a/av1/common/arm/av1_convolve_scale_neon.c
+++ b/av1/common/arm/av1_convolve_scale_neon.c
@@ -351,6 +351,284 @@
}
}
+static INLINE void convolve_horiz_scale_2_8tap_neon(
+ const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filter) {
+ const int bd = 8;
+
+ if (w == 4) {
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_offset =
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ const int16x8_t filter = vld1q_s16(x_filter);
+
+ do {
+ uint8x16_t t0, t1, t2, t3;
+ load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3);
+
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+ int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+ int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+ int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+ int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+ int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+ int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+
+ int16x4_t s0 = vget_low_s16(tt0);
+ int16x4_t s1 = vget_low_s16(tt1);
+ int16x4_t s2 = vget_low_s16(tt2);
+ int16x4_t s3 = vget_low_s16(tt3);
+ int16x4_t s4 = vget_high_s16(tt0);
+ int16x4_t s5 = vget_high_s16(tt1);
+ int16x4_t s6 = vget_high_s16(tt2);
+ int16x4_t s7 = vget_high_s16(tt3);
+ int16x4_t s8 = vget_low_s16(tt4);
+ int16x4_t s9 = vget_low_s16(tt5);
+ int16x4_t s10 = vget_low_s16(tt6);
+ int16x4_t s11 = vget_low_s16(tt7);
+ int16x4_t s12 = vget_high_s16(tt4);
+ int16x4_t s13 = vget_high_s16(tt5);
+
+ int16x4_t d0 =
+ convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+ int16x4_t d1 =
+ convolve8_4_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+ int16x4_t d2 =
+ convolve8_4_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, horiz_offset);
+ int16x4_t d3 = convolve8_4_h(s6, s7, s8, s9, s10, s11, s12, s13, filter,
+ horiz_offset);
+
+ transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+
+ store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+ dst += 4 * dst_stride;
+ src += 4 * src_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The additional -1 is needed because we are halving the filter values.
+ const int16x8_t horiz_offset =
+ vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
+ // Filter values are all even so halve them to allow convolution
+ // kernel computations to stay in 16-bit element types.
+ const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1);
+
+ do {
+ const uint8_t *s = src;
+ int16_t *d = dst;
+ int width = w;
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3,
+ &t4, &t5, &t6, &t7);
+
+ s += 8;
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ do {
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+ transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9,
+ &t10, &t11, &t12, &t13, &t14, &t15);
+
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s15 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ int16x8_t d0 =
+ convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+ int16x8_t d1 =
+ convolve8_8_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+ int16x8_t d2 = convolve8_8_h(s4, s5, s6, s7, s8, s9, s10, s11, filter,
+ horiz_offset);
+ int16x8_t d3 = convolve8_8_h(s6, s7, s8, s9, s10, s11, s12, s13, filter,
+ horiz_offset);
+
+ transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3);
+
+ store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1),
+ vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0),
+ vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3));
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+ s7 = s15;
+
+ s += 8;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+
+ dst += 8 * dst_stride;
+ src += 8 * src_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+}
+
+static INLINE void convolve_horiz_scale_2_6tap_neon(
+ const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
+ int h, const int16_t *x_filter) {
+ const int bd = 8;
+
+ if (w == 4) {
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ const int32x4_t horiz_offset =
+ vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+ const int16x8_t filter = vld1q_s16(x_filter);
+
+ do {
+ uint8x16_t t0, t1, t2, t3;
+ load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3);
+ transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3);
+
+ int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+ int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+ int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+ int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+ int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+ int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+ int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+ int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+
+ int16x4_t s0 = vget_low_s16(tt0);
+ int16x4_t s1 = vget_low_s16(tt1);
+ int16x4_t s2 = vget_low_s16(tt2);
+ int16x4_t s3 = vget_high_s16(tt3);
+ int16x4_t s4 = vget_high_s16(tt0);
+ int16x4_t s5 = vget_high_s16(tt1);
+ int16x4_t s6 = vget_high_s16(tt2);
+ int16x4_t s7 = vget_low_s16(tt4);
+ int16x4_t s8 = vget_low_s16(tt5);
+ int16x4_t s9 = vget_low_s16(tt6);
+ int16x4_t s10 = vget_low_s16(tt7);
+ int16x4_t s11 = vget_high_s16(tt4);
+
+ int16x4_t d0 =
+ convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);
+ int16x4_t d1 =
+ convolve6_4_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+ int16x4_t d2 =
+ convolve6_4_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+ int16x4_t d3 =
+ convolve6_4_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset);
+
+ transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+
+ store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+ dst += 4 * dst_stride;
+ src += 4 * src_stride;
+ h -= 4;
+ } while (h > 0);
+ } else {
+ // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+ // shifts - which are generally faster than rounding shifts on modern CPUs.
+ // The additional -1 is needed because we are halving the filter values.
+ const int16x8_t horiz_offset =
+ vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
+ // Filter values are all even so halve them to allow convolution
+ // kernel computations to stay in 16-bit element types.
+ const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1);
+
+ do {
+ const uint8_t *s = src;
+ int16_t *d = dst;
+ int width = w;
+
+ uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+ load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+ transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3,
+ &t4, &t5, &t6, &t7);
+
+ s += 8;
+
+ int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1));
+ int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2));
+ int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3));
+ int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4));
+ int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5));
+ int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6));
+ int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+ do {
+ uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+ load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+ &t15);
+ transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9,
+ &t10, &t11, &t12, &t13, &t14, &t15);
+
+ int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+ int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+ int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+ int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+ int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+ int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+ int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+ int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+ int16x8_t d0 =
+ convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);
+ int16x8_t d1 =
+ convolve6_8_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+ int16x8_t d2 =
+ convolve6_8_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+ int16x8_t d3 =
+ convolve6_8_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset);
+
+ transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3);
+
+ store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1),
+ vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0),
+ vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3));
+
+ s0 = s8;
+ s1 = s9;
+ s2 = s10;
+ s3 = s11;
+ s4 = s12;
+ s5 = s13;
+ s6 = s14;
+
+ s += 8;
+ d += 4;
+ width -= 4;
+ } while (width != 0);
+
+ dst += 8 * dst_stride;
+ src += 8 * src_stride;
+ h -= 8;
+ } while (h > 0);
+ }
+}
+
void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride, int w, int h,
const InterpFilterParams *filter_params_x,
@@ -382,14 +660,41 @@
const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
// Horizontal filter
- if (filter_params_x->interp_filter == MULTITAP_SHARP) {
- convolve_horiz_scale_8tap_neon(
- src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
- im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+
+ if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) {
+ if (filter_params_x->interp_filter == MULTITAP_SHARP) {
+ convolve_horiz_scale_8tap_neon(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+ } else {
+ convolve_horiz_scale_6tap_neon(
+ src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+ im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+ }
} else {
- convolve_horiz_scale_6tap_neon(
- src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
- im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+ assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS));
+ // The filter index is calculated using the
+ // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS
+ // equation, where the values of x are from 0 to w. If x_step_qn is a
+ // multiple of (SCALE_SUBPEL_MASK + 1) we can leave it out of the equation.
+ const ptrdiff_t filter_offset =
+ SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+ const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset;
+
+ // The source index is calculated using the (subpel_x_qn + x * x_step_qn)
+ // >> SCALE_SUBPEL_BITS equation, where the values of x are from 0 to w. If
+ // subpel_x_qn < (1 << SCALE_SUBPEL_BITS) and x_step_qn % (1 <<
+ // SCALE_SUBPEL_BITS) == 0, the source index can be determined using the
+ // value x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)).
+ if (filter_params_x->interp_filter == MULTITAP_SHARP) {
+ convolve_horiz_scale_2_8tap_neon(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w, im_h,
+ x_filter);
+ } else {
+ convolve_horiz_scale_2_6tap_neon(src - horiz_offset - vert_offset,
+ src_stride, im_block, im_stride, w, im_h,
+ x_filter);
+ }
}
// Vertical filter