Add Armv8.0 Neon horiz 2x1 scale spec. impl for convolve_2d_scale

AV1 limits the scale ratio: the reference resolution cannot be more
than 2 times the source resolution in either dimension. Given that the
algorithm uses higher precision (1/1024-pel) for the step size
(section 7.11.3.4 [1]), the horizontal scaling function can easily be
optimised for this specific case. The indices of the source pixels to
be interpolated are calculated as (subpel_qn + x * step) / 1024, an
expression that simplifies when step is a multiple of 1024.
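
As a rough illustration (a minimal sketch only; the function below is
hypothetical and not part of the patch), with step == 2048 the index
and filter-phase computations collapse as follows:

  #include <assert.h>

  // Assumes step == 2048 (2x downscaling, a multiple of 1024) and an
  // initial offset subpel_qn below 1024, as guaranteed by the caller.
  static void sketch_2x1_index_simplification(int subpel_qn, int x) {
    const int step = 2048;
    assert(subpel_qn >= 0 && subpel_qn < 1024);
    // Source index: (subpel_qn + x * step) / 1024 reduces to 2 * x.
    assert(((subpel_qn + x * step) >> 10) == 2 * x);
    // Filter phase: the x * step term vanishes under the 1/1024-pel
    // mask, so one filter covers every output pixel in the row.
    assert(((subpel_qn + x * step) & 1023) == subpel_qn);
  }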

Add an implementation that specialises for x_step_qn equal to 2048,
which gives an uplift of around 33% when 2x1 scaling is applied.

[1] https://aomediacodec.github.io/av1-spec/av1-spec.pdf

Change-Id: I9127ca4e6b4188a4dabe4cfd416efe4d762b2e9f
diff --git a/aom_dsp/arm/mem_neon.h b/aom_dsp/arm/mem_neon.h
index b5deb9c..fa571a6 100644
--- a/aom_dsp/arm/mem_neon.h
+++ b/aom_dsp/arm/mem_neon.h
@@ -642,6 +642,28 @@
   vst1_s16(s, s3);
 }
 
+static INLINE void store_s16_4x8(int16_t *s, ptrdiff_t dst_stride,
+                                 const int16x4_t s0, const int16x4_t s1,
+                                 const int16x4_t s2, const int16x4_t s3,
+                                 const int16x4_t s4, const int16x4_t s5,
+                                 const int16x4_t s6, const int16x4_t s7) {
+  vst1_s16(s, s0);
+  s += dst_stride;
+  vst1_s16(s, s1);
+  s += dst_stride;
+  vst1_s16(s, s2);
+  s += dst_stride;
+  vst1_s16(s, s3);
+  s += dst_stride;
+  vst1_s16(s, s4);
+  s += dst_stride;
+  vst1_s16(s, s5);
+  s += dst_stride;
+  vst1_s16(s, s6);
+  s += dst_stride;
+  vst1_s16(s, s7);
+}
+
 static INLINE void store_s16_8x4(int16_t *s, ptrdiff_t dst_stride,
                                  const int16x8_t s0, const int16x8_t s1,
                                  const int16x8_t s2, const int16x8_t s3) {
diff --git a/aom_dsp/arm/transpose_neon.h b/aom_dsp/arm/transpose_neon.h
index 8027018..9fc4fb0 100644
--- a/aom_dsp/arm/transpose_neon.h
+++ b/aom_dsp/arm/transpose_neon.h
@@ -325,6 +325,41 @@
   *a3 = vreinterpret_u8_u16(c1.val[1]);
 }
 
+static INLINE void transpose_elems_inplace_u8_16x4(uint8x16_t *a0,
+                                                   uint8x16_t *a1,
+                                                   uint8x16_t *a2,
+                                                   uint8x16_t *a3) {
+  // Swap 8 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07 08 09 010 011 012 013 014 015
+  // a1: 10 11 12 13 14 15 16 17 18 19 110 111 112 113 114 115
+  // a2: 20 21 22 23 24 25 26 27 28 29 210 211 212 213 214 215
+  // a3: 30 31 32 33 34 35 36 37 38 39 310 311 312 313 314 315
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16 08 18 010 110 012 112 014 114
+  // b0.val[1]: 01 11 03 13 05 15 07 17 09 19 011 111 013 113 015 115
+  // b1.val[0]: 20 30 22 32 24 34 26 36 28 38 210 310 212 312 214 314
+  // b1.val[1]: 21 31 23 33 25 35 27 37 29 39 211 311 213 313 215 315
+
+  const uint8x16x2_t b0 = vtrnq_u8(*a0, *a1);
+  const uint8x16x2_t b1 = vtrnq_u8(*a2, *a3);
+
+  // Swap 16 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34 08  18  28  38  012 112 212 312
+  // c0.val[1]: 02 12 22 32 06 16 26 36 09  19  29  39  013 113 213 313
+  // c1.val[0]: 01 11 21 31 05 15 25 35 010 110 210 310 014 114 214 314
+  // c1.val[1]: 03 13 23 33 07 17 27 37 011 111 211 311 015 115 215 315
+
+  const uint16x8x2_t c0 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[0]),
+                                    vreinterpretq_u16_u8(b1.val[0]));
+  const uint16x8x2_t c1 = vtrnq_u16(vreinterpretq_u16_u8(b0.val[1]),
+                                    vreinterpretq_u16_u8(b1.val[1]));
+
+  *a0 = vreinterpretq_u8_u16(c0.val[0]);
+  *a1 = vreinterpretq_u8_u16(c1.val[0]);
+  *a2 = vreinterpretq_u8_u16(c0.val[1]);
+  *a3 = vreinterpretq_u8_u16(c1.val[1]);
+}
+
 static INLINE void transpose_elems_inplace_u8_4x4(uint8x8_t *a0,
                                                   uint8x8_t *a1) {
   // Swap 16 bit elements. Goes from:
@@ -885,6 +920,40 @@
   out[7] = d3.val[1];
 }
 
+static INLINE void transpose_elems_inplace_s16_8x4(int16x8_t *a0, int16x8_t *a1,
+                                                   int16x8_t *a2,
+                                                   int16x8_t *a3) {
+  // Swap 16 bit elements. Goes from:
+  // a0: 00 01 02 03 04 05 06 07
+  // a1: 10 11 12 13 14 15 16 17
+  // a2: 20 21 22 23 24 25 26 27
+  // a3: 30 31 32 33 34 35 36 37
+  // to:
+  // b0.val[0]: 00 10 02 12 04 14 06 16
+  // b0.val[1]: 01 11 03 13 05 15 07 17
+  // b1.val[0]: 20 30 22 32 24 34 26 36
+  // b1.val[1]: 21 31 23 33 25 35 27 37
+
+  const int16x8x2_t b0 = vtrnq_s16(*a0, *a1);
+  const int16x8x2_t b1 = vtrnq_s16(*a2, *a3);
+
+  // Swap 32 bit elements resulting in:
+  // c0.val[0]: 00 10 20 30 04 14 24 34
+  // c0.val[1]: 01 11 21 31 05 15 25 35
+  // c1.val[0]: 02 12 22 32 06 16 26 36
+  // c1.val[1]: 03 13 23 33 07 17 27 37
+
+  const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]),
+                                   vreinterpretq_s32_s16(b1.val[0]));
+  const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]),
+                                   vreinterpretq_s32_s16(b1.val[1]));
+
+  *a0 = vreinterpretq_s16_s32(c0.val[0]);
+  *a1 = vreinterpretq_s16_s32(c1.val[0]);
+  *a2 = vreinterpretq_s16_s32(c0.val[1]);
+  *a3 = vreinterpretq_s16_s32(c1.val[1]);
+}
+
 static INLINE void transpose_elems_inplace_u16_4x4(uint16x4_t *a0,
                                                    uint16x4_t *a1,
                                                    uint16x4_t *a2,
diff --git a/av1/common/arm/av1_convolve_scale_neon.c b/av1/common/arm/av1_convolve_scale_neon.c
index 88d126e..114232d 100644
--- a/av1/common/arm/av1_convolve_scale_neon.c
+++ b/av1/common/arm/av1_convolve_scale_neon.c
@@ -351,6 +351,284 @@
   }
 }
 
+static INLINE void convolve_horiz_scale_2_8tap_neon(
+    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
+    int h, const int16_t *x_filter) {
+  const int bd = 8;
+
+  if (w == 4) {
+    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    const int32x4_t horiz_offset =
+        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+    const int16x8_t filter = vld1q_s16(x_filter);
+
+    do {
+      uint8x16_t t0, t1, t2, t3;
+      load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3);
+      transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3);
+
+      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+      int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+      int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+      int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+      int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+      int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+      int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+
+      int16x4_t s0 = vget_low_s16(tt0);
+      int16x4_t s1 = vget_low_s16(tt1);
+      int16x4_t s2 = vget_low_s16(tt2);
+      int16x4_t s3 = vget_low_s16(tt3);
+      int16x4_t s4 = vget_high_s16(tt0);
+      int16x4_t s5 = vget_high_s16(tt1);
+      int16x4_t s6 = vget_high_s16(tt2);
+      int16x4_t s7 = vget_high_s16(tt3);
+      int16x4_t s8 = vget_low_s16(tt4);
+      int16x4_t s9 = vget_low_s16(tt5);
+      int16x4_t s10 = vget_low_s16(tt6);
+      int16x4_t s11 = vget_low_s16(tt7);
+      int16x4_t s12 = vget_high_s16(tt4);
+      int16x4_t s13 = vget_high_s16(tt5);
+
+      int16x4_t d0 =
+          convolve8_4_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+      int16x4_t d1 =
+          convolve8_4_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+      int16x4_t d2 =
+          convolve8_4_h(s4, s5, s6, s7, s8, s9, s10, s11, filter, horiz_offset);
+      int16x4_t d3 = convolve8_4_h(s6, s7, s8, s9, s10, s11, s12, s13, filter,
+                                   horiz_offset);
+
+      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+
+      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    // The additional -1 is needed because we are halving the filter values.
+    const int16x8_t horiz_offset =
+        vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
+    // Filter values are all even so halve them to allow convolution
+    // kernel computations to stay in 16-bit element types.
+    const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1);
+
+    do {
+      const uint8_t *s = src;
+      int16_t *d = dst;
+      int width = w;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3,
+                             &t4, &t5, &t6, &t7);
+
+      s += 8;
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
+      int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+      do {
+        uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+        load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+                    &t15);
+        transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9,
+                               &t10, &t11, &t12, &t13, &t14, &t15);
+
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t10));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t11));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t12));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t13));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t14));
+        int16x8_t s15 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+        int16x8_t d0 =
+            convolve8_8_h(s0, s1, s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+        int16x8_t d1 =
+            convolve8_8_h(s2, s3, s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+        int16x8_t d2 = convolve8_8_h(s4, s5, s6, s7, s8, s9, s10, s11, filter,
+                                     horiz_offset);
+        int16x8_t d3 = convolve8_8_h(s6, s7, s8, s9, s10, s11, s12, s13, filter,
+                                     horiz_offset);
+
+        transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3);
+
+        store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1),
+                      vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0),
+                      vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3));
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+        s7 = s15;
+
+        s += 8;
+        d += 4;
+        width -= 4;
+      } while (width != 0);
+
+      dst += 8 * dst_stride;
+      src += 8 * src_stride;
+      h -= 8;
+    } while (h > 0);
+  }
+}
+
+static INLINE void convolve_horiz_scale_2_6tap_neon(
+    const uint8_t *src, int src_stride, int16_t *dst, int dst_stride, int w,
+    int h, const int16_t *x_filter) {
+  const int bd = 8;
+
+  if (w == 4) {
+    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    const int32x4_t horiz_offset =
+        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+    const int16x8_t filter = vld1q_s16(x_filter);
+
+    do {
+      uint8x16_t t0, t1, t2, t3;
+      load_u8_16x4(src, src_stride, &t0, &t1, &t2, &t3);
+      transpose_elems_inplace_u8_16x4(&t0, &t1, &t2, &t3);
+
+      int16x8_t tt0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t1)));
+      int16x8_t tt1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t2)));
+      int16x8_t tt2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t3)));
+      int16x8_t tt3 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t0)));
+      int16x8_t tt4 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t0)));
+      int16x8_t tt5 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t1)));
+      int16x8_t tt6 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t2)));
+      int16x8_t tt7 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t3)));
+
+      int16x4_t s0 = vget_low_s16(tt0);
+      int16x4_t s1 = vget_low_s16(tt1);
+      int16x4_t s2 = vget_low_s16(tt2);
+      int16x4_t s3 = vget_high_s16(tt3);
+      int16x4_t s4 = vget_high_s16(tt0);
+      int16x4_t s5 = vget_high_s16(tt1);
+      int16x4_t s6 = vget_high_s16(tt2);
+      int16x4_t s7 = vget_low_s16(tt4);
+      int16x4_t s8 = vget_low_s16(tt5);
+      int16x4_t s9 = vget_low_s16(tt6);
+      int16x4_t s10 = vget_low_s16(tt7);
+      int16x4_t s11 = vget_high_s16(tt4);
+
+      int16x4_t d0 =
+          convolve6_4_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);
+      int16x4_t d1 =
+          convolve6_4_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+      int16x4_t d2 =
+          convolve6_4_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+      int16x4_t d3 =
+          convolve6_4_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset);
+
+      transpose_elems_inplace_s16_4x4(&d0, &d1, &d2, &d3);
+
+      store_s16_4x4(dst, dst_stride, d0, d1, d2, d3);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h > 0);
+  } else {
+    // A shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+    // shifts - which are generally faster than rounding shifts on modern CPUs.
+    // The additional -1 is needed because we are halving the filter values.
+    const int16x8_t horiz_offset =
+        vdupq_n_s16((1 << (bd + FILTER_BITS - 2)) + (1 << (ROUND0_BITS - 2)));
+    // Filter values are all even so halve them to allow convolution
+    // kernel computations to stay in 16-bit element types.
+    const int16x8_t filter = vshrq_n_s16(vld1q_s16(x_filter), 1);
+
+    do {
+      const uint8_t *s = src;
+      int16_t *d = dst;
+      int width = w;
+
+      uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+      load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+      transpose_elems_u8_8x8(t0, t1, t2, t3, t4, t5, t6, t7, &t0, &t1, &t2, &t3,
+                             &t4, &t5, &t6, &t7);
+
+      s += 8;
+
+      int16x8_t s0 = vreinterpretq_s16_u16(vmovl_u8(t1));
+      int16x8_t s1 = vreinterpretq_s16_u16(vmovl_u8(t2));
+      int16x8_t s2 = vreinterpretq_s16_u16(vmovl_u8(t3));
+      int16x8_t s3 = vreinterpretq_s16_u16(vmovl_u8(t4));
+      int16x8_t s4 = vreinterpretq_s16_u16(vmovl_u8(t5));
+      int16x8_t s5 = vreinterpretq_s16_u16(vmovl_u8(t6));
+      int16x8_t s6 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+      do {
+        uint8x8_t t8, t9, t10, t11, t12, t13, t14, t15;
+        load_u8_8x8(s, src_stride, &t8, &t9, &t10, &t11, &t12, &t13, &t14,
+                    &t15);
+        transpose_elems_u8_8x8(t8, t9, t10, t11, t12, t13, t14, t15, &t8, &t9,
+                               &t10, &t11, &t12, &t13, &t14, &t15);
+
+        int16x8_t s7 = vreinterpretq_s16_u16(vmovl_u8(t8));
+        int16x8_t s8 = vreinterpretq_s16_u16(vmovl_u8(t9));
+        int16x8_t s9 = vreinterpretq_s16_u16(vmovl_u8(t10));
+        int16x8_t s10 = vreinterpretq_s16_u16(vmovl_u8(t11));
+        int16x8_t s11 = vreinterpretq_s16_u16(vmovl_u8(t12));
+        int16x8_t s12 = vreinterpretq_s16_u16(vmovl_u8(t13));
+        int16x8_t s13 = vreinterpretq_s16_u16(vmovl_u8(t14));
+        int16x8_t s14 = vreinterpretq_s16_u16(vmovl_u8(t15));
+
+        int16x8_t d0 =
+            convolve6_8_h(s0, s1, s2, s3, s4, s5, filter, horiz_offset);
+        int16x8_t d1 =
+            convolve6_8_h(s2, s3, s4, s5, s6, s7, filter, horiz_offset);
+        int16x8_t d2 =
+            convolve6_8_h(s4, s5, s6, s7, s8, s9, filter, horiz_offset);
+        int16x8_t d3 =
+            convolve6_8_h(s6, s7, s8, s9, s10, s11, filter, horiz_offset);
+
+        transpose_elems_inplace_s16_8x4(&d0, &d1, &d2, &d3);
+
+        store_s16_4x8(d, dst_stride, vget_low_s16(d0), vget_low_s16(d1),
+                      vget_low_s16(d2), vget_low_s16(d3), vget_high_s16(d0),
+                      vget_high_s16(d1), vget_high_s16(d2), vget_high_s16(d3));
+
+        s0 = s8;
+        s1 = s9;
+        s2 = s10;
+        s3 = s11;
+        s4 = s12;
+        s5 = s13;
+        s6 = s14;
+
+        s += 8;
+        d += 4;
+        width -= 4;
+      } while (width != 0);
+
+      dst += 8 * dst_stride;
+      src += 8 * src_stride;
+      h -= 8;
+    } while (h > 0);
+  }
+}
+
 void av1_convolve_2d_scale_neon(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
@@ -382,14 +660,41 @@
   const ptrdiff_t vert_offset = (filter_params_y->taps / 2 - 1) * src_stride;
 
   // Horizontal filter
-  if (filter_params_x->interp_filter == MULTITAP_SHARP) {
-    convolve_horiz_scale_8tap_neon(
-        src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
-        im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+
+  if (x_step_qn != 2 * (1 << SCALE_SUBPEL_BITS)) {
+    if (filter_params_x->interp_filter == MULTITAP_SHARP) {
+      convolve_horiz_scale_8tap_neon(
+          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+          im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+    } else {
+      convolve_horiz_scale_6tap_neon(
+          src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
+          im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+    }
   } else {
-    convolve_horiz_scale_6tap_neon(
-        src - horiz_offset - vert_offset, src_stride, im_block, im_stride, w,
-        im_h, filter_params_x->filter_ptr, subpel_x_qn, x_step_qn);
+    assert(subpel_x_qn < (1 << SCALE_SUBPEL_BITS));
+    // The filter index is calculated using the
+    // ((subpel_x_qn + x * x_step_qn) & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS
+    // equation, where the values of x are from 0 to w. If x_step_qn is a
+    // multiple of SCALE_SUBPEL_MASK + 1 we can leave it out of the equation.
+    const ptrdiff_t filter_offset =
+        SUBPEL_TAPS * ((subpel_x_qn & SCALE_SUBPEL_MASK) >> SCALE_EXTRA_BITS);
+    const int16_t *x_filter = filter_params_x->filter_ptr + filter_offset;
+
+    // The source index is calculated as
+    // (subpel_x_qn + x * x_step_qn) >> SCALE_SUBPEL_BITS, where the values of
+    // x are from 0 to w. If subpel_x_qn < (1 << SCALE_SUBPEL_BITS) and
+    // x_step_qn % (1 << SCALE_SUBPEL_BITS) == 0, the source index simplifies
+    // to x * (x_step_qn / (1 << SCALE_SUBPEL_BITS)).
+    if (filter_params_x->interp_filter == MULTITAP_SHARP) {
+      convolve_horiz_scale_2_8tap_neon(src - horiz_offset - vert_offset,
+                                       src_stride, im_block, im_stride, w, im_h,
+                                       x_filter);
+    } else {
+      convolve_horiz_scale_2_6tap_neon(src - horiz_offset - vert_offset,
+                                       src_stride, im_block, im_stride, w, im_h,
+                                       x_filter);
+    }
   }
 
   // Vertical filter