Add SSE2 variant for MULTITAP_SHARP2 2D-convolve

This CL adds an SSE2 intrinsic version of av1_convolve_2d_sr_c for the
MULTITAP_SHARP2 case, and updates AV1Convolve2DTest to cover the added
intrinsic and to measure module-level gains.

Module-level speedup is ~11.5x w.r.t. the C implementation.
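
The dispatch added in av1/common/x86/convolve_2d_avx2.c routes 12-tap
(MULTITAP_SHARP2) filters to the new SSE2 kernel. A minimal sketch of that
control flow, assuming the wrapper is av1_convolve_2d_sr_avx2 (existing
8-tap body elided):

    void av1_convolve_2d_sr_avx2(const uint8_t *src, int src_stride,
                                 uint8_t *dst, int dst_stride, int w, int h,
                                 const InterpFilterParams *filter_params_x,
                                 const InterpFilterParams *filter_params_y,
                                 const int subpel_x_qn, const int subpel_y_qn,
                                 ConvolveParams *conv_params) {
      if (filter_params_x->taps > 8) {
        if (w < 8) {
          // Narrow blocks: fall back to the C implementation.
          av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
                               filter_params_x, filter_params_y, subpel_x_qn,
                               subpel_y_qn, conv_params);
        } else {
          // w >= 8: use the new 12-tap SSE2 kernel.
          av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
                                        filter_params_x, filter_params_y,
                                        subpel_x_qn, subpel_y_qn, conv_params);
        }
      } else {
        // Existing 4/6/8-tap AVX2 path, unchanged.
      }
    }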

Change-Id: Ie32de0d73ea7cf2cdeb8382afca419b8e069ae13
diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h
index 9e3b73e..9312e9e 100644
--- a/aom_dsp/x86/convolve_common_intrin.h
+++ b/aom_dsp/x86/convolve_common_intrin.h
@@ -110,4 +110,11 @@
                                   const InterpFilterParams *filter_params_x,
                                   int subpel_x_qn, ConvolveParams *conv_params);
 
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   ConvolveParams *conv_params);
+
 #endif  // AOM_AOM_DSP_X86_CONVOLVE_COMMON_INTRIN_H_
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 61523da..530d129 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -947,569 +947,469 @@
                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
-  int im_dst_stride;
-  int width, height;
-#if defined(__aarch64__)
-  uint8x8_t t0;
-  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-  const uint8_t *s;
-#endif
-
-  DECLARE_ALIGNED(16, int16_t,
-                  im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
-
-  const int bd = 8;
-  const int im_h = h + filter_params_y->taps - 1;
-  const int im_stride = MAX_SB_SIZE;
-  const int vert_offset = filter_params_y->taps / 2 - 1;
-  const int horiz_offset = filter_params_x->taps / 2 - 1;
-
-  const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
-
-  int16_t *dst_ptr;
-
-  dst_ptr = im_block;
-  im_dst_stride = im_stride;
-  height = im_h;
-  width = w;
-
-  const int16_t round_bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
-  const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-
-  int16_t x_filter_tmp[8];
-  int16x8_t filter_x_coef = vld1q_s16(x_filter);
-
-  // filter coeffs are even, so downshifting by 1 to reduce intermediate
-  // precision requirements.
-  filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
-  vst1q_s16(&x_filter_tmp[0], filter_x_coef);
-
-  assert(conv_params->round_0 > 0);
-
-  if (w <= 4) {
-    const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
-    const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
-
-#if defined(__aarch64__)
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
-    do {
-      assert(height >= 4);
-      s = src_ptr;
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
-      s += 7;
-
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
-      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                             horiz_const, shift_round_0);
-      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
-                             horiz_const, shift_round_0);
-      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
-                             horiz_const, shift_round_0);
-      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
-                             horiz_const, shift_round_0);
-
-      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-      if (w == 4) {
-        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
-        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
-        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
-        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
-      } else if (w == 2) {
-        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
-                      vreinterpret_u32_s16(d0), 0);
-        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
-                      vreinterpret_u32_s16(d1), 0);
-        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
-                      vreinterpret_u32_s16(d2), 0);
-        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
-                      vreinterpret_u32_s16(d3), 0);
-      }
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * im_dst_stride;
-      height -= 4;
-    } while (height >= 4);
-
-    if (height) {
-      assert(height < 4);
-      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
-                                 height, x_filter_tmp, horiz_const,
-                                 shift_round_0);
-    }
-#else
-    horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
-                               height, x_filter_tmp, horiz_const,
-                               shift_round_0);
-#endif
-
+  if (filter_params_x->taps > 8) {
+    av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                         filter_params_x, filter_params_y, subpel_x_qn,
+                         subpel_y_qn, conv_params);
   } else {
-    const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
-    const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
+    int im_dst_stride;
+    int width, height;
+#if defined(__aarch64__)
+    uint8x8_t t0;
+    uint8x8_t t1, t2, t3, t4, t5, t6, t7;
+    const uint8_t *s;
+#endif
+
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
+
+    const int bd = 8;
+    const int im_h = h + filter_params_y->taps - 1;
+    const int im_stride = MAX_SB_SIZE;
+    const int vert_offset = filter_params_y->taps / 2 - 1;
+    const int horiz_offset = filter_params_x->taps / 2 - 1;
+
+    const uint8_t *src_ptr = src - vert_offset * src_stride - horiz_offset;
+
+    int16_t *dst_ptr;
+
+    dst_ptr = im_block;
+    im_dst_stride = im_stride;
+    height = im_h;
+    width = w;
+
+    const int16_t round_bits =
+        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+    const int16x8_t vec_round_bits = vdupq_n_s16(-round_bits);
+    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
+
+    int16_t x_filter_tmp[8];
+    int16x8_t filter_x_coef = vld1q_s16(x_filter);
+
+    // filter coeffs are even, so downshifting by 1 to reduce intermediate
+    // precision requirements.
+    filter_x_coef = vshrq_n_s16(filter_x_coef, 1);
+    vst1q_s16(&x_filter_tmp[0], filter_x_coef);
+
+    assert(conv_params->round_0 > 0);
+
+    if (w <= 4) {
+      const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
+      const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
 
 #if defined(__aarch64__)
-    int16_t *d_tmp;
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
-    int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
-    do {
-      assert(height >= 8);
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      width = w;
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-
-      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
-
+      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, d0, d1, d2, d3;
       do {
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
+        assert(height >= 4);
+        s = src_ptr;
+        __builtin_prefetch(s + 0 * src_stride);
+        __builtin_prefetch(s + 1 * src_stride);
+        __builtin_prefetch(s + 2 * src_stride);
+        __builtin_prefetch(s + 3 * src_stride);
 
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-
-        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
-                          &res7);
-
-        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
-                      res6, res7);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * im_dst_stride;
-      height -= 8;
-    } while (height >= 8);
-
-    if (height >= 4) {
-      assert(height < 8);
-      int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
-          reg10, reg11, reg12, reg13, reg14;
-      int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
-      int16x8_t out0, out1, out2, out3;
-
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-      width = w;
-
-      do {
         load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
         transpose_u8_8x4(&t0, &t1, &t2, &t3);
 
-        reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-        reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-        reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-        reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-        reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-        reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
 
-        d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
-                           x_filter_tmp);
+        __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+        s += 7;
 
-        d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
-                           x_filter_tmp);
+        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
 
-        d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
-                           x_filter_tmp);
+        s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
 
-        d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
-                           x_filter_tmp);
+        d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                               horiz_const, shift_round_0);
+        d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                               horiz_const, shift_round_0);
+        d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                               horiz_const, shift_round_0);
+        d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
+                               horiz_const, shift_round_0);
 
-        d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
-                           x_filter_tmp);
+        transpose_s16_4x4d(&d0, &d1, &d2, &d3);
+        if (w == 4) {
+          vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
+          vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
+          vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
+          vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
+        } else if (w == 2) {
+          vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
+                        vreinterpret_u32_s16(d0), 0);
+          vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
+                        vreinterpret_u32_s16(d1), 0);
+          vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
+                        vreinterpret_u32_s16(d2), 0);
+          vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
+                        vreinterpret_u32_s16(d3), 0);
+        }
+        src_ptr += 4 * src_stride;
+        dst_ptr += 4 * im_dst_stride;
+        height -= 4;
+      } while (height >= 4);
 
-        d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
-                           x_filter_tmp);
+      if (height) {
+        assert(height < 4);
+        horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride,
+                                   w, height, x_filter_tmp, horiz_const,
+                                   shift_round_0);
+      }
+#else
+      horiz_filter_w4_single_row(src_ptr, src_stride, dst_ptr, im_dst_stride, w,
+                                 height, x_filter_tmp, horiz_const,
+                                 shift_round_0);
+#endif
 
-        d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
-                           x_filter_tmp);
+    } else {
+      const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
+      const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
 
-        d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13, reg14,
-                           x_filter_tmp);
+#if defined(__aarch64__)
+      int16_t *d_tmp;
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14;
+      int16x8_t res0, res1, res2, res3, res4, res5, res6, res7;
+      do {
+        assert(height >= 8);
+        __builtin_prefetch(src_ptr + 0 * src_stride);
+        __builtin_prefetch(src_ptr + 1 * src_stride);
+        __builtin_prefetch(src_ptr + 2 * src_stride);
+        __builtin_prefetch(src_ptr + 3 * src_stride);
+        __builtin_prefetch(src_ptr + 4 * src_stride);
+        __builtin_prefetch(src_ptr + 5 * src_stride);
+        __builtin_prefetch(src_ptr + 6 * src_stride);
+        __builtin_prefetch(src_ptr + 7 * src_stride);
 
-        transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0, &out1,
-                          &out2, &out3);
+        load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6,
+                    &t7);
 
-        out0 = vaddq_s16(out0, horiz_const);
-        out0 = vqrshlq_s16(out0, shift_round_0);
+        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        out1 = vaddq_s16(out1, horiz_const);
-        out1 = vqrshlq_s16(out1, shift_round_0);
+        s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
+        s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
+        s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
+        s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
+        s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
+        s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
+        s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
 
-        out2 = vaddq_s16(out2, horiz_const);
-        out2 = vqrshlq_s16(out2, shift_round_0);
+        width = w;
+        s = src_ptr + 7;
+        d_tmp = dst_ptr;
 
-        out3 = vaddq_s16(out3, horiz_const);
-        out3 = vqrshlq_s16(out3, shift_round_0);
+        __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
+        __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
 
-        store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+        do {
+          load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-        reg0 = reg8;
-        reg1 = reg9;
-        reg2 = reg10;
-        reg3 = reg11;
-        reg4 = reg12;
-        reg5 = reg13;
-        reg6 = reg14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * im_dst_stride;
-      height -= 4;
-    }
+          transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
 
-    if (height) {
-      assert(height < 4);
+          s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
+          s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
+          s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
+          s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
+          s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
+          s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
+          s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
+          s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
+
+          res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
+                                   horiz_const, shift_round_0);
+          res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
+                                   horiz_const, shift_round_0);
+          res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
+                                   horiz_const, shift_round_0);
+          res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10,
+                                   x_filter_tmp, horiz_const, shift_round_0);
+          res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11,
+                                   x_filter_tmp, horiz_const, shift_round_0);
+          res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
+                                   x_filter_tmp, horiz_const, shift_round_0);
+          res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
+                                   x_filter_tmp, horiz_const, shift_round_0);
+          res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
+                                   x_filter_tmp, horiz_const, shift_round_0);
+
+          transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
+                            &res7);
+
+          store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4,
+                        res5, res6, res7);
+
+          s0 = s8;
+          s1 = s9;
+          s2 = s10;
+          s3 = s11;
+          s4 = s12;
+          s5 = s13;
+          s6 = s14;
+          s += 8;
+          d_tmp += 8;
+          width -= 8;
+        } while (width > 0);
+        src_ptr += 8 * src_stride;
+        dst_ptr += 8 * im_dst_stride;
+        height -= 8;
+      } while (height >= 8);
+
+      if (height >= 4) {
+        assert(height < 8);
+        int16x4_t reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+            reg10, reg11, reg12, reg13, reg14;
+        int16x4_t d0, d1, d2, d3, d4, d5, d6, d7;
+        int16x8_t out0, out1, out2, out3;
+
+        __builtin_prefetch(src_ptr + 0 * src_stride);
+        __builtin_prefetch(src_ptr + 1 * src_stride);
+        __builtin_prefetch(src_ptr + 2 * src_stride);
+        __builtin_prefetch(src_ptr + 3 * src_stride);
+
+        load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3);
+        transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+        reg0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        reg1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        reg2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+        reg3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+        reg4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+        reg5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+        reg6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+
+        __builtin_prefetch(dst_ptr + 0 * dst_stride);
+        __builtin_prefetch(dst_ptr + 1 * dst_stride);
+        __builtin_prefetch(dst_ptr + 2 * dst_stride);
+        __builtin_prefetch(dst_ptr + 3 * dst_stride);
+
+        s = src_ptr + 7;
+        d_tmp = dst_ptr;
+        width = w;
+
+        do {
+          load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
+          transpose_u8_8x4(&t0, &t1, &t2, &t3);
+
+          reg7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+          reg8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+          reg9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+          reg10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+          reg11 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
+          reg12 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
+          reg13 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
+          reg14 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
+
+          d0 = convolve8_4x4(reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7,
+                             x_filter_tmp);
+
+          d1 = convolve8_4x4(reg1, reg2, reg3, reg4, reg5, reg6, reg7, reg8,
+                             x_filter_tmp);
+
+          d2 = convolve8_4x4(reg2, reg3, reg4, reg5, reg6, reg7, reg8, reg9,
+                             x_filter_tmp);
+
+          d3 = convolve8_4x4(reg3, reg4, reg5, reg6, reg7, reg8, reg9, reg10,
+                             x_filter_tmp);
+
+          d4 = convolve8_4x4(reg4, reg5, reg6, reg7, reg8, reg9, reg10, reg11,
+                             x_filter_tmp);
+
+          d5 = convolve8_4x4(reg5, reg6, reg7, reg8, reg9, reg10, reg11, reg12,
+                             x_filter_tmp);
+
+          d6 = convolve8_4x4(reg6, reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+                             x_filter_tmp);
+
+          d7 = convolve8_4x4(reg7, reg8, reg9, reg10, reg11, reg12, reg13,
+                             reg14, x_filter_tmp);
+
+          transpose_s16_4x8(&d0, &d1, &d2, &d3, &d4, &d5, &d6, &d7, &out0,
+                            &out1, &out2, &out3);
+
+          out0 = vaddq_s16(out0, horiz_const);
+          out0 = vqrshlq_s16(out0, shift_round_0);
+
+          out1 = vaddq_s16(out1, horiz_const);
+          out1 = vqrshlq_s16(out1, shift_round_0);
+
+          out2 = vaddq_s16(out2, horiz_const);
+          out2 = vqrshlq_s16(out2, shift_round_0);
+
+          out3 = vaddq_s16(out3, horiz_const);
+          out3 = vqrshlq_s16(out3, shift_round_0);
+
+          store_s16_8x4(d_tmp, im_dst_stride, out0, out1, out2, out3);
+
+          reg0 = reg8;
+          reg1 = reg9;
+          reg2 = reg10;
+          reg3 = reg11;
+          reg4 = reg12;
+          reg5 = reg13;
+          reg6 = reg14;
+          s += 8;
+          d_tmp += 8;
+          width -= 8;
+        } while (width > 0);
+        src_ptr += 4 * src_stride;
+        dst_ptr += 4 * im_dst_stride;
+        height -= 4;
+      }
+
+      if (height) {
+        assert(height < 4);
+        horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
+                                   height, x_filter_tmp, horiz_const,
+                                   shift_round_0);
+      }
+#else
+
       horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
                                  height, x_filter_tmp, horiz_const,
                                  shift_round_0);
+#endif
     }
-#else
 
-    horiz_filter_w8_single_row(src_ptr, src_stride, dst_ptr, im_stride, w,
-                               height, x_filter_tmp, horiz_const,
-                               shift_round_0);
-#endif
-  }
+    // vertical
+    {
+      uint8_t *dst_u8_ptr, *d_u8;
+      int16_t *v_src_ptr, *v_s;
 
-  // vertical
-  {
-    uint8_t *dst_u8_ptr, *d_u8;
-    int16_t *v_src_ptr, *v_s;
+      const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
+                                (1 << (offset_bits - conv_params->round_1 - 1));
+      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+          filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-    const int32_t sub_const = (1 << (offset_bits - conv_params->round_1)) +
-                              (1 << (offset_bits - conv_params->round_1 - 1));
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
+      const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
+      const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
+      const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
 
-    const int32x4_t round_shift_vec = vdupq_n_s32(-(conv_params->round_1));
-    const int32x4_t offset_const = vdupq_n_s32(1 << offset_bits);
-    const int32x4_t sub_const_vec = vdupq_n_s32(sub_const);
+      src_stride = im_stride;
+      v_src_ptr = im_block;
+      dst_u8_ptr = dst;
 
-    src_stride = im_stride;
-    v_src_ptr = im_block;
-    dst_u8_ptr = dst;
+      height = h;
+      width = w;
 
-    height = h;
-    width = w;
-
-    if (width <= 4) {
-      int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
-      uint16x4_t d0;
-      uint16x8_t dd0;
-      uint8x8_t d01;
+      if (width <= 4) {
+        int16x4_t s0, s1, s2, s3, s4, s5, s6, s7;
+        uint16x4_t d0;
+        uint16x8_t dd0;
+        uint8x8_t d01;
 
 #if defined(__aarch64__)
-      int16x4_t s8, s9, s10;
-      uint16x4_t d1, d2, d3;
-      uint16x8_t dd1;
-      uint8x8_t d23;
+        int16x4_t s8, s9, s10;
+        uint16x4_t d1, d2, d3;
+        uint16x8_t dd1;
+        uint8x8_t d23;
 #endif
 
-      d_u8 = dst_u8_ptr;
-      v_s = v_src_ptr;
-
-      __builtin_prefetch(v_s + 0 * im_stride);
-      __builtin_prefetch(v_s + 1 * im_stride);
-      __builtin_prefetch(v_s + 2 * im_stride);
-      __builtin_prefetch(v_s + 3 * im_stride);
-      __builtin_prefetch(v_s + 4 * im_stride);
-      __builtin_prefetch(v_s + 5 * im_stride);
-      __builtin_prefetch(v_s + 6 * im_stride);
-      __builtin_prefetch(v_s + 7 * im_stride);
-
-      load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-      v_s += (7 * im_stride);
-
-      do {
-#if defined(__aarch64__)
-        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
-        v_s += (im_stride << 2);
-
-        __builtin_prefetch(d_u8 + 0 * dst_stride);
-        __builtin_prefetch(d_u8 + 1 * dst_stride);
-        __builtin_prefetch(d_u8 + 2 * dst_stride);
-        __builtin_prefetch(d_u8 + 3 * dst_stride);
-
-        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-
-        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
-        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
-
-        d01 = vqmovn_u16(dd0);
-        d23 = vqmovn_u16(dd1);
-
-        if ((w == 4) && (h != 2)) {
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        0);  // 00 01 02 03
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        1);  // 10 11 12 13
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
-                        0);  // 20 21 22 23
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
-                        1);  // 30 31 32 33
-          d_u8 += dst_stride;
-        } else if ((w == 2) && (h != 2)) {
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        0);  // 00 01
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        2);  // 10 11
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
-                        0);  // 20 21
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
-                        2);  // 30 31
-          d_u8 += dst_stride;
-        } else if ((w == 4) && (h == 2)) {
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        0);  // 00 01 02 03
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        1);  // 10 11 12 13
-          d_u8 += dst_stride;
-        } else if ((w == 2) && (h == 2)) {
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        0);  // 00 01
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        2);  // 10 11
-          d_u8 += dst_stride;
-        }
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        height -= 4;
-#else
-        s7 = vld1_s16(v_s);
-        v_s += im_stride;
-
-        __builtin_prefetch(d_u8 + 0 * dst_stride);
-
-        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-
-        dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
-        d01 = vqmovn_u16(dd0);
-
-        if (w == 4) {
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        0);  // 00 01 02 03
-          d_u8 += dst_stride;
-
-        } else if (w == 2) {
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        0);  // 00 01
-          d_u8 += dst_stride;
-        }
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        height -= 1;
-#endif
-      } while (height > 0);
-    } else {
-      // if width is a multiple of 8 & height is a multiple of 4
-      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-      uint8x8_t res0;
-#if defined(__aarch64__)
-      int16x8_t s8, s9, s10;
-      uint8x8_t res1, res2, res3;
-#endif
-
-      do {
-        __builtin_prefetch(v_src_ptr + 0 * im_stride);
-        __builtin_prefetch(v_src_ptr + 1 * im_stride);
-        __builtin_prefetch(v_src_ptr + 2 * im_stride);
-        __builtin_prefetch(v_src_ptr + 3 * im_stride);
-        __builtin_prefetch(v_src_ptr + 4 * im_stride);
-        __builtin_prefetch(v_src_ptr + 5 * im_stride);
-        __builtin_prefetch(v_src_ptr + 6 * im_stride);
-        __builtin_prefetch(v_src_ptr + 7 * im_stride);
-
-        v_s = v_src_ptr;
-        load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
-        v_s += (7 * im_stride);
-
         d_u8 = dst_u8_ptr;
-        height = h;
+        v_s = v_src_ptr;
+
+        __builtin_prefetch(v_s + 0 * im_stride);
+        __builtin_prefetch(v_s + 1 * im_stride);
+        __builtin_prefetch(v_s + 2 * im_stride);
+        __builtin_prefetch(v_s + 3 * im_stride);
+        __builtin_prefetch(v_s + 4 * im_stride);
+        __builtin_prefetch(v_s + 5 * im_stride);
+        __builtin_prefetch(v_s + 6 * im_stride);
+        __builtin_prefetch(v_s + 7 * im_stride);
+
+        load_s16_4x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+        v_s += (7 * im_stride);
 
         do {
 #if defined(__aarch64__)
-          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+          load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
           v_s += (im_stride << 2);
 
-          __builtin_prefetch(d_u8 + 4 * dst_stride);
-          __builtin_prefetch(d_u8 + 5 * dst_stride);
-          __builtin_prefetch(d_u8 + 6 * dst_stride);
-          __builtin_prefetch(d_u8 + 7 * dst_stride);
+          __builtin_prefetch(d_u8 + 0 * dst_stride);
+          __builtin_prefetch(d_u8 + 1 * dst_stride);
+          __builtin_prefetch(d_u8 + 2 * dst_stride);
+          __builtin_prefetch(d_u8 + 3 * dst_stride);
 
-          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
+          d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_shift_vec, offset_const,
+                                      sub_const_vec);
+          d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
+                                      round_shift_vec, offset_const,
+                                      sub_const_vec);
+          d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
+                                      round_shift_vec, offset_const,
+                                      sub_const_vec);
+          d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
+                                      round_shift_vec, offset_const,
+                                      sub_const_vec);
 
-          if (h != 2) {
-            vst1_u8(d_u8, res0);
+          dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
+          dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
+
+          d01 = vqmovn_u16(dd0);
+          d23 = vqmovn_u16(dd1);
+
+          if ((w == 4) && (h != 2)) {
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                          0);  // 00 01 02 03
             d_u8 += dst_stride;
-            vst1_u8(d_u8, res1);
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                          1);  // 10 11 12 13
             d_u8 += dst_stride;
-            vst1_u8(d_u8, res2);
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+                          0);  // 20 21 22 23
             d_u8 += dst_stride;
-            vst1_u8(d_u8, res3);
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
+                          1);  // 30 31 32 33
             d_u8 += dst_stride;
-          } else {
-            vst1_u8(d_u8, res0);
+          } else if ((w == 2) && (h != 2)) {
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                          0);  // 00 01
             d_u8 += dst_stride;
-            vst1_u8(d_u8, res1);
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                          2);  // 10 11
+            d_u8 += dst_stride;
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+                          0);  // 20 21
+            d_u8 += dst_stride;
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
+                          2);  // 30 31
+            d_u8 += dst_stride;
+          } else if ((w == 4) && (h == 2)) {
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                          0);  // 00 01 02 03
+            d_u8 += dst_stride;
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                          1);  // 10 11 12 13
+            d_u8 += dst_stride;
+          } else if ((w == 2) && (h == 2)) {
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                          0);  // 00 01
+            d_u8 += dst_stride;
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                          2);  // 10 11
             d_u8 += dst_stride;
           }
+
           s0 = s4;
           s1 = s5;
           s2 = s6;
@@ -1519,17 +1419,28 @@
           s6 = s10;
           height -= 4;
 #else
-          s7 = vld1q_s16(v_s);
+          s7 = vld1_s16(v_s);
           v_s += im_stride;
 
           __builtin_prefetch(d_u8 + 0 * dst_stride);
 
-          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
+          d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
+                                      round_shift_vec, offset_const,
+                                      sub_const_vec);
 
-          vst1_u8(d_u8, res0);
-          d_u8 += dst_stride;
+          dd0 = vqrshlq_u16(vcombine_u16(d0, d0), vec_round_bits);
+          d01 = vqmovn_u16(dd0);
+
+          if (w == 4) {
+            vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
+                          0);  // 00 01 02 03
+            d_u8 += dst_stride;
+
+          } else if (w == 2) {
+            vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
+                          0);  // 00 01
+            d_u8 += dst_stride;
+          }
 
           s0 = s1;
           s1 = s2;
@@ -1541,10 +1452,106 @@
           height -= 1;
 #endif
         } while (height > 0);
-        v_src_ptr += 8;
-        dst_u8_ptr += 8;
-        w -= 8;
-      } while (w > 0);
+      } else {
+        // if width is a multiple of 8 & height is a multiple of 4
+        int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+        uint8x8_t res0;
+#if defined(__aarch64__)
+        int16x8_t s8, s9, s10;
+        uint8x8_t res1, res2, res3;
+#endif
+
+        do {
+          __builtin_prefetch(v_src_ptr + 0 * im_stride);
+          __builtin_prefetch(v_src_ptr + 1 * im_stride);
+          __builtin_prefetch(v_src_ptr + 2 * im_stride);
+          __builtin_prefetch(v_src_ptr + 3 * im_stride);
+          __builtin_prefetch(v_src_ptr + 4 * im_stride);
+          __builtin_prefetch(v_src_ptr + 5 * im_stride);
+          __builtin_prefetch(v_src_ptr + 6 * im_stride);
+          __builtin_prefetch(v_src_ptr + 7 * im_stride);
+
+          v_s = v_src_ptr;
+          load_s16_8x8(v_s, im_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);
+          v_s += (7 * im_stride);
+
+          d_u8 = dst_u8_ptr;
+          height = h;
+
+          do {
+#if defined(__aarch64__)
+            load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
+            v_s += (im_stride << 2);
+
+            __builtin_prefetch(d_u8 + 4 * dst_stride);
+            __builtin_prefetch(d_u8 + 5 * dst_stride);
+            __builtin_prefetch(d_u8 + 6 * dst_stride);
+            __builtin_prefetch(d_u8 + 7 * dst_stride);
+
+            res0 = convolve8_vert_8x4_s32(
+                s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_shift_vec,
+                offset_const, sub_const_vec, vec_round_bits);
+            res1 = convolve8_vert_8x4_s32(
+                s1, s2, s3, s4, s5, s6, s7, s8, y_filter, round_shift_vec,
+                offset_const, sub_const_vec, vec_round_bits);
+            res2 = convolve8_vert_8x4_s32(
+                s2, s3, s4, s5, s6, s7, s8, s9, y_filter, round_shift_vec,
+                offset_const, sub_const_vec, vec_round_bits);
+            res3 = convolve8_vert_8x4_s32(
+                s3, s4, s5, s6, s7, s8, s9, s10, y_filter, round_shift_vec,
+                offset_const, sub_const_vec, vec_round_bits);
+
+            if (h != 2) {
+              vst1_u8(d_u8, res0);
+              d_u8 += dst_stride;
+              vst1_u8(d_u8, res1);
+              d_u8 += dst_stride;
+              vst1_u8(d_u8, res2);
+              d_u8 += dst_stride;
+              vst1_u8(d_u8, res3);
+              d_u8 += dst_stride;
+            } else {
+              vst1_u8(d_u8, res0);
+              d_u8 += dst_stride;
+              vst1_u8(d_u8, res1);
+              d_u8 += dst_stride;
+            }
+            s0 = s4;
+            s1 = s5;
+            s2 = s6;
+            s3 = s7;
+            s4 = s8;
+            s5 = s9;
+            s6 = s10;
+            height -= 4;
+#else
+            s7 = vld1q_s16(v_s);
+            v_s += im_stride;
+
+            __builtin_prefetch(d_u8 + 0 * dst_stride);
+
+            res0 = convolve8_vert_8x4_s32(
+                s0, s1, s2, s3, s4, s5, s6, s7, y_filter, round_shift_vec,
+                offset_const, sub_const_vec, vec_round_bits);
+
+            vst1_u8(d_u8, res0);
+            d_u8 += dst_stride;
+
+            s0 = s1;
+            s1 = s2;
+            s2 = s3;
+            s3 = s4;
+            s4 = s5;
+            s5 = s6;
+            s6 = s7;
+            height -= 1;
+#endif
+          } while (height > 0);
+          v_src_ptr += 8;
+          dst_u8_ptr += 8;
+          w -= 8;
+        } while (w > 0);
+      }
     }
   }
 }
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 391c063..7f01e36 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -548,16 +548,8 @@
                       subpel_y_qn);
   } else {
     assert(need_x && need_y);
-
-    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
-      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
-                           filter_params_x, filter_params_y, subpel_x_qn,
-                           subpel_y_qn, conv_params);
-    } else {
-      av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h,
-                         filter_params_x, filter_params_y, subpel_x_qn,
-                         subpel_y_qn, conv_params);
-    }
+    av1_convolve_2d_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
+                       filter_params_y, subpel_x_qn, subpel_y_qn, conv_params);
   }
 }
 
diff --git a/av1/common/filter.h b/av1/common/filter.h
index 56196aa..ded5ce5 100644
--- a/av1/common/filter.h
+++ b/av1/common/filter.h
@@ -25,7 +25,7 @@
 extern "C" {
 #endif
 
-#define MAX_FILTER_TAP 8
+#define MAX_FILTER_TAP 12
 
 typedef enum ATTRIBUTE_PACKED {
   EIGHTTAP_REGULAR,
diff --git a/av1/common/x86/convolve_2d_avx2.c b/av1/common/x86/convolve_2d_avx2.c
index 211f258..f78a7d0 100644
--- a/av1/common/x86/convolve_2d_avx2.c
+++ b/av1/common/x86/convolve_2d_avx2.c
@@ -26,87 +26,100 @@
                              const InterpFilterParams *filter_params_y,
                              const int subpel_x_qn, const int subpel_y_qn,
                              ConvolveParams *conv_params) {
-  const int bd = 8;
-  int im_stride = 8, i;
-  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
-  const int bits =
-      FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
-  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+  if (filter_params_x->taps > 8) {
+    if (w < 8) {
+      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                           filter_params_x, filter_params_y, subpel_x_qn,
+                           subpel_y_qn, conv_params);
+    } else {
+      av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params);
+    }
+  } else {
+    const int bd = 8;
+    int im_stride = 8, i;
+    DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
+    const int bits =
+        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
 
-  assert(conv_params->round_0 > 0);
+    assert(conv_params->round_0 > 0);
 
-  const __m256i round_const_h = _mm256_set1_epi16(
-      ((1 << (conv_params->round_0 - 1)) >> 1) + (1 << (bd + FILTER_BITS - 2)));
-  const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
+    const __m256i round_const_h =
+        _mm256_set1_epi16(((1 << (conv_params->round_0 - 1)) >> 1) +
+                          (1 << (bd + FILTER_BITS - 2)));
+    const __m128i round_shift_h = _mm_cvtsi32_si128(conv_params->round_0 - 1);
 
-  const __m256i sum_round_v = _mm256_set1_epi32(
-      (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
-  const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
+    const __m256i sum_round_v = _mm256_set1_epi32(
+        (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+    const __m128i sum_shift_v = _mm_cvtsi32_si128(conv_params->round_1);
 
-  const __m256i round_const_v = _mm256_set1_epi32(
-      ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
-      ((1 << (offset_bits - conv_params->round_1)) >> 1));
-  const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
+    const __m256i round_const_v = _mm256_set1_epi32(
+        ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+        ((1 << (offset_bits - conv_params->round_1)) >> 1));
+    const __m128i round_shift_v = _mm_cvtsi32_si128(bits);
 
-  __m256i filt[4], coeffs_h[4], coeffs_v[4];
+    __m256i filt[4], coeffs_h[4], coeffs_v[4];
 
-  filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
-  filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
+    filt[0] = _mm256_load_si256((__m256i const *)(filt_global_avx2));
+    filt[1] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32));
 
-  prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
-
-  const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
-      filter_params_x, subpel_x_qn & SUBPEL_MASK);
-  const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-
-  int horiz_tap = SUBPEL_TAPS;
-  int vert_tap = SUBPEL_TAPS;
-
-  if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
-    horiz_tap = 4;
-  else if (!(filter_x[0] | filter_x[7]))
-    horiz_tap = 6;
-
-  if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
-    vert_tap = 4;
-  else if (!(filter_y[0] | filter_y[7]))
-    vert_tap = 6;
-
-  if (horiz_tap == 6)
-    prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-  else
     prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
-
-  if (vert_tap == 6)
-    prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
-  else
     prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
 
-  int im_h = h + vert_tap - 1;
-  const int fo_vert = vert_tap / 2 - 1;
-  const int fo_horiz = horiz_tap / 2 - 1;
-  const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+    const int16_t *const filter_x = av1_get_interp_filter_subpel_kernel(
+        filter_params_x, subpel_x_qn & SUBPEL_MASK);
+    const int16_t *const filter_y = av1_get_interp_filter_subpel_kernel(
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
 
-  filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
-  filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+    int horiz_tap = SUBPEL_TAPS;
+    int vert_tap = SUBPEL_TAPS;
 
-  for (int j = 0; j < w; j += 8) {
-    if (horiz_tap == 4) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
-    } else if (horiz_tap == 6) {
-      CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
-    } else {
-      CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
-    }
+    if (!(filter_x[0] | filter_x[1] | filter_x[6] | filter_x[7]))
+      horiz_tap = 4;
+    else if (!(filter_x[0] | filter_x[7]))
+      horiz_tap = 6;
 
-    if (vert_tap == 4) {
-      CONVOLVE_SR_VERTICAL_FILTER_4TAP
-    } else if (vert_tap == 6) {
-      CONVOLVE_SR_VERTICAL_FILTER_6TAP
-    } else {
-      CONVOLVE_SR_VERTICAL_FILTER_8TAP
+    if (!(filter_y[0] | filter_y[1] | filter_y[6] | filter_y[7]))
+      vert_tap = 4;
+    else if (!(filter_y[0] | filter_y[7]))
+      vert_tap = 6;
+
+    if (horiz_tap == 6)
+      prepare_coeffs_6t_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+    else
+      prepare_coeffs_lowbd(filter_params_x, subpel_x_qn, coeffs_h);
+
+    if (vert_tap == 6)
+      prepare_coeffs_6t(filter_params_y, subpel_y_qn, coeffs_v);
+    else
+      prepare_coeffs(filter_params_y, subpel_y_qn, coeffs_v);
+
+    int im_h = h + vert_tap - 1;
+    const int fo_vert = vert_tap / 2 - 1;
+    const int fo_horiz = horiz_tap / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    filt[2] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 2));
+    filt[3] = _mm256_load_si256((__m256i const *)(filt_global_avx2 + 32 * 3));
+
+    for (int j = 0; j < w; j += 8) {
+      if (horiz_tap == 4) {
+        CONVOLVE_SR_HORIZONTAL_FILTER_4TAP
+      } else if (horiz_tap == 6) {
+        CONVOLVE_SR_HORIZONTAL_FILTER_6TAP
+      } else {
+        CONVOLVE_SR_HORIZONTAL_FILTER_8TAP
+      }
+
+      if (vert_tap == 4) {
+        CONVOLVE_SR_VERTICAL_FILTER_4TAP
+      } else if (vert_tap == 6) {
+        CONVOLVE_SR_VERTICAL_FILTER_6TAP
+      } else {
+        CONVOLVE_SR_VERTICAL_FILTER_8TAP
+      }
     }
   }
 }
diff --git a/av1/common/x86/convolve_2d_sse2.c b/av1/common/x86/convolve_2d_sse2.c
index 1db9853..ca88bd7 100644
--- a/av1/common/x86/convolve_2d_sse2.c
+++ b/av1/common/x86/convolve_2d_sse2.c
@@ -16,20 +16,21 @@
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
 #include "aom_dsp/x86/convolve_sse2.h"
+#include "aom_dsp/x86/convolve_common_intrin.h"
 #include "av1/common/convolve.h"
 
-void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                             int dst_stride, int w, int h,
-                             const InterpFilterParams *filter_params_x,
-                             const InterpFilterParams *filter_params_y,
-                             const int subpel_x_qn, const int subpel_y_qn,
-                             ConvolveParams *conv_params) {
+void av1_convolve_2d_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride, int w, int h,
+                                   const InterpFilterParams *filter_params_x,
+                                   const InterpFilterParams *filter_params_y,
+                                   const int subpel_x_qn, const int subpel_y_qn,
+                                   ConvolveParams *conv_params) {
   const int bd = 8;
 
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
   int im_h = h + filter_params_y->taps - 1;
-  int im_stride = MAX_SB_SIZE;
+  int im_stride = w;
   int i, j;
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const int fo_horiz = filter_params_x->taps / 2 - 1;
@@ -41,26 +42,11 @@
   const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
 
   assert(conv_params->round_0 > 0);
+  __m128i coeffs[6];
 
   /* Horizontal filter */
   {
-    const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_x, subpel_x_qn & SUBPEL_MASK);
-    const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+    prepare_coeffs_12tap(filter_params_x, subpel_x_qn, coeffs);
 
     const __m128i round_const = _mm_set1_epi32(
         (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
@@ -70,34 +56,54 @@
       for (j = 0; j < w; j += 8) {
         const __m128i data =
             _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i data_2 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 4)]);
 
         // Filter even-index pixels
         const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
         const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+        const __m128i src_4 = _mm_unpacklo_epi8(data_2, zero);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+        const __m128i src_6 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 2), zero);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+        const __m128i src_8 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 4), zero);
+        const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+        const __m128i src_10 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 6), zero);
+        const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
 
-        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
-                                         _mm_add_epi32(res_2, res_6));
+        const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                               _mm_add_epi32(res_2, res_6));
+        __m128i res_even =
+            _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
         res_even =
             _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
 
         // Filter odd-index pixels
         const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
         const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+        const __m128i src_5 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 1), zero);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+        const __m128i src_7 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 3), zero);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+        const __m128i src_9 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 5), zero);
+        const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+        const __m128i src_11 =
+            _mm_unpacklo_epi8(_mm_srli_si128(data_2, 7), zero);
+        const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
 
-        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
-                                        _mm_add_epi32(res_3, res_7));
+        const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                               _mm_add_epi32(res_3, res_7));
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
         res_odd =
             _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
 
@@ -110,23 +116,7 @@
 
   /* Vertical filter */
   {
-    const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
-        filter_params_y, subpel_y_qn & SUBPEL_MASK);
-    const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
-
-    // coeffs 0 1 0 1 2 3 2 3
-    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
-    // coeffs 4 5 4 5 6 7 6 7
-    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
-
-    // coeffs 0 1 0 1 0 1 0 1
-    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
-    // coeffs 2 3 2 3 2 3 2 3
-    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
-    // coeffs 4 5 4 5 4 5 4 5
-    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
-    // coeffs 6 7 6 7 6 7 6 7
-    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+    prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
 
     const __m128i sum_round =
         _mm_set1_epi32((1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
@@ -153,14 +143,24 @@
         const __m128i src_6 =
             _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
                                *(__m128i *)(data + 7 * im_stride));
+        const __m128i src_8 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 8 * im_stride),
+                               *(__m128i *)(data + 9 * im_stride));
+        const __m128i src_10 =
+            _mm_unpacklo_epi16(*(__m128i *)(data + 10 * im_stride),
+                               *(__m128i *)(data + 11 * im_stride));
 
-        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
-        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
-        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
-        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+        const __m128i res_0 = _mm_madd_epi16(src_0, coeffs[0]);
+        const __m128i res_2 = _mm_madd_epi16(src_2, coeffs[1]);
+        const __m128i res_4 = _mm_madd_epi16(src_4, coeffs[2]);
+        const __m128i res_6 = _mm_madd_epi16(src_6, coeffs[3]);
+        const __m128i res_8 = _mm_madd_epi16(src_8, coeffs[4]);
+        const __m128i res_10 = _mm_madd_epi16(src_10, coeffs[5]);
 
-        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+        const __m128i res_0246 = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                                _mm_add_epi32(res_4, res_6));
+        __m128i res_even =
+            _mm_add_epi32(_mm_add_epi32(res_8, res_10), res_0246);
 
         // Filter odd-index pixels
         const __m128i src_1 =
@@ -175,14 +175,23 @@
         const __m128i src_7 =
             _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
                                *(__m128i *)(data + 7 * im_stride));
+        const __m128i src_9 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 8 * im_stride),
+                               *(__m128i *)(data + 9 * im_stride));
+        const __m128i src_11 =
+            _mm_unpackhi_epi16(*(__m128i *)(data + 10 * im_stride),
+                               *(__m128i *)(data + 11 * im_stride));
 
-        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
-        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
-        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
-        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+        const __m128i res_1 = _mm_madd_epi16(src_1, coeffs[0]);
+        const __m128i res_3 = _mm_madd_epi16(src_3, coeffs[1]);
+        const __m128i res_5 = _mm_madd_epi16(src_5, coeffs[2]);
+        const __m128i res_7 = _mm_madd_epi16(src_7, coeffs[3]);
+        const __m128i res_9 = _mm_madd_epi16(src_9, coeffs[4]);
+        const __m128i res_11 = _mm_madd_epi16(src_11, coeffs[5]);
 
-        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
-                                              _mm_add_epi32(res_5, res_7));
+        const __m128i res_1357 = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                               _mm_add_epi32(res_3, res_7));
+        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_9, res_11), res_1357);
 
         // Rearrange pixels back into the order 0 ... 7
         const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
@@ -204,12 +213,223 @@
         // Accumulate values into the destination buffer
         __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
 
-        if (w == 2) {
-          *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
-        } else if (w == 4) {
-          *(uint32_t *)p = _mm_cvtsi128_si32(res);
-        } else {
-          _mm_storel_epi64(p, res);
+        _mm_storel_epi64(p, res);
+      }
+    }
+  }
+}
+
+void av1_convolve_2d_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                             int dst_stride, int w, int h,
+                             const InterpFilterParams *filter_params_x,
+                             const InterpFilterParams *filter_params_y,
+                             const int subpel_x_qn, const int subpel_y_qn,
+                             ConvolveParams *conv_params) {
+  if (filter_params_x->taps > 8) {
+    if (w < 8) {
+      av1_convolve_2d_sr_c(src, src_stride, dst, dst_stride, w, h,
+                           filter_params_x, filter_params_y, subpel_x_qn,
+                           subpel_y_qn, conv_params);
+    } else {
+      av1_convolve_2d_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                    filter_params_x, filter_params_y,
+                                    subpel_x_qn, subpel_y_qn, conv_params);
+    }
+  } else {
+    const int bd = 8;
+
+    DECLARE_ALIGNED(16, int16_t,
+                    im_block[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE]);
+    int im_h = h + filter_params_y->taps - 1;
+    int im_stride = MAX_SB_SIZE;
+    int i, j;
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const int fo_horiz = filter_params_x->taps / 2 - 1;
+    const uint8_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;
+
+    const __m128i zero = _mm_setzero_si128();
+    const int bits =
+        FILTER_BITS * 2 - conv_params->round_0 - conv_params->round_1;
+    const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
+
+    assert(conv_params->round_0 > 0);
+
+    /* Horizontal filter */
+    {
+      const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
+          filter_params_x, subpel_x_qn & SUBPEL_MASK);
+      const __m128i coeffs_x = _mm_loadu_si128((__m128i *)x_filter);
+
+      // coeffs 0 1 0 1 2 3 2 3
+      const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
+      // coeffs 4 5 4 5 6 7 6 7
+      const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);
+
+      // coeffs 0 1 0 1 0 1 0 1
+      const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+      // coeffs 2 3 2 3 2 3 2 3
+      const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+      // coeffs 4 5 4 5 4 5 4 5
+      const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+      // coeffs 6 7 6 7 6 7 6 7
+      const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+      const __m128i round_const = _mm_set1_epi32(
+          (1 << (bd + FILTER_BITS - 1)) + ((1 << conv_params->round_0) >> 1));
+      const __m128i round_shift = _mm_cvtsi32_si128(conv_params->round_0);
+
+      for (i = 0; i < im_h; ++i) {
+        for (j = 0; j < w; j += 8) {
+          const __m128i data =
+              _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+
+          // Filter even-index pixels
+          const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
+          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+          const __m128i src_2 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
+          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+          const __m128i src_4 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
+          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+          const __m128i src_6 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
+          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
+                                           _mm_add_epi32(res_2, res_6));
+          res_even =
+              _mm_sra_epi32(_mm_add_epi32(res_even, round_const), round_shift);
+
+          // Filter odd-index pixels
+          const __m128i src_1 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
+          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+          const __m128i src_3 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
+          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+          const __m128i src_5 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
+          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+          const __m128i src_7 =
+              _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
+          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
+                                          _mm_add_epi32(res_3, res_7));
+          res_odd =
+              _mm_sra_epi32(_mm_add_epi32(res_odd, round_const), round_shift);
+
+          // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
+          __m128i res = _mm_packs_epi32(res_even, res_odd);
+          _mm_storeu_si128((__m128i *)&im_block[i * im_stride + j], res);
+        }
+      }
+    }
+
+    /* Vertical filter */
+    {
+      const int16_t *y_filter = av1_get_interp_filter_subpel_kernel(
+          filter_params_y, subpel_y_qn & SUBPEL_MASK);
+      const __m128i coeffs_y = _mm_loadu_si128((__m128i *)y_filter);
+
+      // coeffs 0 1 0 1 2 3 2 3
+      const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
+      // coeffs 4 5 4 5 6 7 6 7
+      const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);
+
+      // coeffs 0 1 0 1 0 1 0 1
+      const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
+      // coeffs 2 3 2 3 2 3 2 3
+      const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
+      // coeffs 4 5 4 5 4 5 4 5
+      const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
+      // coeffs 6 7 6 7 6 7 6 7
+      const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);
+
+      const __m128i sum_round = _mm_set1_epi32(
+          (1 << offset_bits) + ((1 << conv_params->round_1) >> 1));
+      const __m128i sum_shift = _mm_cvtsi32_si128(conv_params->round_1);
+
+      const __m128i round_const = _mm_set1_epi32(
+          ((1 << bits) >> 1) - (1 << (offset_bits - conv_params->round_1)) -
+          ((1 << (offset_bits - conv_params->round_1)) >> 1));
+      const __m128i round_shift = _mm_cvtsi32_si128(bits);
+
+      for (i = 0; i < h; ++i) {
+        for (j = 0; j < w; j += 8) {
+          // Filter even-index pixels
+          const int16_t *data = &im_block[i * im_stride + j];
+          const __m128i src_0 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 0 * im_stride),
+                                 *(__m128i *)(data + 1 * im_stride));
+          const __m128i src_2 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 2 * im_stride),
+                                 *(__m128i *)(data + 3 * im_stride));
+          const __m128i src_4 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 4 * im_stride),
+                                 *(__m128i *)(data + 5 * im_stride));
+          const __m128i src_6 =
+              _mm_unpacklo_epi16(*(__m128i *)(data + 6 * im_stride),
+                                 *(__m128i *)(data + 7 * im_stride));
+
+          const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
+          const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
+          const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
+          const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);
+
+          const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
+                                                 _mm_add_epi32(res_4, res_6));
+
+          // Filter odd-index pixels
+          const __m128i src_1 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 0 * im_stride),
+                                 *(__m128i *)(data + 1 * im_stride));
+          const __m128i src_3 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 2 * im_stride),
+                                 *(__m128i *)(data + 3 * im_stride));
+          const __m128i src_5 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 4 * im_stride),
+                                 *(__m128i *)(data + 5 * im_stride));
+          const __m128i src_7 =
+              _mm_unpackhi_epi16(*(__m128i *)(data + 6 * im_stride),
+                                 *(__m128i *)(data + 7 * im_stride));
+
+          const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
+          const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
+          const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
+          const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);
+
+          const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
+                                                _mm_add_epi32(res_5, res_7));
+
+          // Rearrange pixels back into the order 0 ... 7
+          const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+          const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+          __m128i res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, sum_round), sum_shift);
+          __m128i res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, sum_round), sum_shift);
+
+          res_lo_round = _mm_sra_epi32(_mm_add_epi32(res_lo_round, round_const),
+                                       round_shift);
+          res_hi_round = _mm_sra_epi32(_mm_add_epi32(res_hi_round, round_const),
+                                       round_shift);
+
+          const __m128i res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          const __m128i res = _mm_packus_epi16(res16, res16);
+
+          // Accumulate values into the destination buffer
+          __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
+
+          if (w == 2) {
+            *(uint16_t *)p = (uint16_t)_mm_cvtsi128_si32(res);
+          } else if (w == 4) {
+            *(uint32_t *)p = _mm_cvtsi128_si32(res);
+          } else {
+            _mm_storel_epi64(p, res);
+          }
         }
       }
     }
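
Note (illustration, not part of the patch): av1_convolve_2d_sr_12tap_sse2
produces eight output pixels per inner-loop iteration, which is why the
wrapper above falls back to av1_convolve_2d_sr_c for w < 8. Below is a rough
scalar sketch of the per-pixel arithmetic the vector code implements, built
from the same rounding constants visible in the diff (FILTER_BITS is 7 in
libaom and bd is 8 here); the 12x12 'window' argument is a hypothetical source
patch whose top-left pixel sits fo_vert rows above and fo_horiz columns to the
left of the output position:

    #include <stdint.h>

    static uint8_t convolve_2d_12tap_pixel(const uint8_t window[12][12],
                                           const int16_t fx[12],
                                           const int16_t fy[12], int round_0,
                                           int round_1) {
      const int bd = 8, filter_bits = 7;  /* FILTER_BITS */
      const int bits = 2 * filter_bits - round_0 - round_1;
      const int offset_bits = bd + 2 * filter_bits - round_0;
      int32_t im[12];  /* horizontal pass; values fit in 16 bits */
      for (int r = 0; r < 12; ++r) {
        int32_t sum = (1 << (bd + filter_bits - 1)) + ((1 << round_0) >> 1);
        for (int k = 0; k < 12; ++k) sum += fx[k] * window[r][k];
        im[r] = sum >> round_0;
      }
      /* Vertical pass: add the offset, round by round_1, remove the offset
       * while rounding by the remaining 'bits', then clamp as packus does. */
      int32_t sum = (1 << offset_bits) + ((1 << round_1) >> 1);
      for (int k = 0; k < 12; ++k) sum += fy[k] * im[k];
      int32_t res = sum >> round_1;
      res += ((1 << bits) >> 1) - (1 << (offset_bits - round_1)) -
             ((1 << (offset_bits - round_1)) >> 1);
      res >>= bits;
      return (uint8_t)(res < 0 ? 0 : (res > 255 ? 255 : res));
    }
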
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 3ae4504..b764e4d 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -763,6 +763,32 @@
     }
   }
 
+ public:
+  void SpeedTest() {
+    for (int h_f = EIGHTTAP_REGULAR; h_f < INTERP_FILTERS_ALL; ++h_f) {
+      for (int v_f = EIGHTTAP_REGULAR; v_f < INTERP_FILTERS_ALL; ++v_f) {
+        TestConvolveSpeed(static_cast<InterpFilter>(h_f),
+                          static_cast<InterpFilter>(v_f), 10000);
+      }
+    }
+  }
+
+ public:
+  void RunTest12Tap() {
+    for (int sub_x = 0; sub_x < 16; ++sub_x) {
+      for (int sub_y = 0; sub_y < 16; ++sub_y) {
+        TestConvolve(static_cast<InterpFilter>(MULTITAP_SHARP2),
+                     static_cast<InterpFilter>(MULTITAP_SHARP2), sub_x, sub_y);
+      }
+    }
+  }
+
+ public:
+  void SpeedTest12Tap() {
+    TestConvolveSpeed(static_cast<InterpFilter>(MULTITAP_SHARP2),
+                      static_cast<InterpFilter>(MULTITAP_SHARP2), 10000);
+  }
+
  private:
   void TestConvolve(const InterpFilter h_f, const InterpFilter v_f,
                     const int sub_x, const int sub_y) {
@@ -775,9 +801,9 @@
     const uint8_t *input = FirstRandomInput8(GetParam());
     DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
     ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
-    av1_convolve_2d_sr(input, width, reference, kOutputStride, width, height,
-                       filter_params_x, filter_params_y, sub_x, sub_y,
-                       &conv_params1);
+    av1_convolve_2d_sr_c(input, width, reference, kOutputStride, width, height,
+                         filter_params_x, filter_params_y, sub_x, sub_y,
+                         &conv_params1);
     DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
     ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
     GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
@@ -785,10 +811,51 @@
                               &conv_params2);
     AssertOutputBufferEq(reference, test, width, height);
   }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter h_f, const InterpFilter v_f,
+                         int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+    const InterpFilterParams *filter_params_x =
+        av1_get_interp_filter_params_with_block_size(h_f, width);
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(v_f, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+    ConvolveParams conv_params1 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_convolve_2d_sr_c(input, width, reference, kOutputStride, width,
+                           height, filter_params_x, filter_params_y, 0, 0,
+                           &conv_params1);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+    ConvolveParams conv_params2 = get_conv_params_no_round(0, 0, NULL, 0, 0, 8);
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      GetParam().TestFunction()(input, width, test, kOutputStride, width,
+                                height, filter_params_x, filter_params_y, 0, 0,
+                                &conv_params2);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d - %d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", h_f, v_f, width, height,
+           time1, time2, time1 / time2);
+  }
 };
 
 TEST_P(AV1Convolve2DTest, RunTest) { RunTest(); }
 
+TEST_P(AV1Convolve2DTest, RunTest12Tap) { RunTest12Tap(); }
+
+TEST_P(AV1Convolve2DTest, DISABLED_SpeedTest) { SpeedTest(); }
+
+TEST_P(AV1Convolve2DTest, DISABLED_SpeedTest12Tap) { SpeedTest12Tap(); }
+
 INSTANTIATE_TEST_SUITE_P(C, AV1Convolve2DTest,
                          BuildLowbdParams(av1_convolve_2d_sr_c));
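
Note (illustration, not part of the patch): RunTest12Tap runs as part of the
normal suite, while the two speed tests are disabled by default. Assuming a
standard CMake build that produces the test_libaom binary, they can be run
with something like

    ./test_libaom --gtest_also_run_disabled_tests \
        --gtest_filter='*AV1Convolve2DTest.DISABLED_SpeedTest12Tap*'

to compare the new SSE2 path against av1_convolve_2d_sr_c for the
MULTITAP_SHARP2 filters.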