Add SSE2 variant for MULTITAP_SHARP2 y-convolve

This CL adds SSE2 intrinsics for the MULTITAP_SHARP2 case of
av1_convolve_y_sr_c, and updates AV1ConvolveYTest to cover the
added intrinsic and to measure the module-level gains.

Module-level performance improved by a factor of ~11.5x w.r.t. the C
implementation.

Change-Id: I09e274344f62fd1eafbc620944580af6b3bcd58f
diff --git a/aom_dsp/x86/convolve_common_intrin.h b/aom_dsp/x86/convolve_common_intrin.h
index 114828d..9e3b73e 100644
--- a/aom_dsp/x86/convolve_common_intrin.h
+++ b/aom_dsp/x86/convolve_common_intrin.h
@@ -74,6 +74,37 @@
   return convolve_12tap(ss, coeffs);
 }
 
+static INLINE __m128i convolve_lo_y_12tap(const __m128i *s,
+                                          const __m128i *coeffs) {
+  __m128i ss[6];
+  const __m128i zero = _mm_setzero_si128();
+  ss[0] = _mm_unpacklo_epi8(s[0], zero);
+  ss[1] = _mm_unpacklo_epi8(s[2], zero);
+  ss[2] = _mm_unpacklo_epi8(s[4], zero);
+  ss[3] = _mm_unpacklo_epi8(s[6], zero);
+  ss[4] = _mm_unpacklo_epi8(s[8], zero);
+  ss[5] = _mm_unpacklo_epi8(s[10], zero);
+  return convolve_12tap(ss, coeffs);
+}
+
+static INLINE __m128i convolve_hi_y_12tap(const __m128i *s,
+                                          const __m128i *coeffs) {
+  __m128i ss[6];
+  const __m128i zero = _mm_setzero_si128();
+  ss[0] = _mm_unpackhi_epi8(s[0], zero);
+  ss[1] = _mm_unpackhi_epi8(s[2], zero);
+  ss[2] = _mm_unpackhi_epi8(s[4], zero);
+  ss[3] = _mm_unpackhi_epi8(s[6], zero);
+  ss[4] = _mm_unpackhi_epi8(s[8], zero);
+  ss[5] = _mm_unpackhi_epi8(s[10], zero);
+  return convolve_12tap(ss, coeffs);
+}
+
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  int subpel_y_qn);
+
 void av1_convolve_x_sr_12tap_sse2(const uint8_t *src, int src_stride,
                                   uint8_t *dst, int dst_stride, int w, int h,
                                   const InterpFilterParams *filter_params_x,
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 7954ad1..391c063 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -544,13 +544,8 @@
     av1_convolve_x_sr(src, src_stride, dst, dst_stride, w, h, filter_params_x,
                       subpel_x_qn, conv_params);
   } else if (!need_x && need_y) {
-    if (filter_params_x->taps > 8 || filter_params_y->taps > 8) {
-      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
-                          filter_params_y, subpel_y_qn);
-    } else {
-      av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
-                        subpel_y_qn);
-    }
+    av1_convolve_y_sr(src, src_stride, dst, dst_stride, w, h, filter_params_y,
+                      subpel_y_qn);
   } else {
     assert(need_x && need_y);
 
diff --git a/av1/common/x86/convolve_avx2.c b/av1/common/x86/convolve_avx2.c
index ed5fafc..3eeae67 100644
--- a/av1/common/x86/convolve_avx2.c
+++ b/av1/common/x86/convolve_avx2.c
@@ -22,341 +22,351 @@
                             int dst_stride, int w, int h,
                             const InterpFilterParams *filter_params_y,
                             const int subpel_y_qn) {
-  int i, j, vert_tap = SUBPEL_TAPS;
-  // right shift is F-1 because we are already dividing
-  // filter co-efficients by 2
-  const int right_shift_bits = (FILTER_BITS - 1);
-  const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
-  const __m256i right_shift_const =
-      _mm256_set1_epi16((1 << right_shift_bits) >> 1);
+  if (filter_params_y->taps > 8) {
+    if (w < 4 || h < 4) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_y, subpel_y_qn);
+    } else {
+      av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn);
+    }
+  } else {
+    int i, j, vert_tap = SUBPEL_TAPS;
+    // right shift is F-1 because we are already dividing
+    // filter co-efficients by 2
+    const int right_shift_bits = (FILTER_BITS - 1);
+    const __m128i right_shift = _mm_cvtsi32_si128(right_shift_bits);
+    const __m256i right_shift_const =
+        _mm256_set1_epi16((1 << right_shift_bits) >> 1);
 
-  __m256i coeffs[4], s[8];
-  __m128i d[6];
+    __m256i coeffs[4], s[8];
+    __m128i d[6];
 
-  // Condition for checking valid vert_filt taps
-  const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
-      filter_params_y, subpel_y_qn & SUBPEL_MASK);
-  if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
-    vert_tap = 4;
-  } else if (!(filter[0] | filter[7])) {
-    vert_tap = 6;
-  }
+    // Condition for checking valid vert_filt taps
+    const int16_t *const filter = av1_get_interp_filter_subpel_kernel(
+        filter_params_y, subpel_y_qn & SUBPEL_MASK);
+    if (!(filter[0] | filter[1] | filter[6] | filter[7])) {
+      vert_tap = 4;
+    } else if (!(filter[0] | filter[7])) {
+      vert_tap = 6;
+    }
 
-  if (vert_tap == 6)
-    prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
-  else
-    prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
+    if (vert_tap == 6)
+      prepare_coeffs_6t_lowbd(filter_params_y, subpel_y_qn, coeffs);
+    else
+      prepare_coeffs_lowbd(filter_params_y, subpel_y_qn, coeffs);
 
-  // vert_filt as 4 tap
-  if (vert_tap == 4) {
-    const int fo_vert = 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+    // vert_filt as 4 tap
+    if (vert_tap == 4) {
+      const int fo_vert = 1;
+      const uint8_t *const src_ptr = src - fo_vert * src_stride;
+      for (j = 0; j < w; j += 16) {
+        const uint8_t *data = &src_ptr[j];
+        d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+        d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+        d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+        d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+        d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
 
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      const __m256i src_01a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+        // Load lines a and b. Line a to lower 128, line b to upper 128
+        const __m256i src_01a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
 
-      const __m256i src_12a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+        const __m256i src_12a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
 
-      const __m256i src_23a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+        const __m256i src_23a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
 
-      const __m256i src_34a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+        const __m256i src_34a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
 
-      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
 
-      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
-      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+        s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+        s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
 
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
+        for (i = 0; i < h; i += 2) {
+          data = &src_ptr[i * src_stride + j];
+          d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+          const __m256i src_45a = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
+
+          d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+          const __m256i src_56a = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
+
+          s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+          s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+          const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_lo = _mm256_sra_epi16(
+              _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+          if (w - j > 8) {
+            const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
+
+            /* rounding code */
+            // shift by F - 1
+            const __m256i res_16b_hi = _mm256_sra_epi16(
+                _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+            // 8 bit conversion and saturation to uint8
+            __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+            __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_a);
+            const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else {
+            const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+            if (w - j > 4) {
+              _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+              _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                               res_1);
+            } else if (w - j > 2) {
+              xx_storel_32(&dst[i * dst_stride + j], res_0);
+              xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+            } else {
+              __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+              __m128i *const p_1 =
+                  (__m128i *)&dst[i * dst_stride + j + dst_stride];
+              *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+              *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+            }
+          }
+          s[0] = s[1];
+          s[1] = s[2];
+
+          s[3] = s[4];
+          s[4] = s[5];
+        }
+      }
+    } else if (vert_tap == 6) {
+      const int fo_vert = vert_tap / 2 - 1;
+      const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+      for (j = 0; j < w; j += 16) {
+        const uint8_t *data = &src_ptr[j];
+        __m256i src6;
+
+        d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+        d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+        d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+        d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+        // Load lines a and b. Line a to lower 128, line b to upper 128
+        const __m256i src_01a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+        const __m256i src_12a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+        const __m256i src_23a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+        src6 = _mm256_castsi128_si256(
+            _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
+        const __m256i src_34a =
+            _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
+
+        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
+
+        s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
+        s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
+
+        for (i = 0; i < h; i += 2) {
+          data = &src_ptr[i * src_stride + j];
+          const __m256i src_45a = _mm256_permute2x128_si256(
+              src6,
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+              0x20);
+
+          src6 = _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
+          const __m256i src_56a = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
+              src6, 0x20);
+
+          s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
+          s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
+
+          const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+
+          /* rounding code */
+          // shift by F - 1
+          const __m256i res_16b_lo = _mm256_sra_epi16(
+              _mm256_add_epi16(res_lo, right_shift_const), right_shift);
+          // 8 bit conversion and saturation to uint8
+          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+
+          if (w - j > 8) {
+            const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+
+            /* rounding code */
+            // shift by F - 1
+            const __m256i res_16b_hi = _mm256_sra_epi16(
+                _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+            // 8 bit conversion and saturation to uint8
+            __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+
+            __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_a);
+            const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                             res_1);
+          } else {
+            const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+            if (w - j > 4) {
+              _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+              _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                               res_1);
+            } else if (w - j > 2) {
+              xx_storel_32(&dst[i * dst_stride + j], res_0);
+              xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+            } else {
+              __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+              __m128i *const p_1 =
+                  (__m128i *)&dst[i * dst_stride + j + dst_stride];
+              *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+              *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+            }
+          }
+          s[0] = s[1];
+          s[1] = s[2];
+          s[3] = s[4];
+          s[4] = s[5];
+        }
+      }
+    } else {
+      const int fo_vert = filter_params_y->taps / 2 - 1;
+      const uint8_t *const src_ptr = src - fo_vert * src_stride;
+
+      for (j = 0; j < w; j += 16) {
+        const uint8_t *data = &src_ptr[j];
+        __m256i src6;
+
+        d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+        d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+        d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+        d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+        d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
         d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+        // Load lines a and b. Line a to lower 128, line b to upper 128
+        const __m256i src_01a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
+
+        const __m256i src_12a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
+
+        const __m256i src_23a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
+
+        const __m256i src_34a = _mm256_permute2x128_si256(
+            _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
+
         const __m256i src_45a = _mm256_permute2x128_si256(
             _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
 
-        d[4] = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
-        const __m256i src_56a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(d[5]), _mm256_castsi128_si256(d[4]), 0x20);
-
-        s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-        const __m256i res_lo = convolve_lowbd_4tap(s, coeffs + 1);
-        /* rounding code */
-        // shift by F - 1
-        const __m256i res_16b_lo = _mm256_sra_epi16(
-            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
-        if (w - j > 8) {
-          const __m256i res_hi = convolve_lowbd_4tap(s + 3, coeffs + 1);
-
-          /* rounding code */
-          // shift by F - 1
-          const __m256i res_16b_hi = _mm256_sra_epi16(
-              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
-          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
-          const __m128i res_0 = _mm256_castsi256_si128(res_a);
-          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_1);
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                             res_1);
-          } else if (w - j > 2) {
-            xx_storel_32(&dst[i * dst_stride + j], res_0);
-            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
-          } else {
-            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-            __m128i *const p_1 =
-                (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-
-        s[3] = s[4];
-        s[4] = s[5];
-      }
-    }
-  } else if (vert_tap == 6) {
-    const int fo_vert = vert_tap / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      __m256i src6;
-
-      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      const __m256i src_01a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
-      const __m256i src_12a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
-      const __m256i src_23a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 4 * src_stride)));
-      const __m256i src_34a =
-          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[3]), src6, 0x20);
-
-      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-
-      s[3] = _mm256_unpackhi_epi8(src_01a, src_12a);
-      s[4] = _mm256_unpackhi_epi8(src_23a, src_34a);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-        const __m256i src_45a = _mm256_permute2x128_si256(
-            src6,
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-            0x20);
-
         src6 = _mm256_castsi128_si256(
             _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-        const __m256i src_56a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
-            src6, 0x20);
+        const __m256i src_56a =
+            _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
 
+        s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
+        s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
         s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-        s[5] = _mm256_unpackhi_epi8(src_45a, src_56a);
 
-        const __m256i res_lo = convolve_lowbd_6tap(s, coeffs);
+        s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
+        s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
+        s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
 
-        /* rounding code */
-        // shift by F - 1
-        const __m256i res_16b_lo = _mm256_sra_epi16(
-            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
+        for (i = 0; i < h; i += 2) {
+          data = &src_ptr[i * src_stride + j];
+          const __m256i src_67a = _mm256_permute2x128_si256(
+              src6,
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+              0x20);
 
-        if (w - j > 8) {
-          const __m256i res_hi = convolve_lowbd_6tap(s + 3, coeffs);
+          src6 = _mm256_castsi128_si256(
+              _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
+          const __m256i src_78a = _mm256_permute2x128_si256(
+              _mm256_castsi128_si256(
+                  _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
+              src6, 0x20);
+
+          s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
+          s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
+
+          const __m256i res_lo = convolve_lowbd(s, coeffs);
 
           /* rounding code */
           // shift by F - 1
-          const __m256i res_16b_hi = _mm256_sra_epi16(
-              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+          const __m256i res_16b_lo = _mm256_sra_epi16(
+              _mm256_add_epi16(res_lo, right_shift_const), right_shift);
           // 8 bit conversion and saturation to uint8
-          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
+          __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
 
-          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+          if (w - j > 8) {
+            const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
 
-          const __m128i res_0 = _mm256_castsi256_si128(res_a);
-          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+            /* rounding code */
+            // shift by F - 1
+            const __m256i res_16b_hi = _mm256_sra_epi16(
+                _mm256_add_epi16(res_hi, right_shift_const), right_shift);
+            // 8 bit conversion and saturation to uint8
+            __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
 
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_1);
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+            __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
+
+            const __m128i res_0 = _mm256_castsi256_si128(res_a);
+            const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
+
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
+            _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
                              res_1);
-          } else if (w - j > 2) {
-            xx_storel_32(&dst[i * dst_stride + j], res_0);
-            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
           } else {
-            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-            __m128i *const p_1 =
-                (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+            const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
+            const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
+            if (w - j > 4) {
+              _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
+              _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                               res_1);
+            } else if (w - j > 2) {
+              xx_storel_32(&dst[i * dst_stride + j], res_0);
+              xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
+            } else {
+              __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
+              __m128i *const p_1 =
+                  (__m128i *)&dst[i * dst_stride + j + dst_stride];
+              *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
+              *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
+            }
           }
+          s[0] = s[1];
+          s[1] = s[2];
+          s[2] = s[3];
+
+          s[4] = s[5];
+          s[5] = s[6];
+          s[6] = s[7];
         }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[3] = s[4];
-        s[4] = s[5];
-      }
-    }
-  } else {
-    const int fo_vert = filter_params_y->taps / 2 - 1;
-    const uint8_t *const src_ptr = src - fo_vert * src_stride;
-
-    for (j = 0; j < w; j += 16) {
-      const uint8_t *data = &src_ptr[j];
-      __m256i src6;
-
-      d[0] = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
-      d[1] = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
-      d[2] = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
-      d[3] = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
-      d[4] = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
-      d[5] = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
-      // Load lines a and b. Line a to lower 128, line b to upper 128
-      const __m256i src_01a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[0]), _mm256_castsi128_si256(d[1]), 0x20);
-
-      const __m256i src_12a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[1]), _mm256_castsi128_si256(d[2]), 0x20);
-
-      const __m256i src_23a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[2]), _mm256_castsi128_si256(d[3]), 0x20);
-
-      const __m256i src_34a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[3]), _mm256_castsi128_si256(d[4]), 0x20);
-
-      const __m256i src_45a = _mm256_permute2x128_si256(
-          _mm256_castsi128_si256(d[4]), _mm256_castsi128_si256(d[5]), 0x20);
-
-      src6 = _mm256_castsi128_si256(
-          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
-      const __m256i src_56a =
-          _mm256_permute2x128_si256(_mm256_castsi128_si256(d[5]), src6, 0x20);
-
-      s[0] = _mm256_unpacklo_epi8(src_01a, src_12a);
-      s[1] = _mm256_unpacklo_epi8(src_23a, src_34a);
-      s[2] = _mm256_unpacklo_epi8(src_45a, src_56a);
-
-      s[4] = _mm256_unpackhi_epi8(src_01a, src_12a);
-      s[5] = _mm256_unpackhi_epi8(src_23a, src_34a);
-      s[6] = _mm256_unpackhi_epi8(src_45a, src_56a);
-
-      for (i = 0; i < h; i += 2) {
-        data = &src_ptr[i * src_stride + j];
-        const __m256i src_67a = _mm256_permute2x128_si256(
-            src6,
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            0x20);
-
-        src6 = _mm256_castsi128_si256(
-            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));
-        const __m256i src_78a = _mm256_permute2x128_si256(
-            _mm256_castsi128_si256(
-                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
-            src6, 0x20);
-
-        s[3] = _mm256_unpacklo_epi8(src_67a, src_78a);
-        s[7] = _mm256_unpackhi_epi8(src_67a, src_78a);
-
-        const __m256i res_lo = convolve_lowbd(s, coeffs);
-
-        /* rounding code */
-        // shift by F - 1
-        const __m256i res_16b_lo = _mm256_sra_epi16(
-            _mm256_add_epi16(res_lo, right_shift_const), right_shift);
-        // 8 bit conversion and saturation to uint8
-        __m256i res_8b_lo = _mm256_packus_epi16(res_16b_lo, res_16b_lo);
-
-        if (w - j > 8) {
-          const __m256i res_hi = convolve_lowbd(s + 4, coeffs);
-
-          /* rounding code */
-          // shift by F - 1
-          const __m256i res_16b_hi = _mm256_sra_epi16(
-              _mm256_add_epi16(res_hi, right_shift_const), right_shift);
-          // 8 bit conversion and saturation to uint8
-          __m256i res_8b_hi = _mm256_packus_epi16(res_16b_hi, res_16b_hi);
-
-          __m256i res_a = _mm256_unpacklo_epi64(res_8b_lo, res_8b_hi);
-
-          const __m128i res_0 = _mm256_castsi256_si128(res_a);
-          const __m128i res_1 = _mm256_extracti128_si256(res_a, 1);
-
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_0);
-          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                           res_1);
-        } else {
-          const __m128i res_0 = _mm256_castsi256_si128(res_8b_lo);
-          const __m128i res_1 = _mm256_extracti128_si256(res_8b_lo, 1);
-          if (w - j > 4) {
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_0);
-            _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
-                             res_1);
-          } else if (w - j > 2) {
-            xx_storel_32(&dst[i * dst_stride + j], res_0);
-            xx_storel_32(&dst[i * dst_stride + j + dst_stride], res_1);
-          } else {
-            __m128i *const p_0 = (__m128i *)&dst[i * dst_stride + j];
-            __m128i *const p_1 =
-                (__m128i *)&dst[i * dst_stride + j + dst_stride];
-            *(uint16_t *)p_0 = (uint16_t)_mm_cvtsi128_si32(res_0);
-            *(uint16_t *)p_1 = (uint16_t)_mm_cvtsi128_si32(res_1);
-          }
-        }
-        s[0] = s[1];
-        s[1] = s[2];
-        s[2] = s[3];
-
-        s[4] = s[5];
-        s[5] = s[6];
-        s[6] = s[7];
       }
     }
   }
diff --git a/av1/common/x86/convolve_sse2.c b/av1/common/x86/convolve_sse2.c
index 5bf456e..cd5521e 100644
--- a/av1/common/x86/convolve_sse2.c
+++ b/av1/common/x86/convolve_sse2.c
@@ -75,72 +75,91 @@
   return convolve(ss, coeffs);
 }
 
-void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
-                            int dst_stride, int w, int h,
-                            const InterpFilterParams *filter_params_y,
-                            const int subpel_y_qn) {
+void av1_convolve_y_sr_12tap_sse2(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams *filter_params_y,
+                                  int subpel_y_qn) {
   const int fo_vert = filter_params_y->taps / 2 - 1;
   const uint8_t *src_ptr = src - fo_vert * src_stride;
   const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
   const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
-  __m128i coeffs[4];
+  __m128i coeffs[6];
 
-  prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+  prepare_coeffs_12tap(filter_params_y, subpel_y_qn, coeffs);
 
-  if (w <= 4) {
-    __m128i s[8], src6, res, res_round, res16;
-    uint32_t res_int;
-    src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
-    s[0] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
-    s[1] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
-    s[2] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
-    s[3] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
-    s[4] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
-    s[5] = _mm_unpacklo_epi8(
-        _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
+  int j = 0;
+  do {
+    __m128i s[12], src10, res_lo, res_hi;
+    __m128i res_lo_round, res_hi_round, res16, res;
+    const uint8_t *data = &src_ptr[j];
 
+    src10 = _mm_loadl_epi64((__m128i *)(data + 10 * src_stride));
+    s[0] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+    s[1] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+    s[2] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+    s[3] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+    s[4] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+    s[5] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 5 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 6 * src_stride)));
+    s[6] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 6 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+    s[7] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 7 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 8 * src_stride)));
+    s[8] =
+        _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(data + 8 * src_stride)),
+                          _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)));
+    s[9] = _mm_unpacklo_epi8(
+        _mm_loadl_epi64((__m128i *)(data + 9 * src_stride)), src10);
+
+    int i = 0;
     do {
-      s[6] = _mm_unpacklo_epi8(
-          src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
-      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
-      s[7] = _mm_unpacklo_epi8(
-          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
+      data = &src_ptr[i * src_stride + j];
+      s[10] = _mm_unpacklo_epi8(
+          src10, _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)));
+      src10 = _mm_loadl_epi64((__m128i *)(data + 12 * src_stride));
+      s[11] = _mm_unpacklo_epi8(
+          _mm_loadl_epi64((__m128i *)(data + 11 * src_stride)), src10);
 
-      res = convolve_lo_y(s + 0, coeffs);
-      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
-      res16 = _mm_packs_epi32(res_round, res_round);
-      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+      res_lo = convolve_lo_y_12tap(s, coeffs);  // Filter low index pixels
+      res_hi = convolve_hi_y_12tap(s, coeffs);  // Filter high index pixels
 
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)res_int;
-      else
-        *(uint32_t *)dst = res_int;
+      res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+      res_hi_round =
+          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
 
-      src_ptr += src_stride;
-      dst += dst_stride;
+      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+      res = _mm_packus_epi16(res16, res16);
 
-      res = convolve_lo_y(s + 1, coeffs);
-      res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
-      res16 = _mm_packs_epi32(res_round, res_round);
-      res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
+      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+      i++;
 
-      if (w == 2)
-        *(uint16_t *)dst = (uint16_t)res_int;
-      else
-        *(uint32_t *)dst = res_int;
+      res_lo = convolve_lo_y_12tap(s + 1, coeffs);  // Filter low index pixels
+      res_hi = convolve_hi_y_12tap(s + 1, coeffs);  // Filter high index pixels
 
-      src_ptr += src_stride;
-      dst += dst_stride;
+      res_lo_round =
+          _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+      res_hi_round =
+          _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+      res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+      res = _mm_packus_epi16(res16, res16);
+
+      _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+      i++;
 
       s[0] = s[2];
       s[1] = s[3];
@@ -148,71 +167,90 @@
       s[3] = s[5];
       s[4] = s[6];
       s[5] = s[7];
-      h -= 2;
-    } while (h);
+      s[6] = s[8];
+      s[7] = s[9];
+      s[8] = s[10];
+      s[9] = s[11];
+    } while (i < h);
+    j += 8;
+  } while (j < w);
+}
+
+void av1_convolve_y_sr_sse2(const uint8_t *src, int src_stride, uint8_t *dst,
+                            int dst_stride, int w, int h,
+                            const InterpFilterParams *filter_params_y,
+                            const int subpel_y_qn) {
+  if (filter_params_y->taps > 8) {
+    if (w < 8) {
+      av1_convolve_y_sr_c(src, src_stride, dst, dst_stride, w, h,
+                          filter_params_y, subpel_y_qn);
+    } else {
+      av1_convolve_y_sr_12tap_sse2(src, src_stride, dst, dst_stride, w, h,
+                                   filter_params_y, subpel_y_qn);
+    }
   } else {
-    assert(!(w % 8));
-    int j = 0;
-    do {
-      __m128i s[8], src6, res_lo, res_hi;
-      __m128i res_lo_round, res_hi_round, res16, res;
-      const uint8_t *data = &src_ptr[j];
+    const int fo_vert = filter_params_y->taps / 2 - 1;
+    const uint8_t *src_ptr = src - fo_vert * src_stride;
+    const __m128i round_const = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+    const __m128i round_shift = _mm_cvtsi32_si128(FILTER_BITS);
+    __m128i coeffs[4];
 
-      src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+    prepare_coeffs(filter_params_y, subpel_y_qn, coeffs);
+
+    if (w <= 4) {
+      __m128i s[8], src6, res, res_round, res16;
+      uint32_t res_int;
+      src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 6 * src_stride));
       s[0] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 0 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)));
       s[1] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 1 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)));
       s[2] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 2 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)));
       s[3] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 3 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)));
       s[4] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 4 * src_stride)),
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)));
       s[5] = _mm_unpacklo_epi8(
-          _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+          _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 5 * src_stride)), src6);
 
-      int i = 0;
       do {
-        data = &src_ptr[i * src_stride + j];
         s[6] = _mm_unpacklo_epi8(
-            src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
-        src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+            src6, _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)));
+        src6 = _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 8 * src_stride));
         s[7] = _mm_unpacklo_epi8(
-            _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+            _mm_cvtsi32_si128(*(uint32_t *)(src_ptr + 7 * src_stride)), src6);
 
-        res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
+        res = convolve_lo_y(s + 0, coeffs);
+        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+        res16 = _mm_packs_epi32(res_round, res_round);
+        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
 
-        res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+        if (w == 2)
+          *(uint16_t *)dst = (uint16_t)res_int;
+        else
+          *(uint32_t *)dst = res_int;
 
-        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res = _mm_packus_epi16(res16, res16);
+        src_ptr += src_stride;
+        dst += dst_stride;
 
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        i++;
+        res = convolve_lo_y(s + 1, coeffs);
+        res_round = _mm_sra_epi32(_mm_add_epi32(res, round_const), round_shift);
+        res16 = _mm_packs_epi32(res_round, res_round);
+        res_int = _mm_cvtsi128_si32(_mm_packus_epi16(res16, res16));
 
-        res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
-        res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
+        if (w == 2)
+          *(uint16_t *)dst = (uint16_t)res_int;
+        else
+          *(uint32_t *)dst = res_int;
 
-        res_lo_round =
-            _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
-        res_hi_round =
-            _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
-
-        res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
-        res = _mm_packus_epi16(res16, res16);
-
-        _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
-        i++;
+        src_ptr += src_stride;
+        dst += dst_stride;
 
         s[0] = s[2];
         s[1] = s[3];
@@ -220,9 +258,82 @@
         s[3] = s[5];
         s[4] = s[6];
         s[5] = s[7];
-      } while (i < h);
-      j += 8;
-    } while (j < w);
+        h -= 2;
+      } while (h);
+    } else {
+      assert(!(w % 8));
+      int j = 0;
+      do {
+        __m128i s[8], src6, res_lo, res_hi;
+        __m128i res_lo_round, res_hi_round, res16, res;
+        const uint8_t *data = &src_ptr[j];
+
+        src6 = _mm_loadl_epi64((__m128i *)(data + 6 * src_stride));
+        s[0] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 0 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)));
+        s[1] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 1 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)));
+        s[2] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 2 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)));
+        s[3] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 3 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)));
+        s[4] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 4 * src_stride)),
+            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)));
+        s[5] = _mm_unpacklo_epi8(
+            _mm_loadl_epi64((__m128i *)(data + 5 * src_stride)), src6);
+
+        int i = 0;
+        do {
+          data = &src_ptr[i * src_stride + j];
+          s[6] = _mm_unpacklo_epi8(
+              src6, _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)));
+          src6 = _mm_loadl_epi64((__m128i *)(data + 8 * src_stride));
+          s[7] = _mm_unpacklo_epi8(
+              _mm_loadl_epi64((__m128i *)(data + 7 * src_stride)), src6);
+
+          res_lo = convolve_lo_y(s, coeffs);  // Filter low index pixels
+          res_hi = convolve_hi_y(s, coeffs);  // Filter high index pixels
+
+          res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+          res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          res = _mm_packus_epi16(res16, res16);
+
+          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+          i++;
+
+          res_lo = convolve_lo_y(s + 1, coeffs);  // Filter low index pixels
+          res_hi = convolve_hi_y(s + 1, coeffs);  // Filter high index pixels
+
+          res_lo_round =
+              _mm_sra_epi32(_mm_add_epi32(res_lo, round_const), round_shift);
+          res_hi_round =
+              _mm_sra_epi32(_mm_add_epi32(res_hi, round_const), round_shift);
+
+          res16 = _mm_packs_epi32(res_lo_round, res_hi_round);
+          res = _mm_packus_epi16(res16, res16);
+
+          _mm_storel_epi64((__m128i *)(dst + i * dst_stride + j), res);
+          i++;
+
+          s[0] = s[2];
+          s[1] = s[3];
+          s[2] = s[4];
+          s[3] = s[5];
+          s[4] = s[6];
+          s[5] = s[7];
+        } while (i < h);
+        j += 8;
+      } while (j < w);
+    }
   }
 }
 
diff --git a/test/av1_convolve_test.cc b/test/av1_convolve_test.cc
index 06f05e9..3ae4504 100644
--- a/test/av1_convolve_test.cc
+++ b/test/av1_convolve_test.cc
@@ -498,7 +498,7 @@
  public:
   void RunTest() {
     for (int sub_y = 0; sub_y < 16; ++sub_y) {
-      for (int filter = EIGHTTAP_REGULAR; filter < INTERP_FILTERS_ALL;
+      for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
            ++filter) {
         InterpFilter f = static_cast<InterpFilter>(filter);
         TestConvolve(sub_y, f);
@@ -506,6 +506,15 @@
     }
   }
 
+ public:
+  void SpeedTest() {
+    for (int filter = EIGHTTAP_REGULAR; filter <= INTERP_FILTERS_ALL;
+         ++filter) {
+      InterpFilter f = static_cast<InterpFilter>(filter);
+      TestConvolveSpeed(f, 10000);
+    }
+  }
+
  private:
   void TestConvolve(const int sub_y, const InterpFilter filter) {
     const int width = GetParam().Block().Width();
@@ -515,17 +524,51 @@
         av1_get_interp_filter_params_with_block_size(filter, height);
     const uint8_t *input = FirstRandomInput8(GetParam());
     DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
-    av1_convolve_y_sr(input, width, reference, kOutputStride, width, height,
-                      filter_params_y, sub_y);
+    av1_convolve_y_sr_c(input, width, reference, kOutputStride, width, height,
+                        filter_params_y, sub_y);
     DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
     GetParam().TestFunction()(input, width, test, kOutputStride, width, height,
                               filter_params_y, sub_y);
     AssertOutputBufferEq(reference, test, width, height);
   }
+
+ private:
+  void TestConvolveSpeed(const InterpFilter filter, const int num_iters) {
+    const int width = GetParam().Block().Width();
+    const int height = GetParam().Block().Height();
+
+    const InterpFilterParams *filter_params_y =
+        av1_get_interp_filter_params_with_block_size(filter, height);
+    const uint8_t *input = FirstRandomInput8(GetParam());
+    DECLARE_ALIGNED(32, uint8_t, reference[MAX_SB_SQUARE]);
+
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      av1_convolve_y_sr_c(input, width, reference, kOutputStride, width, height,
+                          filter_params_y, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time1 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+
+    DECLARE_ALIGNED(32, uint8_t, test[MAX_SB_SQUARE]);
+
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_iters; ++i) {
+      GetParam().TestFunction()(input, width, test, kOutputStride, width,
+                                height, filter_params_y, 0);
+    }
+    aom_usec_timer_mark(&timer);
+    const double time2 = static_cast<double>(aom_usec_timer_elapsed(&timer));
+    printf("%d %3dx%-3d:%7.2f/%7.2fus (%3.2f)\n", filter, width, height, time1,
+           time2, time1 / time2);
+  }
 };
 
 TEST_P(AV1ConvolveYTest, RunTest) { RunTest(); }
 
+TEST_P(AV1ConvolveYTest, DISABLED_SpeedTest) { SpeedTest(); }
+
 INSTANTIATE_TEST_SUITE_P(C, AV1ConvolveYTest,
                          BuildLowbdParams(av1_convolve_y_sr_c));