Remove no longer needed special case in Neon convolutions

The neon_dotprod and neon_i8mm implementations of some convolution
functions have a special case to handle the no-op filter
{ 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }, since its single non-zero
tap of 128 does not fit in a signed 8-bit integer. This no-op filter
is never used in practice - the unit tests were updated to reflect
this in a previous patch - so remove the special-case code.
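
A minimal sketch (not part of the patch) of why the special case
existed: narrowing the 16-bit filter taps to 8-bit with vmovn_s16
keeps only the low byte of each lane, so the no-op tap 128 (0x0080)
reinterprets as -128 in a signed 8-bit lane. The tap layout below is
taken from the removed comments; the standalone demo is hypothetical.

  #include <arm_neon.h>
  #include <stdio.h>

  int main(void) {
    // First eight taps of the no-op filter; tap 5 holds the value 128.
    const int16_t no_op_filter[8] = { 0, 0, 0, 0, 0, 128, 0, 0 };
    // vmovn_s16 truncates each 16-bit lane to its low 8 bits.
    int8x8_t narrowed = vmovn_s16(vld1q_s16(no_op_filter));
    // Prints -128: the tap value 128 is unrepresentable in int8, which
    // is why the signed 8-bit dot-product paths needed a special case.
    printf("%d\n", vget_lane_s8(narrowed, 5));
    return 0;
  }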

Change-Id: Ifd43a4f752a6cb9de8bbfbe7b4577764a4208d7e
diff --git a/av1/common/arm/convolve_neon_dotprod.c b/av1/common/arm/convolve_neon_dotprod.c
index b558744..9c50890 100644
--- a/av1/common/arm/convolve_neon_dotprod.c
+++ b/av1/common/arm/convolve_neon_dotprod.c
@@ -105,6 +105,9 @@
 static INLINE void convolve_x_sr_12tap_neon_dotprod(
     const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w,
     int h, const int16_t *x_filter_ptr) {
+  // The no-op filter should never be used here.
+  assert(x_filter_ptr[5] != 128);
+
   const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
   const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
   const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
@@ -120,89 +123,60 @@
   const uint8x16_t range_limit = vdupq_n_u8(128);
   const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
 
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(filter_0_7, 5) == 128) {
-    // Undo the horizontal offset in the calling function.
-    src += 5;
+  if (w <= 4) {
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
 
+      int16x4_t d0 =
+          convolve12_4_x(s0, filter, correction, range_limit, permute_tbl);
+      int16x4_t d1 =
+          convolve12_4_x(s1, filter, correction, range_limit, permute_tbl);
+      int16x4_t d2 =
+          convolve12_4_x(s2, filter, correction, range_limit, permute_tbl);
+      int16x4_t d3 =
+          convolve12_4_x(s3, filter, correction, range_limit, permute_tbl);
+
+      uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+      uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
     do {
       const uint8_t *s = src;
       uint8_t *d = dst;
       int width = w;
 
       do {
-        uint8x8_t d0 = vld1_u8(s);
-        if (w == 4) {
-          store_u8_4x1(d, d0);
-        } else {
-          vst1_u8(d, d0);
-        }
+        uint8x16_t s0[2], s1[2], s2[2], s3[2];
+        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+        uint8x8_t d0 =
+            convolve12_8_x(s0, filter, correction, range_limit, permute_tbl);
+        uint8x8_t d1 =
+            convolve12_8_x(s1, filter, correction, range_limit, permute_tbl);
+        uint8x8_t d2 =
+            convolve12_8_x(s2, filter, correction, range_limit, permute_tbl);
+        uint8x8_t d3 =
+            convolve12_8_x(s3, filter, correction, range_limit, permute_tbl);
+
+        store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
 
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
-      src += src_stride;
-      dst += dst_stride;
-    } while (--h != 0);
-  } else {
-    if (w <= 4) {
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-        int16x4_t d0 =
-            convolve12_4_x(s0, filter, correction, range_limit, permute_tbl);
-        int16x4_t d1 =
-            convolve12_4_x(s1, filter, correction, range_limit, permute_tbl);
-        int16x4_t d2 =
-            convolve12_4_x(s2, filter, correction, range_limit, permute_tbl);
-        int16x4_t d3 =
-            convolve12_4_x(s3, filter, correction, range_limit, permute_tbl);
-
-        uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
-        uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
-
-        store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
-        store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
-
-        dst += 4 * dst_stride;
-        src += 4 * src_stride;
-        h -= 4;
-      } while (h != 0);
-    } else {
-      do {
-        const uint8_t *s = src;
-        uint8_t *d = dst;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2], s1[2], s2[2], s3[2];
-          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
-          uint8x8_t d0 =
-              convolve12_8_x(s0, filter, correction, range_limit, permute_tbl);
-          uint8x8_t d1 =
-              convolve12_8_x(s1, filter, correction, range_limit, permute_tbl);
-          uint8x8_t d2 =
-              convolve12_8_x(s2, filter, correction, range_limit, permute_tbl);
-          uint8x8_t d3 =
-              convolve12_8_x(s3, filter, correction, range_limit, permute_tbl);
-
-          store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width != 0);
-        src += 4 * src_stride;
-        dst += 4 * dst_stride;
-        h -= 4;
-      } while (h != 0);
-    }
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   }
 }
 
@@ -520,44 +494,100 @@
 static INLINE void convolve_y_sr_12tap_neon_dotprod(
     const uint8_t *src_ptr, int src_stride, uint8_t *dst_ptr, int dst_stride,
     int w, int h, const int16_t *y_filter_ptr) {
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (y_filter_ptr[5] == 128) {
-    // Undo the vertical offset in the calling function.
-    src_ptr += 5 * src_stride;
+  // The no-op filter should never be used here.
+  assert(y_filter_ptr[5] != 128);
+
+  const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
+  const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));
+
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
+
+  if (w == 4) {
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
+    load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
+                 &t8, &t9, &tA);
+    src_ptr += 11 * src_stride;
+
+    // Transform sample range to [-128, 127] for 8-bit signed dot product.
+    int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
+    int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
+    int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
+    int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
+    int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
+    int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
+    int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
+    int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
+    int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
+    int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
+    int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
+
+    int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789);
+    transpose_concat_4x4(s7, s8, s9, sA, &s789A);
 
     do {
+      uint8x8_t tB, tC, tD, tE;
+      load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE);
+
+      int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
+      int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
+      int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
+      int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
+
+      int8x16_t s89AB, s9ABC, sABCD, sBCDE;
+      transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
+
+      // Merge new data into block from previous iteration.
+      int8x16x2_t samples_LUT = { { s789A, sBCDE } };
+      s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
+      s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
+      sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 =
+          convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
+      int16x4_t d1 =
+          convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
+      int16x4_t d2 =
+          convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
+      int16x4_t d3 =
+          convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s789A;
+      s4567 = s89AB;
+      s5678 = s9ABC;
+      s6789 = sABCD;
+      s789A = sBCDE;
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
       const uint8_t *s = src_ptr;
       uint8_t *d = dst_ptr;
-      int width = w;
 
-      do {
-        uint8x8_t d0 = vld1_u8(s);
-        if (w == 4) {
-          store_u8_4x1(d, d0);
-        } else {
-          vst1_u8(d, d0);
-        }
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  } else {
-    const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
-    const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));
-
-    const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
-
-    if (w == 4) {
       uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
-      load_u8_8x11(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7,
-                   &t8, &t9, &tA);
-      src_ptr += 11 * src_stride;
+      load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
+                   &t9, &tA);
+      s += 11 * src_stride;
 
       // Transform sample range to [-128, 127] for 8-bit signed dot product.
       int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
@@ -572,169 +602,87 @@
       int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
       int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
 
-      int8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
-      transpose_concat_4x4(s0, s1, s2, s3, &s0123);
-      transpose_concat_4x4(s1, s2, s3, s4, &s1234);
-      transpose_concat_4x4(s2, s3, s4, s5, &s2345);
-      transpose_concat_4x4(s3, s4, s5, s6, &s3456);
-      transpose_concat_4x4(s4, s5, s6, s7, &s4567);
-      transpose_concat_4x4(s5, s6, s7, s8, &s5678);
-      transpose_concat_4x4(s6, s7, s8, s9, &s6789);
-      transpose_concat_4x4(s7, s8, s9, sA, &s789A);
+      // This operation combines a conventional transpose and the sample
+      // permute (see horizontal case) required before computing the dot
+      // product.
+      int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+          s6789_hi, s789A_lo, s789A_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
+      transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
 
       do {
         uint8x8_t tB, tC, tD, tE;
-        load_u8_8x4(src_ptr, src_stride, &tB, &tC, &tD, &tE);
+        load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE);
 
         int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
         int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
         int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
         int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
 
-        int8x16_t s89AB, s9ABC, sABCD, sBCDE;
-        transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
+        int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
+            sBCDE_lo, sBCDE_hi;
+        transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
 
         // Merge new data into block from previous iteration.
-        int8x16x2_t samples_LUT = { { s789A, sBCDE } };
-        s89AB = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[0]);
-        s9ABC = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[1]);
-        sABCD = vqtbl2q_s8(samples_LUT, merge_block_tbl.val[2]);
+        int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
+        s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
+        s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
+        sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
 
-        int16x4_t d0 =
-            convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
-        int16x4_t d1 =
-            convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
-        int16x4_t d2 =
-            convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
-        int16x4_t d3 =
-            convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
-        uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-        uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+        int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
+        s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
+        s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
+        sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
 
-        store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
-        store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+        uint8x8_t d0 =
+            convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
+                           s89AB_hi, filter_0_7, filter_4_11);
+        uint8x8_t d1 =
+            convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
+                           s9ABC_hi, filter_0_7, filter_4_11);
+        uint8x8_t d2 =
+            convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
+                           sABCD_hi, filter_0_7, filter_4_11);
+        uint8x8_t d3 =
+            convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
+                           sBCDE_hi, filter_0_7, filter_4_11);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         // Prepare block for next iteration - re-using as much as possible.
         // Shuffle everything up four rows.
-        s0123 = s4567;
-        s1234 = s5678;
-        s2345 = s6789;
-        s3456 = s789A;
-        s4567 = s89AB;
-        s5678 = s9ABC;
-        s6789 = sABCD;
-        s789A = sBCDE;
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s789A_lo;
+        s3456_hi = s789A_hi;
+        s4567_lo = s89AB_lo;
+        s4567_hi = s89AB_hi;
+        s5678_lo = s9ABC_lo;
+        s5678_hi = s9ABC_hi;
+        s6789_lo = sABCD_lo;
+        s6789_hi = sABCD_hi;
+        s789A_lo = sBCDE_lo;
+        s789A_hi = sBCDE_hi;
 
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h != 0);
-    } else {
-      do {
-        int height = h;
-        const uint8_t *s = src_ptr;
-        uint8_t *d = dst_ptr;
-
-        uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, tA;
-        load_u8_8x11(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7, &t8,
-                     &t9, &tA);
-        s += 11 * src_stride;
-
-        // Transform sample range to [-128, 127] for 8-bit signed dot product.
-        int8x8_t s0 = vreinterpret_s8_u8(vsub_u8(t0, vdup_n_u8(128)));
-        int8x8_t s1 = vreinterpret_s8_u8(vsub_u8(t1, vdup_n_u8(128)));
-        int8x8_t s2 = vreinterpret_s8_u8(vsub_u8(t2, vdup_n_u8(128)));
-        int8x8_t s3 = vreinterpret_s8_u8(vsub_u8(t3, vdup_n_u8(128)));
-        int8x8_t s4 = vreinterpret_s8_u8(vsub_u8(t4, vdup_n_u8(128)));
-        int8x8_t s5 = vreinterpret_s8_u8(vsub_u8(t5, vdup_n_u8(128)));
-        int8x8_t s6 = vreinterpret_s8_u8(vsub_u8(t6, vdup_n_u8(128)));
-        int8x8_t s7 = vreinterpret_s8_u8(vsub_u8(t7, vdup_n_u8(128)));
-        int8x8_t s8 = vreinterpret_s8_u8(vsub_u8(t8, vdup_n_u8(128)));
-        int8x8_t s9 = vreinterpret_s8_u8(vsub_u8(t9, vdup_n_u8(128)));
-        int8x8_t sA = vreinterpret_s8_u8(vsub_u8(tA, vdup_n_u8(128)));
-
-        // This operation combines a conventional transpose and the sample
-        // permute (see horizontal case) required before computing the dot
-        // product.
-        int8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-            s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi,
-            s6789_lo, s6789_hi, s789A_lo, s789A_hi;
-        transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
-        transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
-        transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
-        transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
-        transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
-        transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
-        transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
-        transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
-
-        do {
-          uint8x8_t tB, tC, tD, tE;
-          load_u8_8x4(s, src_stride, &tB, &tC, &tD, &tE);
-
-          int8x8_t sB = vreinterpret_s8_u8(vsub_u8(tB, vdup_n_u8(128)));
-          int8x8_t sC = vreinterpret_s8_u8(vsub_u8(tC, vdup_n_u8(128)));
-          int8x8_t sD = vreinterpret_s8_u8(vsub_u8(tD, vdup_n_u8(128)));
-          int8x8_t sE = vreinterpret_s8_u8(vsub_u8(tE, vdup_n_u8(128)));
-
-          int8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
-              sBCDE_lo, sBCDE_hi;
-          transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
-
-          // Merge new data into block from previous iteration.
-          int8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
-          s89AB_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[0]);
-          s9ABC_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[1]);
-          sABCD_lo = vqtbl2q_s8(samples_LUT_lo, merge_block_tbl.val[2]);
-
-          int8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
-          s89AB_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[0]);
-          s9ABC_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[1]);
-          sABCD_hi = vqtbl2q_s8(samples_LUT_hi, merge_block_tbl.val[2]);
-
-          uint8x8_t d0 =
-              convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
-                             s89AB_hi, filter_0_7, filter_4_11);
-          uint8x8_t d1 =
-              convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
-                             s9ABC_hi, filter_0_7, filter_4_11);
-          uint8x8_t d2 =
-              convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
-                             sABCD_hi, filter_0_7, filter_4_11);
-          uint8x8_t d3 =
-              convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
-                             sBCDE_hi, filter_0_7, filter_4_11);
-
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          // Prepare block for next iteration - re-using as much as possible.
-          // Shuffle everything up four rows.
-          s0123_lo = s4567_lo;
-          s0123_hi = s4567_hi;
-          s1234_lo = s5678_lo;
-          s1234_hi = s5678_hi;
-          s2345_lo = s6789_lo;
-          s2345_hi = s6789_hi;
-          s3456_lo = s789A_lo;
-          s3456_hi = s789A_hi;
-          s4567_lo = s89AB_lo;
-          s4567_hi = s89AB_hi;
-          s5678_lo = s9ABC_lo;
-          s5678_hi = s9ABC_hi;
-          s6789_lo = sABCD_lo;
-          s6789_hi = sABCD_hi;
-          s789A_lo = sBCDE_lo;
-          s789A_hi = sBCDE_hi;
-
-          s += 4 * src_stride;
-          d += 4 * dst_stride;
-          height -= 4;
-        } while (height != 0);
-        src_ptr += 8;
-        dst_ptr += 8;
-        w -= 8;
-      } while (w != 0);
-    }
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
 
@@ -1026,15 +974,88 @@
     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
     const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
     const int16x4_t x_filter_8_11) {
+  // The no-op filter should never be used here.
+  assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
+
   const int bd = 8;
 
-  // Special case the following no-op filter as 128 won't fit into the 8-bit
-  // signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
-    const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
-    // Undo the horizontal offset in the calling function.
-    src_ptr += 5;
+  // Narrow filter values to 8-bit.
+  const int16x8x2_t x_filter_s16 = {
+    { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+  };
+  const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+                                         vmovn_s16(x_filter_s16.val[1]));
+
+  // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+  // shifts - which are generally faster than rounding shifts on modern CPUs.
+  const int32_t horiz_const =
+      ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
+  // Dot product constants.
+  const int32x4_t correction = vdupq_n_s32((128 << FILTER_BITS) + horiz_const);
+  const uint8x16_t range_limit = vdupq_n_u8(128);
+  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
+
+  if (w <= 4) {
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 =
+          convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d1 =
+          convolve12_4_2d_h(s1, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d2 =
+          convolve12_4_2d_h(s2, x_filter, correction, range_limit, permute_tbl);
+      int16x4_t d3 =
+          convolve12_4_2d_h(s3, x_filter, correction, range_limit, permute_tbl);
+
+      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
+
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+      int16x4_t d0 =
+          convolve12_4_2d_h(s0, x_filter, correction, range_limit, permute_tbl);
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0[2], s1[2], s2[2], s3[2];
+        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+        int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit,
+                                         permute_tbl);
+        int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction, range_limit,
+                                         permute_tbl);
+        int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction, range_limit,
+                                         permute_tbl);
+        int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction, range_limit,
+                                         permute_tbl);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
 
     do {
       const uint8_t *s = src_ptr;
@@ -1042,123 +1063,20 @@
       int width = w;
 
       do {
-        uint8x8_t s0 = vld1_u8(s);
-        uint16x8_t d0 = vaddw_u8(horiz_const, s0);
-        d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
-        // Store 8 elements to avoid additional branches. This is safe if the
-        // actual block width is < 8 because the intermediate buffer is large
-        // enough to accommodate 128x128 blocks.
-        vst1q_s16(d, vreinterpretq_s16_u16(d0));
+        uint8x16_t s0[2];
+        s0[0] = vld1q_u8(s);
+        s0[1] = vld1q_u8(s + 4);
+        int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction, range_limit,
+                                         permute_tbl);
+        vst1q_s16(d, d0);
 
-        d += 8;
         s += 8;
+        d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src_ptr += src_stride;
       dst_ptr += dst_stride;
     } while (--h != 0);
-
-  } else {
-    // Narrow filter values to 8-bit.
-    const int16x8x2_t x_filter_s16 = {
-      { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
-    };
-    const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
-                                           vmovn_s16(x_filter_s16.val[1]));
-
-    // Adding a shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
-    // shifts - which are generally faster than rounding shifts on modern CPUs.
-    const int32_t horiz_const =
-        ((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
-    // Dot product constants.
-    const int32x4_t correction =
-        vdupq_n_s32((128 << FILTER_BITS) + horiz_const);
-    const uint8x16_t range_limit = vdupq_n_u8(128);
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
-
-    if (w <= 4) {
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-        int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
-                                         permute_tbl);
-        int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, correction, range_limit,
-                                         permute_tbl);
-        int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, correction, range_limit,
-                                         permute_tbl);
-        int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, correction, range_limit,
-                                         permute_tbl);
-
-        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h > 4);
-
-      do {
-        uint8x16_t s0 = vld1q_u8(src_ptr);
-        int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, correction, range_limit,
-                                         permute_tbl);
-        vst1_s16(dst_ptr, d0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      } while (--h != 0);
-
-    } else {
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2], s1[2], s2[2], s3[2];
-          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
-          int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
-                                           range_limit, permute_tbl);
-          int16x8_t d1 = convolve12_8_2d_h(s1, x_filter, correction,
-                                           range_limit, permute_tbl);
-          int16x8_t d2 = convolve12_8_2d_h(s2, x_filter, correction,
-                                           range_limit, permute_tbl);
-          int16x8_t d3 = convolve12_8_2d_h(s3, x_filter, correction,
-                                           range_limit, permute_tbl);
-
-          store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width != 0);
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h > 4);
-
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2];
-          s0[0] = vld1q_u8(s);
-          s0[1] = vld1q_u8(s + 4);
-          int16x8_t d0 = convolve12_8_2d_h(s0, x_filter, correction,
-                                           range_limit, permute_tbl);
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width != 0);
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      } while (--h != 0);
-    }
   }
 }
 
diff --git a/av1/common/arm/convolve_neon_i8mm.c b/av1/common/arm/convolve_neon_i8mm.c
index b2f489f..7ba8b66 100644
--- a/av1/common/arm/convolve_neon_i8mm.c
+++ b/av1/common/arm/convolve_neon_i8mm.c
@@ -95,94 +95,68 @@
                                                  int src_stride, uint8_t *dst,
                                                  int dst_stride, int w, int h,
                                                  const int16_t *x_filter_ptr) {
+  // The no-op filter should never be used here.
+  assert(x_filter_ptr[5] != 128);
+
   const int16x8_t filter_0_7 = vld1q_s16(x_filter_ptr);
   const int16x4_t filter_8_11 = vld1_s16(x_filter_ptr + 8);
   const int16x8_t filter_8_15 = vcombine_s16(filter_8_11, vdup_n_s16(0));
   const int8x16_t filter =
       vcombine_s8(vmovn_s16(filter_0_7), vmovn_s16(filter_8_15));
 
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(filter_0_7, 5) == 128) {
-    // Undo the horizontal offset in the calling function.
-    src += 5;
+  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
+  // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
+  // right shift by FILTER_BITS - instead of a first rounding right shift by
+  // ROUND0_BITS, followed by a second rounding right shift by FILTER_BITS -
+  // ROUND0_BITS.
+  const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
 
+  if (w <= 4) {
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
+      int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
+      int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
+      int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);
+
+      uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
+      uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
+
+      dst += 4 * dst_stride;
+      src += 4 * src_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
     do {
       const uint8_t *s = src;
       uint8_t *d = dst;
       int width = w;
 
       do {
-        uint8x8_t d0 = vld1_u8(s);
-        if (w == 4) {
-          store_u8_4x1(d, d0);
-        } else {
-          vst1_u8(d, d0);
-        }
+        uint8x16_t s0[2], s1[2], s2[2], s3[2];
+        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+        uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
+        uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
+        uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
+        uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
+
+        store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
 
         s += 8;
         d += 8;
         width -= 8;
-      } while (width > 0);
-      src += src_stride;
-      dst += dst_stride;
-    } while (--h != 0);
-  } else {
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
-    // This shim of 1 << (ROUND0_BITS - 1) enables us to use a single rounding
-    // right shift by FILTER_BITS - instead of a first rounding right shift by
-    // ROUND0_BITS, followed by second rounding right shift by FILTER_BITS -
-    // ROUND0_BITS.
-    const int32x4_t horiz_const = vdupq_n_s32(1 << (ROUND0_BITS - 1));
-
-    if (w <= 4) {
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        load_u8_16x4(src, src_stride, &s0, &s1, &s2, &s3);
-
-        int16x4_t d0 = convolve12_4_x(s0, filter, permute_tbl, horiz_const);
-        int16x4_t d1 = convolve12_4_x(s1, filter, permute_tbl, horiz_const);
-        int16x4_t d2 = convolve12_4_x(s2, filter, permute_tbl, horiz_const);
-        int16x4_t d3 = convolve12_4_x(s3, filter, permute_tbl, horiz_const);
-
-        uint8x8_t d01 = vqmovun_s16(vcombine_s16(d0, d1));
-        uint8x8_t d23 = vqmovun_s16(vcombine_s16(d2, d3));
-
-        store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d01);
-        store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d23);
-
-        dst += 4 * dst_stride;
-        src += 4 * src_stride;
-        h -= 4;
-      } while (h != 0);
-    } else {
-      do {
-        const uint8_t *s = src;
-        uint8_t *d = dst;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2], s1[2], s2[2], s3[2];
-          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
-          uint8x8_t d0 = convolve12_8_x(s0, filter, permute_tbl, horiz_const);
-          uint8x8_t d1 = convolve12_8_x(s1, filter, permute_tbl, horiz_const);
-          uint8x8_t d2 = convolve12_8_x(s2, filter, permute_tbl, horiz_const);
-          uint8x8_t d3 = convolve12_8_x(s3, filter, permute_tbl, horiz_const);
-
-          store_u8_8x4(d + 0 * dst_stride, dst_stride, d0, d1, d2, d3);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width != 0);
-        src += 4 * src_stride;
-        dst += 4 * dst_stride;
-        h -= 4;
-      } while (h != 0);
-    }
+      } while (width != 0);
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
   }
 }
 
@@ -470,187 +444,161 @@
                                                  uint8_t *dst_ptr,
                                                  int dst_stride, int w, int h,
                                                  const int16_t *y_filter_ptr) {
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (y_filter_ptr[5] == 128) {
-    // Undo the vertical offset in the calling function.
-    src_ptr += 5 * src_stride;
+  // The no-op filter should never be used here.
+  assert(y_filter_ptr[5] != 128);
+
+  const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
+  const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));
+
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
+
+  if (w == 4) {
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
+    load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
+                 &s8, &s9, &sA);
+    src_ptr += 11 * src_stride;
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+    transpose_concat_4x4(s4, s5, s6, s7, &s4567);
+    transpose_concat_4x4(s5, s6, s7, s8, &s5678);
+    transpose_concat_4x4(s6, s7, s8, s9, &s6789);
+    transpose_concat_4x4(s7, s8, s9, sA, &s789A);
 
     do {
+      uint8x8_t sB, sC, sD, sE;
+      load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE);
+
+      uint8x16_t s89AB, s9ABC, sABCD, sBCDE;
+      transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
+
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT = { { s789A, sBCDE } };
+      s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 =
+          convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
+      int16x4_t d1 =
+          convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
+      int16x4_t d2 =
+          convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
+      int16x4_t d3 =
+          convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s789A;
+      s4567 = s89AB;
+      s5678 = s9ABC;
+      s6789 = sABCD;
+      s789A = sBCDE;
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
       const uint8_t *s = src_ptr;
       uint8_t *d = dst_ptr;
-      int width = w;
 
-      do {
-        uint8x8_t d0 = vld1_u8(s);
-        if (w == 4) {
-          store_u8_4x1(d, d0);
-        } else {
-          vst1_u8(d, d0);
-        }
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  } else {
-    const int8x8_t filter_0_7 = vmovn_s16(vld1q_s16(y_filter_ptr));
-    const int8x8_t filter_4_11 = vmovn_s16(vld1q_s16(y_filter_ptr + 4));
-
-    const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
-
-    if (w == 4) {
       uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
-      load_u8_8x11(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7,
-                   &s8, &s9, &sA);
-      src_ptr += 11 * src_stride;
+      load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
+                   &s9, &sA);
+      s += 11 * src_stride;
 
-      // This operation combines a conventional transpose and the sample permute
-      // (see horizontal case) required before computing the dot product.
-      uint8x16_t s0123, s1234, s2345, s3456, s4567, s5678, s6789, s789A;
-      transpose_concat_4x4(s0, s1, s2, s3, &s0123);
-      transpose_concat_4x4(s1, s2, s3, s4, &s1234);
-      transpose_concat_4x4(s2, s3, s4, s5, &s2345);
-      transpose_concat_4x4(s3, s4, s5, s6, &s3456);
-      transpose_concat_4x4(s4, s5, s6, s7, &s4567);
-      transpose_concat_4x4(s5, s6, s7, s8, &s5678);
-      transpose_concat_4x4(s6, s7, s8, s9, &s6789);
-      transpose_concat_4x4(s7, s8, s9, sA, &s789A);
+      // This operation combines a conventional transpose and the sample
+      // permute (see horizontal case) required before computing the dot
+      // product.
+      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo,
+          s6789_hi, s789A_lo, s789A_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
+      transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
+      transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
+      transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
+      transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
 
       do {
         uint8x8_t sB, sC, sD, sE;
-        load_u8_8x4(src_ptr, src_stride, &sB, &sC, &sD, &sE);
+        load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE);
 
-        uint8x16_t s89AB, s9ABC, sABCD, sBCDE;
-        transpose_concat_4x4(sB, sC, sD, sE, &sBCDE);
+        uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
+            sBCDE_lo, sBCDE_hi;
+        transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
 
         // Merge new data into block from previous iteration.
-        uint8x16x2_t samples_LUT = { { s789A, sBCDE } };
-        s89AB = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s9ABC = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        sABCD = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+        uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
+        s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
+        s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
+        sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
 
-        int16x4_t d0 =
-            convolve12_4_y(s0123, s4567, s89AB, filter_0_7, filter_4_11);
-        int16x4_t d1 =
-            convolve12_4_y(s1234, s5678, s9ABC, filter_0_7, filter_4_11);
-        int16x4_t d2 =
-            convolve12_4_y(s2345, s6789, sABCD, filter_0_7, filter_4_11);
-        int16x4_t d3 =
-            convolve12_4_y(s3456, s789A, sBCDE, filter_0_7, filter_4_11);
-        uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-        uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+        uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
+        s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
+        s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
+        sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
 
-        store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
-        store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+        uint8x8_t d0 =
+            convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
+                           s89AB_hi, filter_0_7, filter_4_11);
+        uint8x8_t d1 =
+            convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
+                           s9ABC_hi, filter_0_7, filter_4_11);
+        uint8x8_t d2 =
+            convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
+                           sABCD_hi, filter_0_7, filter_4_11);
+        uint8x8_t d3 =
+            convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
+                           sBCDE_hi, filter_0_7, filter_4_11);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         // Prepare block for next iteration - re-using as much as possible.
         // Shuffle everything up four rows.
-        s0123 = s4567;
-        s1234 = s5678;
-        s2345 = s6789;
-        s3456 = s789A;
-        s4567 = s89AB;
-        s5678 = s9ABC;
-        s6789 = sABCD;
-        s789A = sBCDE;
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s789A_lo;
+        s3456_hi = s789A_hi;
+        s4567_lo = s89AB_lo;
+        s4567_hi = s89AB_hi;
+        s5678_lo = s9ABC_lo;
+        s5678_hi = s9ABC_hi;
+        s6789_lo = sABCD_lo;
+        s6789_hi = sABCD_hi;
+        s789A_lo = sBCDE_lo;
+        s789A_hi = sBCDE_hi;
 
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h != 0);
-    } else {
-      do {
-        int height = h;
-        const uint8_t *s = src_ptr;
-        uint8_t *d = dst_ptr;
-
-        uint8x8_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, sA;
-        load_u8_8x11(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7, &s8,
-                     &s9, &sA);
-        s += 11 * src_stride;
-
-        // This operation combines a conventional transpose and the sample
-        // permute (see horizontal case) required before computing the dot
-        // product.
-        uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-            s3456_lo, s3456_hi, s4567_lo, s4567_hi, s5678_lo, s5678_hi,
-            s6789_lo, s6789_hi, s789A_lo, s789A_hi;
-        transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
-        transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
-        transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
-        transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
-        transpose_concat_8x4(s4, s5, s6, s7, &s4567_lo, &s4567_hi);
-        transpose_concat_8x4(s5, s6, s7, s8, &s5678_lo, &s5678_hi);
-        transpose_concat_8x4(s6, s7, s8, s9, &s6789_lo, &s6789_hi);
-        transpose_concat_8x4(s7, s8, s9, sA, &s789A_lo, &s789A_hi);
-
-        do {
-          uint8x8_t sB, sC, sD, sE;
-          load_u8_8x4(s, src_stride, &sB, &sC, &sD, &sE);
-
-          uint8x16_t s89AB_lo, s89AB_hi, s9ABC_lo, s9ABC_hi, sABCD_lo, sABCD_hi,
-              sBCDE_lo, sBCDE_hi;
-          transpose_concat_8x4(sB, sC, sD, sE, &sBCDE_lo, &sBCDE_hi);
-
-          // Merge new data into block from previous iteration.
-          uint8x16x2_t samples_LUT_lo = { { s789A_lo, sBCDE_lo } };
-          s89AB_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
-          s9ABC_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
-          sABCD_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
-
-          uint8x16x2_t samples_LUT_hi = { { s789A_hi, sBCDE_hi } };
-          s89AB_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
-          s9ABC_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
-          sABCD_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
-
-          uint8x8_t d0 =
-              convolve12_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, s89AB_lo,
-                             s89AB_hi, filter_0_7, filter_4_11);
-          uint8x8_t d1 =
-              convolve12_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, s9ABC_lo,
-                             s9ABC_hi, filter_0_7, filter_4_11);
-          uint8x8_t d2 =
-              convolve12_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, sABCD_lo,
-                             sABCD_hi, filter_0_7, filter_4_11);
-          uint8x8_t d3 =
-              convolve12_8_y(s3456_lo, s3456_hi, s789A_lo, s789A_hi, sBCDE_lo,
-                             sBCDE_hi, filter_0_7, filter_4_11);
-
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          // Prepare block for next iteration - re-using as much as possible.
-          // Shuffle everything up four rows.
-          s0123_lo = s4567_lo;
-          s0123_hi = s4567_hi;
-          s1234_lo = s5678_lo;
-          s1234_hi = s5678_hi;
-          s2345_lo = s6789_lo;
-          s2345_hi = s6789_hi;
-          s3456_lo = s789A_lo;
-          s3456_hi = s789A_hi;
-          s4567_lo = s89AB_lo;
-          s4567_hi = s89AB_hi;
-          s5678_lo = s9ABC_lo;
-          s5678_hi = s9ABC_hi;
-          s6789_lo = sABCD_lo;
-          s6789_hi = sABCD_hi;
-          s789A_lo = sBCDE_lo;
-          s789A_hi = sBCDE_hi;
-
-          s += 4 * src_stride;
-          d += 4 * dst_stride;
-          height -= 4;
-        } while (height != 0);
-        src_ptr += 8;
-        dst_ptr += 8;
-        w -= 8;
-      } while (w != 0);
-    }
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
 
@@ -684,155 +632,126 @@
                                                 uint8_t *dst_ptr,
                                                 int dst_stride, int w, int h,
                                                 const int16_t *y_filter_ptr) {
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (y_filter_ptr[5] == 128) {
-    // Undo the vertical offset in the calling function.
-    src_ptr += 5 * src_stride;
+  const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
+
+  const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
+
+  if (w == 4) {
+    uint8x8_t s0, s1, s2, s3, s4, s5, s6;
+    load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+    src_ptr += 7 * src_stride;
+
+    // This operation combines a conventional transpose and the sample permute
+    // (see horizontal case) required before computing the dot product.
+    uint8x16_t s0123, s1234, s2345, s3456;
+    transpose_concat_4x4(s0, s1, s2, s3, &s0123);
+    transpose_concat_4x4(s1, s2, s3, s4, &s1234);
+    transpose_concat_4x4(s2, s3, s4, s5, &s2345);
+    transpose_concat_4x4(s3, s4, s5, s6, &s3456);
 
     do {
+      uint8x8_t s7, s8, s9, s10;
+      load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+
+      uint8x16_t s4567, s5678, s6789, s78910;
+      transpose_concat_4x4(s7, s8, s9, s10, &s78910);
+
+      // Merge new data into block from previous iteration.
+      uint8x16x2_t samples_LUT = { { s3456, s78910 } };
+      s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
+      s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
+      s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+
+      int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
+      int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
+      int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
+      int16x4_t d3 = convolve8_4_y(s3456, s78910, filter);
+      uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
+      uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+
+      store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
+      store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+
+      // Prepare block for next iteration - re-using as much as possible.
+      // Shuffle everything up four rows.
+      s0123 = s4567;
+      s1234 = s5678;
+      s2345 = s6789;
+      s3456 = s78910;
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h != 0);
+  } else {
+    do {
+      int height = h;
       const uint8_t *s = src_ptr;
       uint8_t *d = dst_ptr;
-      int width = w;
 
-      do {
-        uint8x8_t d0 = vld1_u8(s);
-        if (w == 4) {
-          store_u8_4x1(d, d0);
-        } else {
-          vst1_u8(d, d0);
-        }
-
-        s += 8;
-        d += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += dst_stride;
-    } while (--h != 0);
-  } else {
-    const int8x8_t filter = vmovn_s16(vld1q_s16(y_filter_ptr));
-
-    const uint8x16x3_t merge_block_tbl = vld1q_u8_x3(kDotProdMergeBlockTbl);
-
-    if (w == 4) {
       uint8x8_t s0, s1, s2, s3, s4, s5, s6;
-      load_u8_8x7(src_ptr, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-      src_ptr += 7 * src_stride;
+      load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
+      s += 7 * src_stride;
 
-      // This operation combines a conventional transpose and the sample permute
-      // (see horizontal case) required before computing the dot product.
-      uint8x16_t s0123, s1234, s2345, s3456;
-      transpose_concat_4x4(s0, s1, s2, s3, &s0123);
-      transpose_concat_4x4(s1, s2, s3, s4, &s1234);
-      transpose_concat_4x4(s2, s3, s4, s5, &s2345);
-      transpose_concat_4x4(s3, s4, s5, s6, &s3456);
+      // This operation combines a conventional transpose and the sample
+      // permute (see horizontal case) required before computing the dot
+      // product.
+      uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
+          s3456_lo, s3456_hi;
+      transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
+      transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
+      transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
+      transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
 
       do {
         uint8x8_t s7, s8, s9, s10;
-        load_u8_8x4(src_ptr, src_stride, &s7, &s8, &s9, &s10);
+        load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
 
-        uint8x16_t s4567, s5678, s6789, s78910;
-        transpose_concat_4x4(s7, s8, s9, s10, &s78910);
+        uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
+            s78910_lo, s78910_hi;
+        transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
 
         // Merge new data into block from previous iteration.
-        uint8x16x2_t samples_LUT = { { s3456, s78910 } };
-        s4567 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[0]);
-        s5678 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[1]);
-        s6789 = vqtbl2q_u8(samples_LUT, merge_block_tbl.val[2]);
+        uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
+        s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
+        s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
+        s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
 
-        int16x4_t d0 = convolve8_4_y(s0123, s4567, filter);
-        int16x4_t d1 = convolve8_4_y(s1234, s5678, filter);
-        int16x4_t d2 = convolve8_4_y(s2345, s6789, filter);
-        int16x4_t d3 = convolve8_4_y(s3456, s78910, filter);
-        uint8x8_t d01 = vqrshrun_n_s16(vcombine_s16(d0, d1), FILTER_BITS);
-        uint8x8_t d23 = vqrshrun_n_s16(vcombine_s16(d2, d3), FILTER_BITS);
+        uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
+        s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
+        s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
+        s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
 
-        store_u8x4_strided_x2(dst_ptr + 0 * dst_stride, dst_stride, d01);
-        store_u8x4_strided_x2(dst_ptr + 2 * dst_stride, dst_stride, d23);
+        uint8x8_t d0 =
+            convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
+        uint8x8_t d1 =
+            convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
+        uint8x8_t d2 =
+            convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
+        uint8x8_t d3 =
+            convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
+
+        store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
 
         // Prepare block for next iteration - re-using as much as possible.
         // Shuffle everything up four rows.
-        s0123 = s4567;
-        s1234 = s5678;
-        s2345 = s6789;
-        s3456 = s78910;
+        s0123_lo = s4567_lo;
+        s0123_hi = s4567_hi;
+        s1234_lo = s5678_lo;
+        s1234_hi = s5678_hi;
+        s2345_lo = s6789_lo;
+        s2345_hi = s6789_hi;
+        s3456_lo = s78910_lo;
+        s3456_hi = s78910_hi;
 
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h != 0);
-    } else {
-      do {
-        int height = h;
-        const uint8_t *s = src_ptr;
-        uint8_t *d = dst_ptr;
-
-        uint8x8_t s0, s1, s2, s3, s4, s5, s6;
-        load_u8_8x7(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6);
-        s += 7 * src_stride;
-
-        // This operation combines a conventional transpose and the sample
-        // permute (see horizontal case) required before computing the dot
-        // product.
-        uint8x16_t s0123_lo, s0123_hi, s1234_lo, s1234_hi, s2345_lo, s2345_hi,
-            s3456_lo, s3456_hi;
-        transpose_concat_8x4(s0, s1, s2, s3, &s0123_lo, &s0123_hi);
-        transpose_concat_8x4(s1, s2, s3, s4, &s1234_lo, &s1234_hi);
-        transpose_concat_8x4(s2, s3, s4, s5, &s2345_lo, &s2345_hi);
-        transpose_concat_8x4(s3, s4, s5, s6, &s3456_lo, &s3456_hi);
-
-        do {
-          uint8x8_t s7, s8, s9, s10;
-          load_u8_8x4(s, src_stride, &s7, &s8, &s9, &s10);
-
-          uint8x16_t s4567_lo, s4567_hi, s5678_lo, s5678_hi, s6789_lo, s6789_hi,
-              s78910_lo, s78910_hi;
-          transpose_concat_8x4(s7, s8, s9, s10, &s78910_lo, &s78910_hi);
-
-          // Merge new data into block from previous iteration.
-          uint8x16x2_t samples_LUT_lo = { { s3456_lo, s78910_lo } };
-          s4567_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[0]);
-          s5678_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[1]);
-          s6789_lo = vqtbl2q_u8(samples_LUT_lo, merge_block_tbl.val[2]);
-
-          uint8x16x2_t samples_LUT_hi = { { s3456_hi, s78910_hi } };
-          s4567_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[0]);
-          s5678_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[1]);
-          s6789_hi = vqtbl2q_u8(samples_LUT_hi, merge_block_tbl.val[2]);
-
-          uint8x8_t d0 =
-              convolve8_8_y(s0123_lo, s0123_hi, s4567_lo, s4567_hi, filter);
-          uint8x8_t d1 =
-              convolve8_8_y(s1234_lo, s1234_hi, s5678_lo, s5678_hi, filter);
-          uint8x8_t d2 =
-              convolve8_8_y(s2345_lo, s2345_hi, s6789_lo, s6789_hi, filter);
-          uint8x8_t d3 =
-              convolve8_8_y(s3456_lo, s3456_hi, s78910_lo, s78910_hi, filter);
-
-          store_u8_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          // Prepare block for next iteration - re-using as much as possible.
-          // Shuffle everything up four rows.
-          s0123_lo = s4567_lo;
-          s0123_hi = s4567_hi;
-          s1234_lo = s5678_lo;
-          s1234_hi = s5678_hi;
-          s2345_lo = s6789_lo;
-          s2345_hi = s6789_hi;
-          s3456_lo = s78910_lo;
-          s3456_hi = s78910_hi;
-
-          s += 4 * src_stride;
-          d += 4 * dst_stride;
-          height -= 4;
-        } while (height != 0);
-        src_ptr += 8;
-        dst_ptr += 8;
-        w -= 8;
-      } while (w != 0);
-    }
+        s += 4 * src_stride;
+        d += 4 * dst_stride;
+        height -= 4;
+      } while (height != 0);
+      src_ptr += 8;
+      dst_ptr += 8;
+      w -= 8;
+    } while (w != 0);
   }
 }
 
@@ -928,15 +847,80 @@
     const uint8_t *src_ptr, int src_stride, int16_t *dst_ptr,
     const int dst_stride, int w, int h, const int16x8_t x_filter_0_7,
     const int16x4_t x_filter_8_11) {
+  // The no-op filter should never be used here.
+  assert(vgetq_lane_s16(x_filter_0_7, 5) != 128);
+
   const int bd = 8;
 
-  // Special case the following no-op filter as 128 won't fit into the
-  // 8-bit signed dot-product instruction:
-  // { 0, 0, 0, 0, 0, 128, 0, 0, 0, 0, 0, 0 }
-  if (vgetq_lane_s16(x_filter_0_7, 5) == 128) {
-    const uint16x8_t horiz_const = vdupq_n_u16((1 << (bd - 1)));
-    // Undo the horizontal offset in the calling function.
-    src_ptr += 5;
+  // Narrow filter values to 8-bit.
+  const int16x8x2_t x_filter_s16 = {
+    { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
+  };
+  const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
+                                         vmovn_s16(x_filter_s16.val[1]));
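+  // Narrowing with vmovn_s16 cannot overflow here: with the 128 no-op tap
+  // ruled out by the assert above, every filter tap fits in an int8_t.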
+  // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding
+  // shifts, which are generally faster than rounding shifts on modern CPUs.
+  const int32x4_t horiz_const =
+      vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
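+  // A sketch of the equivalence relied on here (illustrative, not part of
+  // this file): for an int32x4_t sum and a shift amount n,
+  //   vrshrq_n_s32(sum, n)
+  //     == vshrq_n_s32(vaddq_s32(sum, vdupq_n_s32(1 << (n - 1))), n)
+  // so folding 1 << (ROUND0_BITS - 1) into horiz_const lets the convolution
+  // helpers finish with a plain truncating shift.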
+  const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
+
+  if (w <= 4) {
+    do {
+      uint8x16_t s0, s1, s2, s3;
+      load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
+
+      int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+      int16x4_t d1 = convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
+      int16x4_t d2 = convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
+      int16x4_t d3 = convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+      store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
+
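+    // The height handed to this 2D helper is the intermediate height
+    // (h + vertical filter taps - 1), so it is generally not a multiple of
+    // 4; the loop below mops up the remaining 1-4 rows one at a time.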
+    do {
+      uint8x16_t s0 = vld1q_u8(src_ptr);
+      int16x4_t d0 = convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
+      vst1_s16(dst_ptr, d0);
+
+      src_ptr += src_stride;
+      dst_ptr += dst_stride;
+    } while (--h != 0);
+
+  } else {
+    do {
+      const uint8_t *s = src_ptr;
+      int16_t *d = dst_ptr;
+      int width = w;
+
+      do {
+        uint8x16_t s0[2], s1[2], s2[2], s3[2];
+        load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
+        load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
+
+        int16x8_t d0 =
+            convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        int16x8_t d1 =
+            convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
+        int16x8_t d2 =
+            convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
+        int16x8_t d3 =
+            convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
+
+        store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
+
+        s += 8;
+        d += 8;
+        width -= 8;
+      } while (width != 0);
+
+      src_ptr += 4 * src_stride;
+      dst_ptr += 4 * dst_stride;
+      h -= 4;
+    } while (h > 4);
 
     do {
       const uint8_t *s = src_ptr;
@@ -944,119 +928,20 @@
       int width = w;
 
       do {
-        uint8x8_t s0 = vld1_u8(s);
-        uint16x8_t d0 = vaddw_u8(horiz_const, s0);
-        d0 = vshlq_n_u16(d0, FILTER_BITS - ROUND0_BITS);
-        // Store 8 elements to avoid additional branches. This is safe if the
-        // actual block width is < 8 because the intermediate buffer is large
-        // enough to accommodate 128x128 blocks.
-        vst1q_s16(d, vreinterpretq_s16_u16(d0));
+        uint8x16_t s0[2];
+        s0[0] = vld1q_u8(s);
+        s0[1] = vld1q_u8(s + 4);
+        int16x8_t d0 =
+            convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
+        vst1q_s16(d, d0);
 
-        d += 8;
         s += 8;
+        d += 8;
         width -= 8;
-      } while (width > 0);
+      } while (width != 0);
       src_ptr += src_stride;
       dst_ptr += dst_stride;
     } while (--h != 0);
-
-  } else {
-    // Narrow filter values to 8-bit.
-    const int16x8x2_t x_filter_s16 = {
-      { x_filter_0_7, vcombine_s16(x_filter_8_11, vdup_n_s16(0)) }
-    };
-    const int8x16_t x_filter = vcombine_s8(vmovn_s16(x_filter_s16.val[0]),
-                                           vmovn_s16(x_filter_s16.val[1]));
-    // This shim of 1 << (ROUND0_BITS - 1) enables us to use non-rounding shifts
-    // - which are generally faster than rounding shifts on modern CPUs.
-    const int32x4_t horiz_const =
-        vdupq_n_s32((1 << (bd + FILTER_BITS - 1)) + (1 << (ROUND0_BITS - 1)));
-    const uint8x16x3_t permute_tbl = vld1q_u8_x3(kDotProdPermuteTbl);
-
-    if (w <= 4) {
-      do {
-        uint8x16_t s0, s1, s2, s3;
-        load_u8_16x4(src_ptr, src_stride, &s0, &s1, &s2, &s3);
-
-        int16x4_t d0 =
-            convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
-        int16x4_t d1 =
-            convolve12_4_2d_h(s1, x_filter, permute_tbl, horiz_const);
-        int16x4_t d2 =
-            convolve12_4_2d_h(s2, x_filter, permute_tbl, horiz_const);
-        int16x4_t d3 =
-            convolve12_4_2d_h(s3, x_filter, permute_tbl, horiz_const);
-
-        store_s16_4x4(dst_ptr, dst_stride, d0, d1, d2, d3);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h > 4);
-
-      do {
-        uint8x16_t s0 = vld1q_u8(src_ptr);
-        int16x4_t d0 =
-            convolve12_4_2d_h(s0, x_filter, permute_tbl, horiz_const);
-        vst1_s16(dst_ptr, d0);
-
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      } while (--h != 0);
-
-    } else {
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2], s1[2], s2[2], s3[2];
-          load_u8_16x4(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0]);
-          load_u8_16x4(s + 4, src_stride, &s0[1], &s1[1], &s2[1], &s3[1]);
-
-          int16x8_t d0 =
-              convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
-          int16x8_t d1 =
-              convolve12_8_2d_h(s1, x_filter, permute_tbl, horiz_const);
-          int16x8_t d2 =
-              convolve12_8_2d_h(s2, x_filter, permute_tbl, horiz_const);
-          int16x8_t d3 =
-              convolve12_8_2d_h(s3, x_filter, permute_tbl, horiz_const);
-
-          store_s16_8x4(d, dst_stride, d0, d1, d2, d3);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width != 0);
-
-        src_ptr += 4 * src_stride;
-        dst_ptr += 4 * dst_stride;
-        h -= 4;
-      } while (h > 4);
-
-      do {
-        const uint8_t *s = src_ptr;
-        int16_t *d = dst_ptr;
-        int width = w;
-
-        do {
-          uint8x16_t s0[2];
-          s0[0] = vld1q_u8(s);
-          s0[1] = vld1q_u8(s + 4);
-          int16x8_t d0 =
-              convolve12_8_2d_h(s0, x_filter, permute_tbl, horiz_const);
-          vst1q_s16(d, d0);
-
-          s += 8;
-          d += 8;
-          width -= 8;
-        } while (width != 0);
-        src_ptr += src_stride;
-        dst_ptr += dst_stride;
-      } while (--h != 0);
-    }
   }
 }