Clean up z2 predictor Neon implementation

A few small cleanup changes:

* Clean up a bunch of unnecessary up-front variable declarations,
  declaring variables at their point of first use instead.

* Rework the indices used by the existing lane-index load code so that
  we can use widening subtract/multiply-accumulate instructions (see
  the first sketch below).

* Refactor ORR(AND(a, b), BIC(c, b)) => BSL(b, a, c), matching a
  similar change already made for the z1 predictor (see the second
  sketch below).
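
The lane-index rework in a nutshell: the source bytes are now loaded
into consecutive lanes of a uint8x8_t instead of every other lane of a
pre-widened vector, so vsubl_u8/vmlal_u8 can fold the u8 -> u16
widening into the arithmetic. A minimal sketch (not part of the patch;
the helper name and parameters are made up for illustration):

  #include <arm_neon.h>

  // Per element: (a0 * 32 + 16 + (a1 - a0) * shift) >> 5, with the
  // u8 -> u16 widening folded into the subtract and the
  // multiply-accumulate.
  static inline uint8x8_t blend_sketch(uint8x8_t a0, uint8x8_t a1,
                                       uint16x8_t shift) {
    uint16x8_t diff = vsubl_u8(a1, a0);                // a[x+1] - a[x]
    uint16x8_t a32 =
        vmlal_u8(vdupq_n_u16(16), a0, vdup_n_u8(32));  // a[x] * 32 + 16
    uint16x8_t res = vmlaq_u16(a32, diff, shift);
    return vshrn_n_u16(res, 5);
  }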

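The BSL rewrite collapses the three-instruction mask/merge sequence
into a single bitwise-select. A minimal sketch (not part of the patch;
the names are placeholders):

  #include <arm_neon.h>

  // Select resy bytes where mask bits are set, resx bytes where clear.
  static inline uint8x8_t select_sketch(uint8x8_t mask, uint8x8_t resy,
                                        uint8x8_t resx) {
    // Before: return vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
    return vbsl_u8(mask, resy, resx);
  }
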
Change-Id: Ie27da938e9f44bfa23dc1ebabf5ce886eb20de4c
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index ba17f8a..52daddf 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1527,11 +1527,9 @@
   int16x4_t dy64 = vdup_n_s16(dy);
   int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
   int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
-  int16x4_t v_one = vdup_lane_s16(v_1234, 0);
 
   for (int r = 0; r < N; r++) {
     uint16x8_t res, shift;
-    uint16x4_t ydx;
     uint8x8_t resx, resy;
     uint16x4x2_t v_shift;
     v_shift.val[1] = vdup_n_u16(0);
@@ -1555,7 +1553,7 @@
       v_shift.val[0] = vreinterpret_u16_u8(v_zero_u8);
       v_shift.val[1] = vreinterpret_u16_u8(v_zero_u8);
     } else {
-      ydx = vdup_n_u16(y * dx);
+      uint16x4_t ydx = vdup_n_u16(y * dx);
 
       if (upsample_above) {
         uint8x8x2_t v_tmp;
@@ -1578,25 +1576,26 @@
     }
 
     // y calc
-    uint8x8_t a0_y, a1_y;
     if (base_x < min_base_x) {
-      DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
       int16x4_t v_r6 = vdup_n_s16(r << 6);
       int16x4_t y_c64 = vmls_s16(v_r6, v_1234, dy64);
       int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
       uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
 
       base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
+
+      DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
+
       vst1_s16(base_y_c, base_y_c64);
-      a0_y = v_zero_u8;
+      uint8x8_t a0_y = vdup_n_u8(0);
       a0_y = vld1_lane_u8(left + base_y_c[0], a0_y, 0);
       a0_y = vld1_lane_u8(left + base_y_c[1], a0_y, 2);
       a0_y = vld1_lane_u8(left + base_y_c[2], a0_y, 4);
       a0_y = vld1_lane_u8(left + base_y_c[3], a0_y, 6);
 
-      base_y_c64 = vadd_s16(base_y_c64, v_one);
+      base_y_c64 = vadd_s16(base_y_c64, vdup_n_s16(1));
       vst1_s16(base_y_c, base_y_c64);
-      a1_y = v_zero_u8;
+      uint8x8_t a1_y = vdup_n_u8(0);
       a1_y = vld1_lane_u8(left + base_y_c[0], a1_y, 0);
       a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
       a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
@@ -1623,7 +1622,7 @@
     resy = vext_u8(resx, v_zero_u8, 4);
 
     uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
-    uint8x8_t v_resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
+    uint8x8_t v_resxy = vbsl_u8(mask, resy, resx);
     vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(v_resxy), 0);
 
     dst += stride;
@@ -1667,27 +1666,21 @@
   //   above[x+1] - above[x]
   // final pixels will be calculated as:
   //   (above[x] * 32 + 16 + (above[x+1] - above[x]) * shift) >> 5
-  uint8x16x2_t a0_x, a1_x;
   uint16x8x2_t diff, a32;
-  uint16x8_t c1234, a16, c3f;
-  uint8x16_t a0_x128, a1_x128;
-  int16x8_t min_base_y128, dy128;
-  uint16x8_t v_32 = vdupq_n_u16(32);
   uint8x16_t v_zero = vdupq_n_u8(0);
   int16x8_t v_upsample_left = vdupq_n_s16(upsample_left);
   int16x8_t v_upsample_above = vdupq_n_s16(upsample_above);
   int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
 
-  a16 = vdupq_n_u16(16);
-  c3f = vdupq_n_u16(0x3f);
-  min_base_y128 = vdupq_n_s16(min_base_y);
-  dy128 = vdupq_n_s16(dy);
-  c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
-                       vcreate_u16(0x0008000700060005));
+  uint16x8_t a16 = vdupq_n_u16(16);
+  uint16x8_t c3f = vdupq_n_u16(0x3f);
+  int16x8_t min_base_y128 = vdupq_n_s16(min_base_y);
+  int16x8_t dy128 = vdupq_n_s16(dy);
+  uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
+                                  vcreate_u16(0x0008000700060005));
 
   for (int r = 0; r < N; r++) {
     uint8x8_t resx, resy, resxy;
-    uint16x8_t r6, ydx;
     uint16x8x2_t res, shift;
     shift.val[1] = vdupq_n_u16(0);
 
@@ -1705,16 +1698,16 @@
       if (base_min_diff < 0) base_min_diff = 0;
     }
 
+    uint8x8_t a0_x0, a1_x0;
     if (base_shift > 7) {
-      a0_x.val[0] = v_zero;
-      a0_x.val[1] = v_zero;
-      a1_x.val[0] = v_zero;
-      a1_x.val[1] = v_zero;
+      a0_x0 = vdup_n_u8(0);
+      a1_x0 = vdup_n_u8(0);
       shift.val[0] = vreinterpretq_u16_u8(v_zero);
       shift.val[1] = vreinterpretq_u16_u8(v_zero);
     } else {
-      ydx = vdupq_n_u16(y * dx);
-      r6 = vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
+      uint16x8_t ydx = vdupq_n_u16(y * dx);
+      uint16x8_t r6 =
+          vshlq_n_u16(vextq_u16(c1234, vreinterpretq_u16_u8(v_zero), 2), 6);
 
       if (upsample_above) {
         uint8x8x2_t v_tmp;
@@ -1724,32 +1717,27 @@
         uint8x8_t v_index_high = vld1_u8(EvenOddMaskx[base_shift] + 8);
         shift.val[0] = vshrq_n_u16(
             vandq_u16(vshlq_u16(vsubq_u16(r6, ydx), v_upsample_above), c3f), 1);
-        a0_x.val[0] =
-            vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_low)));
-        a1_x.val[0] =
-            vreinterpretq_u8_u16(vmovl_u8(vtbl2_u8(v_tmp, v_index_high)));
+        a0_x0 = vtbl2_u8(v_tmp, v_index_low);
+        a1_x0 = vtbl2_u8(v_tmp, v_index_high);
       } else {
+        uint8x16_t a0_x128, a1_x128;
         a0_x128 = vld1q_u8(above + base_x + base_shift);
         a1_x128 = vextq_u8(a0_x128, v_zero, 1);
         vector_shuffle(&a0_x128, &v_zero, base_shift);
         vector_shuffle(&a1_x128, &v_zero, base_shift);
         shift.val[0] = vshrq_n_u16(vandq_u16(vsubq_u16(r6, ydx), c3f), 1);
-        a0_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a0_x128)));
-        a1_x.val[0] = vreinterpretq_u8_u16(vmovl_u8(vget_low_u8(a1_x128)));
+        a0_x0 = vget_low_u8(a0_x128);
+        a1_x0 = vget_low_u8(a1_x128);
       }
     }
 
-    diff.val[0] =
-        vsubq_u16(vreinterpretq_u16_u8(a1_x.val[0]),
-                  vreinterpretq_u16_u8(a0_x.val[0]));  // a[x+1] - a[x]
-    a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[0]),
-                           v_32);  // a[x] * 32 + 16
+    diff.val[0] = vsubl_u8(a1_x0, a0_x0);              // a[x+1] - a[x]
+    a32.val[0] = vmlal_u8(a16, a0_x0, vdup_n_u8(32));  // a[x] * 32 + 16
     res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shift.val[0]);
     resx = vshrn_n_u16(res.val[0], 5);
 
     // y calc
     if (base_x < min_base_x) {
-      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
       int16x8_t y_c128, base_y_c128;
       uint16x8_t mask128;
       int16x8_t v_r6 = vdupq_n_s16(r << 6);
@@ -1759,29 +1747,31 @@
       mask128 = vcgtq_s16(min_base_y128, base_y_c128);
 
       base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
-      vst1q_s16(base_y_c, base_y_c128);
-      a0_x.val[1] = v_zero;
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a0_x.val[1], 0);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a0_x.val[1], 2);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a0_x.val[1], 4);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a0_x.val[1], 6);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a0_x.val[1], 8);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a0_x.val[1], 10);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a0_x.val[1], 12);
-      a0_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a0_x.val[1], 14);
 
-      base_y_c128 =
-          vaddq_s16(base_y_c128, vreinterpretq_s16_u16(vshrq_n_u16(a16, 4)));
+      DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+
       vst1q_s16(base_y_c, base_y_c128);
-      a1_x.val[1] = v_zero;
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[0], a1_x.val[1], 0);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[1], a1_x.val[1], 2);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[2], a1_x.val[1], 4);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[3], a1_x.val[1], 6);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[4], a1_x.val[1], 8);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[5], a1_x.val[1], 10);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[6], a1_x.val[1], 12);
-      a1_x.val[1] = vld1q_lane_u8(left + base_y_c[7], a1_x.val[1], 14);
+      uint8x8_t a0_x1 = vdup_n_u8(0);
+      a0_x1 = vld1_lane_u8(left + base_y_c[0], a0_x1, 0);
+      a0_x1 = vld1_lane_u8(left + base_y_c[1], a0_x1, 1);
+      a0_x1 = vld1_lane_u8(left + base_y_c[2], a0_x1, 2);
+      a0_x1 = vld1_lane_u8(left + base_y_c[3], a0_x1, 3);
+      a0_x1 = vld1_lane_u8(left + base_y_c[4], a0_x1, 4);
+      a0_x1 = vld1_lane_u8(left + base_y_c[5], a0_x1, 5);
+      a0_x1 = vld1_lane_u8(left + base_y_c[6], a0_x1, 6);
+      a0_x1 = vld1_lane_u8(left + base_y_c[7], a0_x1, 7);
+
+      base_y_c128 = vaddq_s16(base_y_c128, vdupq_n_s16(1));
+      vst1q_s16(base_y_c, base_y_c128);
+      uint8x8_t a1_x1 = vdup_n_u8(0);
+      a1_x1 = vld1_lane_u8(left + base_y_c[0], a1_x1, 0);
+      a1_x1 = vld1_lane_u8(left + base_y_c[1], a1_x1, 1);
+      a1_x1 = vld1_lane_u8(left + base_y_c[2], a1_x1, 2);
+      a1_x1 = vld1_lane_u8(left + base_y_c[3], a1_x1, 3);
+      a1_x1 = vld1_lane_u8(left + base_y_c[4], a1_x1, 4);
+      a1_x1 = vld1_lane_u8(left + base_y_c[5], a1_x1, 5);
+      a1_x1 = vld1_lane_u8(left + base_y_c[6], a1_x1, 6);
+      a1_x1 = vld1_lane_u8(left + base_y_c[7], a1_x1, 7);
 
       if (upsample_left) {
         shift.val[1] = vshrq_n_u16(
@@ -1792,15 +1782,13 @@
         shift.val[1] =
             vshrq_n_u16(vandq_u16(vreinterpretq_u16_s16(y_c128), c3f), 1);
       }
-      diff.val[1] =
-          vsubq_u16(vreinterpretq_u16_u8(a1_x.val[1]),
-                    vreinterpretq_u16_u8(a0_x.val[1]));  // a[x+1] - a[x]
-      a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_x.val[1]),
-                             v_32);  // a[x] * 32 + 16
+
+      diff.val[1] = vsubl_u8(a1_x1, a0_x1);
+      a32.val[1] = vmlal_u8(a16, a0_x1, vdup_n_u8(32));
       res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shift.val[1]);
       resy = vshrn_n_u16(res.val[1], 5);
       uint8x8_t mask = vld1_u8(BaseMask[base_min_diff]);
-      resxy = vorr_u8(vand_u8(mask, resy), vbic_u8(resx, mask));
+      resxy = vbsl_u8(mask, resy, resx);
       vst1_u8(dst, resxy);
     } else {
       vst1_u8(dst, resx);
@@ -1823,22 +1811,17 @@
   const int frac_bits_x = 6;
   const int frac_bits_y = 6;
 
-  uint16x8_t a16, c1, c3f;
-  int16x8_t min_base_y256, dy256;
   uint16x8x2_t a32, c0123, c1234, diff, shifty;
-  uint8x16x2_t a0_x, a1_x, a0_y, a1_y;
-  uint8x16_t a0_x128, a1_x128;
+  uint8x16x2_t a0_x, a1_x;
   uint16x8_t v_32 = vdupq_n_u16(32);
   uint8x16_t v_zero = vdupq_n_u8(0);
   int16x8_t v_frac_bits_y = vdupq_n_s16(-frac_bits_y);
 
-  DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
-
-  a16 = vdupq_n_u16(16);
-  c1 = vshrq_n_u16(a16, 4);
-  min_base_y256 = vdupq_n_s16(min_base_y);
-  c3f = vdupq_n_u16(0x3f);
-  dy256 = vdupq_n_s16(dy);
+  uint16x8_t a16 = vdupq_n_u16(16);
+  uint16x8_t c1 = vshrq_n_u16(a16, 4);
+  int16x8_t min_base_y256 = vdupq_n_s16(min_base_y);
+  uint16x8_t c3f = vdupq_n_u16(0x3f);
+  int16x8_t dy256 = vdupq_n_s16(dy);
   c0123.val[0] = vcombine_u16(vcreate_u16(0x0003000200010000),
                               vcreate_u16(0x0007000600050004));
   c0123.val[1] = vcombine_u16(vcreate_u16(0x000B000A00090008),
@@ -1848,10 +1831,10 @@
 
   for (int r = 0; r < H; r++) {
     uint16x8x2_t res, r6, shift;
-    uint16x8_t ydx, j256;
+    uint16x8_t j256;
     uint8x16_t resx, resy, resxy;
     int y = r + 1;
-    ydx = vdupq_n_u16((uint16_t)(y * dx));
+    uint16x8_t ydx = vdupq_n_u16((uint16_t)(y * dx));
 
     int base_x = (-y * dx) >> frac_bits_x;
     for (int j = 0; j < W; j += 16) {
@@ -1869,6 +1852,7 @@
       }
 
       if (base_shift < 16) {
+        uint8x16_t a0_x128, a1_x128;
         a0_x128 = vld1q_u8(above + base_x + base_shift + j);
         a1_x128 = vld1q_u8(above + base_x + base_shift + 1 + j);
         vector_shuffle(&a0_x128, &v_zero, base_shift);
@@ -1923,19 +1907,20 @@
         mask256.val[0] = vcgtq_s16(min_base_y256, base_y_c256.val[0]);
         mask256.val[1] = vcgtq_s16(min_base_y256, base_y_c256.val[1]);
 
-        base_y_c256.val[0] = vorrq_s16(
-            vandq_s16(vreinterpretq_s16_u16(mask256.val[0]), min_base_y256),
-            vbicq_s16(base_y_c256.val[0],
-                      vreinterpretq_s16_u16(mask256.val[0])));
-        base_y_c256.val[1] = vorrq_s16(
-            vandq_s16(vreinterpretq_s16_u16(mask256.val[1]), min_base_y256),
-            vbicq_s16(base_y_c256.val[1],
-                      vreinterpretq_s16_u16(mask256.val[1])));
+        base_y_c256.val[0] =
+            vbslq_s16(mask256.val[0], min_base_y256, base_y_c256.val[0]);
+        base_y_c256.val[1] =
+            vbslq_s16(mask256.val[1], min_base_y256, base_y_c256.val[1]);
 
         int16_t min_y = vgetq_lane_s16(base_y_c256.val[1], 7);
         int16_t max_y = vgetq_lane_s16(base_y_c256.val[0], 0);
         int16_t offset_diff = max_y - min_y;
 
+        uint8x8_t a0_y0;
+        uint8x8_t a0_y1;
+        uint8x8_t a1_y0;
+        uint8x8_t a1_y1;
+
         if (offset_diff < 16) {
           assert(offset_diff >= 0);
           int16x8_t min_y256 =
@@ -1976,73 +1961,74 @@
           v_res.val[1] = vtbl2_u8(v_tmp, v_index_high);
           a1_y128 = vcombine_u8(v_res.val[0], v_res.val[1]);
 #endif
-          a0_y = vzipq_u8(a0_y128, v_zero);
-          a1_y = vzipq_u8(a1_y128, v_zero);
+          a0_y0 = vget_low_u8(a0_y128);
+          a0_y1 = vget_high_u8(a0_y128);
+          a1_y0 = vget_low_u8(a1_y128);
+          a1_y1 = vget_high_u8(a1_y128);
         } else {
           base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
                                          vreinterpretq_s16_u16(mask256.val[0]));
           base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
                                          vreinterpretq_s16_u16(mask256.val[1]));
+
+          DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
+
           vst1q_s16(base_y_c, base_y_c256.val[0]);
           vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
-          a0_y.val[0] = v_zero;
-          a0_y.val[1] = v_zero;
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a0_y.val[0], 0);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a0_y.val[0], 2);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a0_y.val[0], 4);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a0_y.val[0], 6);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a0_y.val[0], 8);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a0_y.val[0], 10);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a0_y.val[0], 12);
-          a0_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a0_y.val[0], 14);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a0_y.val[1], 0);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a0_y.val[1], 2);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a0_y.val[1], 4);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a0_y.val[1], 6);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a0_y.val[1], 8);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a0_y.val[1], 10);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a0_y.val[1], 12);
-          a0_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a0_y.val[1], 14);
+          a0_y0 = vdup_n_u8(0);
+          a0_y0 = vld1_lane_u8(left + base_y_c[0], a0_y0, 0);
+          a0_y0 = vld1_lane_u8(left + base_y_c[1], a0_y0, 1);
+          a0_y0 = vld1_lane_u8(left + base_y_c[2], a0_y0, 2);
+          a0_y0 = vld1_lane_u8(left + base_y_c[3], a0_y0, 3);
+          a0_y0 = vld1_lane_u8(left + base_y_c[4], a0_y0, 4);
+          a0_y0 = vld1_lane_u8(left + base_y_c[5], a0_y0, 5);
+          a0_y0 = vld1_lane_u8(left + base_y_c[6], a0_y0, 6);
+          a0_y0 = vld1_lane_u8(left + base_y_c[7], a0_y0, 7);
+          a0_y1 = vdup_n_u8(0);
+          a0_y1 = vld1_lane_u8(left + base_y_c[8], a0_y1, 0);
+          a0_y1 = vld1_lane_u8(left + base_y_c[9], a0_y1, 1);
+          a0_y1 = vld1_lane_u8(left + base_y_c[10], a0_y1, 2);
+          a0_y1 = vld1_lane_u8(left + base_y_c[11], a0_y1, 3);
+          a0_y1 = vld1_lane_u8(left + base_y_c[12], a0_y1, 4);
+          a0_y1 = vld1_lane_u8(left + base_y_c[13], a0_y1, 5);
+          a0_y1 = vld1_lane_u8(left + base_y_c[14], a0_y1, 6);
+          a0_y1 = vld1_lane_u8(left + base_y_c[15], a0_y1, 7);
 
           base_y_c256.val[0] =
               vaddq_s16(base_y_c256.val[0], vreinterpretq_s16_u16(c1));
           base_y_c256.val[1] =
               vaddq_s16(base_y_c256.val[1], vreinterpretq_s16_u16(c1));
+
           vst1q_s16(base_y_c, base_y_c256.val[0]);
           vst1q_s16(base_y_c + 8, base_y_c256.val[1]);
-          a1_y.val[0] = v_zero;
-          a1_y.val[1] = v_zero;
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[0], a1_y.val[0], 0);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[1], a1_y.val[0], 2);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[2], a1_y.val[0], 4);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[3], a1_y.val[0], 6);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[4], a1_y.val[0], 8);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[5], a1_y.val[0], 10);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[6], a1_y.val[0], 12);
-          a1_y.val[0] = vld1q_lane_u8(left + base_y_c[7], a1_y.val[0], 14);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[8], a1_y.val[1], 0);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[9], a1_y.val[1], 2);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[10], a1_y.val[1], 4);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[11], a1_y.val[1], 6);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[12], a1_y.val[1], 8);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[13], a1_y.val[1], 10);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[14], a1_y.val[1], 12);
-          a1_y.val[1] = vld1q_lane_u8(left + base_y_c[15], a1_y.val[1], 14);
+          a1_y0 = vdup_n_u8(0);
+          a1_y0 = vld1_lane_u8(left + base_y_c[0], a1_y0, 0);
+          a1_y0 = vld1_lane_u8(left + base_y_c[1], a1_y0, 1);
+          a1_y0 = vld1_lane_u8(left + base_y_c[2], a1_y0, 2);
+          a1_y0 = vld1_lane_u8(left + base_y_c[3], a1_y0, 3);
+          a1_y0 = vld1_lane_u8(left + base_y_c[4], a1_y0, 4);
+          a1_y0 = vld1_lane_u8(left + base_y_c[5], a1_y0, 5);
+          a1_y0 = vld1_lane_u8(left + base_y_c[6], a1_y0, 6);
+          a1_y0 = vld1_lane_u8(left + base_y_c[7], a1_y0, 7);
+          a1_y1 = vdup_n_u8(0);
+          a1_y1 = vld1_lane_u8(left + base_y_c[8], a1_y1, 0);
+          a1_y1 = vld1_lane_u8(left + base_y_c[9], a1_y1, 1);
+          a1_y1 = vld1_lane_u8(left + base_y_c[10], a1_y1, 2);
+          a1_y1 = vld1_lane_u8(left + base_y_c[11], a1_y1, 3);
+          a1_y1 = vld1_lane_u8(left + base_y_c[12], a1_y1, 4);
+          a1_y1 = vld1_lane_u8(left + base_y_c[13], a1_y1, 5);
+          a1_y1 = vld1_lane_u8(left + base_y_c[14], a1_y1, 6);
+          a1_y1 = vld1_lane_u8(left + base_y_c[15], a1_y1, 7);
         }
+
         shifty.val[0] = vshrq_n_u16(
             vandq_u16(vreinterpretq_u16_s16(y_c256.val[0]), c3f), 1);
         shifty.val[1] = vshrq_n_u16(
             vandq_u16(vreinterpretq_u16_s16(y_c256.val[1]), c3f), 1);
-        diff.val[0] =
-            vsubq_u16(vreinterpretq_u16_u8(a1_y.val[0]),
-                      vreinterpretq_u16_u8(a0_y.val[0]));  // a[x+1] - a[x]
-        diff.val[1] =
-            vsubq_u16(vreinterpretq_u16_u8(a1_y.val[1]),
-                      vreinterpretq_u16_u8(a0_y.val[1]));  // a[x+1] - a[x]
-        a32.val[0] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[0]),
-                               v_32);  // a[x] * 32 + 16
-        a32.val[1] = vmlaq_u16(a16, vreinterpretq_u16_u8(a0_y.val[1]),
-                               v_32);  // a[x] * 32 + 16
+        diff.val[0] = vsubl_u8(a1_y0, a0_y0);              // a[x+1] - a[x]
+        diff.val[1] = vsubl_u8(a1_y1, a0_y1);              // a[x+1] - a[x]
+        a32.val[0] = vmlal_u8(a16, a0_y0, vdup_n_u8(32));  // a[x] * 32 + 16
+        a32.val[1] = vmlal_u8(a16, a0_y1, vdup_n_u8(32));  // a[x] * 32 + 16
         res.val[0] = vmlaq_u16(a32.val[0], diff.val[0], shifty.val[0]);
         res.val[1] = vmlaq_u16(a32.val[1], diff.val[1], shifty.val[1]);
 
@@ -2052,7 +2038,7 @@
         resy = v_zero;
       }
       uint8x16_t mask = vld1q_u8(BaseMask[base_min_diff]);
-      resxy = vorrq_u8(vandq_u8(mask, resy), vbicq_u8(resx, mask));
+      resxy = vbslq_u8(mask, resy, resx);
       vst1q_u8(dst + j, resxy);
     }  // for j
     dst += stride;