Use TBL for z2 predictor on AArch64 Neon

The existing code loads each lane of the resulting vector independently.
However, in this case the range of possible index values is small enough
that we can simply load all possible values once and then use TBL to
select the appropriate ones.

The 'q' variants of the TBL intrinsics are only available on AArch64, so
restrict the new implementation accordingly.
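
For reference, a minimal sketch of the idea, assuming an illustrative
helper (gather_tbl1) and a 16-entry table; this is not code from the
patch itself:

    #include <arm_neon.h>

    #if defined(__aarch64__)
    // Gather eight bytes whose indices are all known to lie in [0, 15]:
    // load the whole candidate range once and let TBL pick each lane,
    // replacing eight vld1_lane_u8() loads with one vld1q_u8() plus one
    // vqtbl1_u8().
    static inline uint8x8_t gather_tbl1(const uint8_t *src, uint8x8_t idx) {
      const uint8x16_t table = vld1q_u8(src);  // all 16 candidate bytes
      return vqtbl1_u8(table, idx);            // lanes with idx >= 16 read as 0
    }
    #endif  // defined(__aarch64__)

The patch itself needs the wider vqtbl2_u8/vqtbl3q_u8/vqtbl4q_u8 forms,
since the left arrays it indexes span up to 64 bytes.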

Change-Id: I9d168a093c91525aed7a60e3ef7bd877c293b43e
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 52daddf..58b31a3 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -1528,6 +1528,14 @@
   int16x4_t v_frac_bits_y = vdup_n_s16(-frac_bits_y);
   int16x4_t min_base_y64 = vdup_n_s16(min_base_y);
 
+#if defined(__aarch64__)
+  // Use ext rather than loading left + 14 directly to avoid over-read.
+  const uint8x16_t left_m2 = vld1q_u8(left - 2);
+  const uint8x16_t left_0 = vld1q_u8(left);
+  const uint8x16_t left_14 = vextq_u8(left_0, left_0, 14);
+  const uint8x16x2_t left_vals = { { left_m2, left_14 } };
+#endif  // defined(__aarch64__)
+
   for (int r = 0; r < N; r++) {
     uint16x8_t res, shift;
     uint8x8_t resx, resy;
@@ -1582,8 +1590,16 @@
       int16x4_t base_y_c64 = vshl_s16(y_c64, v_frac_bits_y);
       uint16x4_t mask64 = vcgt_s16(min_base_y64, base_y_c64);
 
+      // Values in base_y_c64 range from -2 through 14 inclusive.
       base_y_c64 = vbic_s16(base_y_c64, vreinterpret_s16_u16(mask64));
 
+#if defined(__aarch64__)
+      uint8x8_t left_idx0 = vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(2)));  // [0, 16]
+      uint8x8_t left_idx1 = vreinterpret_u8_s16(vadd_s16(base_y_c64, vdup_n_s16(3)));  // [1, 17]
+
+      uint8x8_t a0_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx0), v_zero_u8);
+      uint8x8_t a1_y = vtrn1_u8(vqtbl2_u8(left_vals, left_idx1), v_zero_u8);
+#else   // !defined(__aarch64__)
       DECLARE_ALIGNED(32, int16_t, base_y_c[4]);
 
       vst1_s16(base_y_c, base_y_c64);
@@ -1600,6 +1616,7 @@
       a1_y = vld1_lane_u8(left + base_y_c[1], a1_y, 2);
       a1_y = vld1_lane_u8(left + base_y_c[2], a1_y, 4);
       a1_y = vld1_lane_u8(left + base_y_c[3], a1_y, 6);
+#endif  // defined(__aarch64__)
 
       if (upsample_left) {
         v_shift.val[1] = vshr_n_u16(
@@ -1679,6 +1696,16 @@
   uint16x8_t c1234 = vcombine_u16(vcreate_u16(0x0004000300020001),
                                   vcreate_u16(0x0008000700060005));
 
+#if defined(__aarch64__)
+  // Use ext rather than loading left + 30 directly to avoid over-read.
+  const uint8x16_t left_m2 = vld1q_u8(left - 2);
+  const uint8x16_t left_0 = vld1q_u8(left + 0);
+  const uint8x16_t left_16 = vld1q_u8(left + 16);
+  const uint8x16_t left_14 = vextq_u8(left_0, left_16, 14);
+  const uint8x16_t left_30 = vextq_u8(left_16, left_16, 14);
+  const uint8x16x3_t left_vals = { { left_m2, left_14, left_30 } };
+#endif  // defined(__aarch64__)
+
   for (int r = 0; r < N; r++) {
     uint8x8_t resx, resy, resxy;
     uint16x8x2_t res, shift;
@@ -1746,8 +1773,18 @@
       base_y_c128 = vshlq_s16(y_c128, v_frac_bits_y);
       mask128 = vcgtq_s16(min_base_y128, base_y_c128);
 
+      // Values in base_y_c128 range from -2 through 31 inclusive.
       base_y_c128 = vbicq_s16(base_y_c128, vreinterpretq_s16_u16(mask128));
 
+#if defined(__aarch64__)
+      uint8x16_t left_idx0 = vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(2)));  // [0, 33]
+      uint8x16_t left_idx1 = vreinterpretq_u8_s16(vaddq_s16(base_y_c128, vdupq_n_s16(3)));  // [1, 34]
+      uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+      uint8x16_t a01_x = vqtbl3q_u8(left_vals, left_idx01);
+      uint8x8_t a0_x1 = vget_low_u8(a01_x);
+      uint8x8_t a1_x1 = vget_high_u8(a01_x);
+#else   // !defined(__aarch64__)
       DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
 
       vst1q_s16(base_y_c, base_y_c128);
@@ -1772,6 +1809,7 @@
       a1_x1 = vld1_lane_u8(left + base_y_c[5], a1_x1, 5);
       a1_x1 = vld1_lane_u8(left + base_y_c[6], a1_x1, 6);
       a1_x1 = vld1_lane_u8(left + base_y_c[7], a1_x1, 7);
+#endif  // defined(__aarch64__)
 
       if (upsample_left) {
         shift.val[1] = vshrq_n_u16(
@@ -1829,6 +1867,19 @@
   c1234.val[0] = vaddq_u16(c0123.val[0], c1);
   c1234.val[1] = vaddq_u16(c0123.val[1], c1);
 
+#if defined(__aarch64__)
+  const uint8x16_t left_m1 = vld1q_u8(left - 1);
+  const uint8x16_t left_0 = vld1q_u8(left + 0);
+  const uint8x16_t left_16 = vld1q_u8(left + 16);
+  const uint8x16_t left_32 = vld1q_u8(left + 32);
+  const uint8x16_t left_48 = vld1q_u8(left + 48);
+  const uint8x16_t left_15 = vextq_u8(left_0, left_16, 15);
+  const uint8x16_t left_31 = vextq_u8(left_16, left_32, 15);
+  const uint8x16_t left_47 = vextq_u8(left_32, left_48, 15);
+  const uint8x16x4_t left_vals0 = { { left_m1, left_15, left_31, left_47 } };
+  const uint8x16x4_t left_vals1 = { { left_0, left_16, left_32, left_48 } };
+#endif  // defined(__aarch64__)
+
   for (int r = 0; r < H; r++) {
     uint16x8x2_t res, r6, shift;
     uint16x8_t j256;
@@ -1966,11 +2017,27 @@
           a1_y0 = vget_low_u8(a1_y128);
           a1_y1 = vget_high_u8(a1_y128);
         } else {
+          // Values in base_y_c256 range from -1 through 62 inclusive.
           base_y_c256.val[0] = vbicq_s16(base_y_c256.val[0],
                                          vreinterpretq_s16_u16(mask256.val[0]));
           base_y_c256.val[1] = vbicq_s16(base_y_c256.val[1],
                                          vreinterpretq_s16_u16(mask256.val[1]));
 
+#if defined(__aarch64__)
+          // Values in left_idx{0,1} range from 0 through 63 inclusive.
+          uint8x16_t left_idx0 = vreinterpretq_u8_s16(vaddq_s16(base_y_c256.val[0], vdupq_n_s16(1)));
+          uint8x16_t left_idx1 = vreinterpretq_u8_s16(vaddq_s16(base_y_c256.val[1], vdupq_n_s16(1)));
+
+          uint8x16_t left_idx01 = vuzp1q_u8(left_idx0, left_idx1);
+
+          uint8x16_t a0_y01 = vqtbl4q_u8(left_vals0, left_idx01);
+          uint8x16_t a1_y01 = vqtbl4q_u8(left_vals1, left_idx01);
+
+          a0_y0 = vget_low_u8(a0_y01);
+          a0_y1 = vget_high_u8(a0_y01);
+          a1_y0 = vget_low_u8(a1_y01);
+          a1_y1 = vget_high_u8(a1_y01);
+#else   // !defined(__aarch64__)
           DECLARE_ALIGNED(32, int16_t, base_y_c[16]);
 
           vst1q_s16(base_y_c, base_y_c256.val[0]);
@@ -2019,6 +2086,7 @@
           a1_y1 = vld1_lane_u8(left + base_y_c[13], a1_y1, 5);
           a1_y1 = vld1_lane_u8(left + base_y_c[14], a1_y1, 6);
           a1_y1 = vld1_lane_u8(left + base_y_c[15], a1_y1, 7);
+#endif  // defined(__aarch64__)
         }
 
         shifty.val[0] = vshrq_n_u16(