remove arm64 specializations in av1_convolve_2d_sr

These specializations cause peculiar failures in the Jenkins testing environment.

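For reference, the deleted aarch64 paths batched four (or eight) output
rows per loop iteration, reusing the seven overlapping filter taps
between consecutive outputs; the portable path that remains emits one
row per iteration. A minimal scalar sketch of the two loop shapes for
the vertical pass, reduced to a single sample column for clarity
(plain C; vfilter8 and both function names are hypothetical stand-ins,
not the libaom intrinsics):

  #include <stdint.h>

  // Hypothetical scalar stand-in for convolve8_vert_4x4_s32; the real
  // helpers also apply rounding shifts and offset constants.
  static int16_t vfilter8(const int16_t s[8], const int16_t f[8]) {
    int32_t sum = 0;
    for (int k = 0; k < 8; ++k) sum += (int32_t)s[k] * f[k];
    return (int16_t)sum;
  }

  // Retained shape: load one new row, emit one output row, slide by 1.
  static void vert_pass_1row(const int16_t *src, int stride, int16_t *dst,
                             int dst_stride, int h, const int16_t f[8]) {
    int16_t s[8];
    for (int k = 0; k < 7; ++k) s[k] = src[k * stride];
    src += 7 * stride;
    do {
      s[7] = *src;
      *dst = vfilter8(s, f);
      for (int k = 0; k < 7; ++k) s[k] = s[k + 1];  // slide tap window by 1
      src += stride;
      dst += dst_stride;
    } while (--h > 0);
  }

  // Removed aarch64 shape: load four new rows (s7..s10), emit four output
  // rows (d0..d3), then slide the window by 4; h must be a multiple of 4,
  // matching the height -= 4 loops deleted below.
  static void vert_pass_4rows(const int16_t *src, int stride, int16_t *dst,
                              int dst_stride, int h, const int16_t f[8]) {
    int16_t s[11];
    for (int k = 0; k < 7; ++k) s[k] = src[k * stride];
    src += 7 * stride;
    do {
      for (int r = 0; r < 4; ++r) s[7 + r] = src[r * stride];
      for (int r = 0; r < 4; ++r) dst[r * dst_stride] = vfilter8(s + r, f);
      for (int k = 0; k < 7; ++k) s[k] = s[k + 4];  // slide tap window by 4
      src += 4 * stride;
      dst += 4 * dst_stride;
      h -= 4;
    } while (h > 0);
  }

The NEON versions do the same across 4- or 8-wide vectors, with the
transposes and prefetches visible in the hunks below.
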
BUG=aomedia:2261

Change-Id: I3bbaf9b4e281bd0945b7aec7b42b7bbc26a06110
diff --git a/av1/common/arm/convolve_neon.c b/av1/common/arm/convolve_neon.c
index 934297b..b828dbd 100644
--- a/av1/common/arm/convolve_neon.c
+++ b/av1/common/arm/convolve_neon.c
@@ -853,9 +853,6 @@
   int im_dst_stride;
   int width, height;
   uint8x8_t t0;
-#if defined(__aarch64__)
-  uint8x8_t t1, t2, t3, t4, t5, t6, t7;
-#endif
 
   DECLARE_ALIGNED(16, int16_t,
                   im_block[(MAX_SB_SIZE + HORIZ_EXTRA_ROWS) * MAX_SB_SIZE]);
@@ -894,9 +891,6 @@
 
   if (w <= 4) {
     int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, d0;
-#if defined(__aarch64__)
-    int16x4_t s8, s9, s10, d1, d2, d3;
-#endif
 
     const int16x4_t horiz_const = vdup_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x4_t shift_round_0 = vdup_n_s16(-(conv_params->round_0 - 1));
@@ -904,66 +898,6 @@
     do {
       s = src_ptr;
 
-#if defined(__aarch64__)
-      __builtin_prefetch(s + 0 * src_stride);
-      __builtin_prefetch(s + 1 * src_stride);
-      __builtin_prefetch(s + 2 * src_stride);
-      __builtin_prefetch(s + 3 * src_stride);
-
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s0 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s1 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s2 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s3 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-      s4 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s5 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s6 = vget_high_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-
-      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
-      s += 7;
-
-      load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3);
-      transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-      s7 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t0)));
-      s8 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t1)));
-      s9 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t2)));
-      s10 = vget_low_s16(vreinterpretq_s16_u16(vmovl_u8(t3)));
-
-      d0 = convolve8_4x4_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                             horiz_const, shift_round_0);
-      d1 = convolve8_4x4_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
-                             horiz_const, shift_round_0);
-      d2 = convolve8_4x4_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
-                             horiz_const, shift_round_0);
-      d3 = convolve8_4x4_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
-                             horiz_const, shift_round_0);
-
-      transpose_s16_4x4d(&d0, &d1, &d2, &d3);
-      if (w == 4) {
-        vst1_s16((dst_ptr + 0 * im_dst_stride), d0);
-        vst1_s16((dst_ptr + 1 * im_dst_stride), d1);
-        vst1_s16((dst_ptr + 2 * im_dst_stride), d2);
-        vst1_s16((dst_ptr + 3 * im_dst_stride), d3);
-      } else if (w == 2) {
-        vst1_lane_u32((uint32_t *)(dst_ptr + 0 * im_dst_stride),
-                      vreinterpret_u32_s16(d0), 0);
-        vst1_lane_u32((uint32_t *)(dst_ptr + 1 * im_dst_stride),
-                      vreinterpret_u32_s16(d1), 0);
-        vst1_lane_u32((uint32_t *)(dst_ptr + 2 * im_dst_stride),
-                      vreinterpret_u32_s16(d2), 0);
-        vst1_lane_u32((uint32_t *)(dst_ptr + 3 * im_dst_stride),
-                      vreinterpret_u32_s16(d3), 0);
-      }
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * im_dst_stride;
-      height -= 4;
-#else
       int16x8_t tt0;
 
       __builtin_prefetch(s);
@@ -999,108 +933,14 @@
 
       src_ptr += src_stride;
       height -= 1;
-#endif
     } while (height > 0);
   } else {
     int16_t *d_tmp;
     int16x8_t s0, s1, s2, s3, s4, s5, s6, s7, res0;
-#if defined(__aarch64__)
-    int16x8_t s8, s9, s10, res1, res2, res3, res4, res5, res6, res7;
-    int16x8_t s11, s12, s13, s14;
-#endif
 
     const int16x8_t horiz_const = vdupq_n_s16((1 << (bd + FILTER_BITS - 2)));
     const int16x8_t shift_round_0 = vdupq_n_s16(-(conv_params->round_0 - 1));
 
-#if defined(__aarch64__)
-    do {
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-      __builtin_prefetch(src_ptr + 4 * src_stride);
-      __builtin_prefetch(src_ptr + 5 * src_stride);
-      __builtin_prefetch(src_ptr + 6 * src_stride);
-      __builtin_prefetch(src_ptr + 7 * src_stride);
-
-      load_u8_8x8(src_ptr, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-      s0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      s1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      s2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      s3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-      s4 = vreinterpretq_s16_u16(vmovl_u8(t4));
-      s5 = vreinterpretq_s16_u16(vmovl_u8(t5));
-      s6 = vreinterpretq_s16_u16(vmovl_u8(t6));
-
-      width = w;
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-
-      __builtin_prefetch(dst_ptr + 0 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 4 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 5 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 6 * im_dst_stride);
-      __builtin_prefetch(dst_ptr + 7 * im_dst_stride);
-
-      do {
-        load_u8_8x8(s, src_stride, &t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        transpose_u8_8x8(&t0, &t1, &t2, &t3, &t4, &t5, &t6, &t7);
-
-        s7 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        s8 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        s9 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        s10 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        s11 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        s12 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        s13 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        s14 = vreinterpretq_s16_u16(vmovl_u8(t7));
-
-        res0 = convolve8_8x8_s16(s0, s1, s2, s3, s4, s5, s6, s7, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res1 = convolve8_8x8_s16(s1, s2, s3, s4, s5, s6, s7, s8, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res2 = convolve8_8x8_s16(s2, s3, s4, s5, s6, s7, s8, s9, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res3 = convolve8_8x8_s16(s3, s4, s5, s6, s7, s8, s9, s10, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res4 = convolve8_8x8_s16(s4, s5, s6, s7, s8, s9, s10, s11, x_filter_tmp,
-                                 horiz_const, shift_round_0);
-        res5 = convolve8_8x8_s16(s5, s6, s7, s8, s9, s10, s11, s12,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res6 = convolve8_8x8_s16(s6, s7, s8, s9, s10, s11, s12, s13,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-        res7 = convolve8_8x8_s16(s7, s8, s9, s10, s11, s12, s13, s14,
-                                 x_filter_tmp, horiz_const, shift_round_0);
-
-        transpose_s16_8x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
-                          &res7);
-
-        store_s16_8x8(d_tmp, im_dst_stride, res0, res1, res2, res3, res4, res5,
-                      res6, res7);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += 8 * src_stride;
-      dst_ptr += 8 * im_dst_stride;
-      height -= 8;
-    } while (height > 0);
-#else
     do {
       t0 = vld1_u8(src_ptr);
       s0 = vreinterpretq_s16_u16(vmovl_u8(t0));  // a0 a1 a2 a3 a4 a5 a6 a7
@@ -1138,7 +978,6 @@
       dst_ptr += im_dst_stride;
       height -= 1;
     } while (height > 0);
-#endif
   }
 
   // vertical
@@ -1168,13 +1007,6 @@
       uint16x8_t dd0;
       uint8x8_t d01;
 
-#if defined(__aarch64__)
-      int16x4_t s8, s9, s10;
-      uint16x4_t d1, d2, d3;
-      uint16x8_t dd1;
-      uint8x8_t d23;
-#endif
-
       d_u8 = dst_u8_ptr;
       v_s = v_src_ptr;
 
@@ -1191,85 +1023,6 @@
       v_s += (7 * im_stride);
 
       do {
-#if defined(__aarch64__)
-        load_s16_4x4(v_s, im_stride, &s7, &s8, &s9, &s10);
-        v_s += (im_stride << 2);
-
-        __builtin_prefetch(d_u8 + 0 * dst_stride);
-        __builtin_prefetch(d_u8 + 1 * dst_stride);
-        __builtin_prefetch(d_u8 + 2 * dst_stride);
-        __builtin_prefetch(d_u8 + 3 * dst_stride);
-
-        d0 = convolve8_vert_4x4_s32(s0, s1, s2, s3, s4, s5, s6, s7, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-        d1 = convolve8_vert_4x4_s32(s1, s2, s3, s4, s5, s6, s7, s8, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-        d2 = convolve8_vert_4x4_s32(s2, s3, s4, s5, s6, s7, s8, s9, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-        d3 = convolve8_vert_4x4_s32(s3, s4, s5, s6, s7, s8, s9, s10, y_filter,
-                                    round_shift_vec, offset_const,
-                                    sub_const_vec);
-
-        dd0 = vqrshlq_u16(vcombine_u16(d0, d1), vec_round_bits);
-        dd1 = vqrshlq_u16(vcombine_u16(d2, d3), vec_round_bits);
-
-        d01 = vqmovn_u16(dd0);
-        d23 = vqmovn_u16(dd1);
-
-        if ((w == 4) && (h != 2)) {
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        0);  // 00 01 02 03
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        1);  // 10 11 12 13
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
-                        0);  // 20 21 22 23
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d23),
-                        1);  // 30 31 32 33
-          d_u8 += dst_stride;
-        } else if ((w == 2) && (h != 2)) {
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        0);  // 00 01
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        2);  // 10 11
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
-                        0);  // 20 21
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d23),
-                        2);  // 30 31
-          d_u8 += dst_stride;
-        } else if ((w == 4) && (h == 2)) {
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        0);  // 00 01 02 03
-          d_u8 += dst_stride;
-          vst1_lane_u32((uint32_t *)d_u8, vreinterpret_u32_u8(d01),
-                        1);  // 10 11 12 13
-          d_u8 += dst_stride;
-        } else if ((w == 2) && (h == 2)) {
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        0);  // 00 01
-          d_u8 += dst_stride;
-          vst1_lane_u16((uint16_t *)d_u8, vreinterpret_u16_u8(d01),
-                        2);  // 10 11
-          d_u8 += dst_stride;
-        }
-
-        s0 = s4;
-        s1 = s5;
-        s2 = s6;
-        s3 = s7;
-        s4 = s8;
-        s5 = s9;
-        s6 = s10;
-        height -= 4;
-#else
         s7 = vld1_s16(v_s);
         v_s += im_stride;
 
@@ -1301,16 +1054,11 @@
         s5 = s6;
         s6 = s7;
         height -= 1;
-#endif
       } while (height > 0);
     } else {
       // if width is a multiple of 8 & height is a multiple of 4
       int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
       uint8x8_t res0;
-#if defined(__aarch64__)
-      int16x8_t s8, s9, s10;
-      uint8x8_t res1, res2, res3;
-#endif
 
       do {
         __builtin_prefetch(v_src_ptr + 0 * im_stride);
@@ -1330,52 +1078,6 @@
         height = h;
 
         do {
-#if defined(__aarch64__)
-          load_s16_8x4(v_s, im_stride, &s7, &s8, &s9, &s10);
-          v_s += (im_stride << 2);
-
-          __builtin_prefetch(d_u8 + 4 * dst_stride);
-          __builtin_prefetch(d_u8 + 5 * dst_stride);
-          __builtin_prefetch(d_u8 + 6 * dst_stride);
-          __builtin_prefetch(d_u8 + 7 * dst_stride);
-
-          res0 = convolve8_vert_8x4_s32(s0, s1, s2, s3, s4, s5, s6, s7,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-          res1 = convolve8_vert_8x4_s32(s1, s2, s3, s4, s5, s6, s7, s8,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-          res2 = convolve8_vert_8x4_s32(s2, s3, s4, s5, s6, s7, s8, s9,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-          res3 = convolve8_vert_8x4_s32(s3, s4, s5, s6, s7, s8, s9, s10,
-                                        y_filter, round_shift_vec, offset_const,
-                                        sub_const_vec, vec_round_bits);
-
-          if (h != 2) {
-            vst1_u8(d_u8, res0);
-            d_u8 += dst_stride;
-            vst1_u8(d_u8, res1);
-            d_u8 += dst_stride;
-            vst1_u8(d_u8, res2);
-            d_u8 += dst_stride;
-            vst1_u8(d_u8, res3);
-            d_u8 += dst_stride;
-          } else {
-            vst1_u8(d_u8, res0);
-            d_u8 += dst_stride;
-            vst1_u8(d_u8, res1);
-            d_u8 += dst_stride;
-          }
-          s0 = s4;
-          s1 = s5;
-          s2 = s6;
-          s3 = s7;
-          s4 = s8;
-          s5 = s9;
-          s6 = s10;
-          height -= 4;
-#else
           s7 = vld1q_s16(v_s);
           v_s += im_stride;
 
@@ -1396,7 +1098,6 @@
           s5 = s6;
           s6 = s7;
           height -= 1;
-#endif
         } while (height > 0);
         v_src_ptr += 8;
         dst_u8_ptr += 8;