Add NEON version of aom_smooth_predictor functions

Speedup (C time / NEON time)

size              gain
4 x 4             1.62
4 x 8             2.29
4 x 16            2.49
8 x 4             2.33
8 x 8             2.22
8 x 16            2.22
8 x 32            2.24
16 x 4            1.11
16 x 8            1.28
16 x 16           1.24
16 x 32           1.25
16 x 64           1.23
32 x 8            1.19
32 x 16           1.19
32 x 32           1.20
32 x 64           1.21
64 x 16           1.13
64 x 32           1.14
64 x 64           1.12

via NEON/LowbdIntraPredTest.DISABLED_Speed
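
To reproduce the numbers above (a sketch, assuming the standard
test_libaom gtest binary from the libaom build):

  ./test_libaom --gtest_also_run_disabled_tests \
      --gtest_filter='NEON/LowbdIntraPredTest.DISABLED_Speed*'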

Change-Id: Id2e33ddbf9443d959641fa75456cc22b6a0aa071
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 2379b75..90af7e7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -202,25 +202,25 @@
 specialize qw/aom_paeth_predictor_16x32 ssse3/;
 specialize qw/aom_paeth_predictor_32x16 ssse3/;
 specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_predictor_64x16 ssse3/;
+specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
 
 specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
 specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 8aff784..6d41708 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -11,6 +11,8 @@
 
 #include <arm_neon.h>
 
+#include "common/tools_common.h"
+
 #include "config/aom_config.h"
 #include "config/aom_dsp_rtcd.h"
 
@@ -2701,3 +2703,524 @@
     }
   }
 }
+static const int sm_weight_log2_scale = 8;
+
+// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
+#define MAX_BLOCK_DIM 64
+
+/* clang-format off */
+static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
+    // Unused, because we always offset by bs, which is at least 2.
+    0, 0,
+    // bs = 2
+    255, 128,
+    // bs = 4
+    255, 149, 85, 64,
+    // bs = 8
+    255, 197, 146, 105, 73, 50, 37, 32,
+    // bs = 16
+    255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+    // bs = 32
+    255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+    66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+    // bs = 64
+    255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+    150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+    69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+    15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
+};
+/* clang-format on */
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+                                 int height, uint8x16_t *pixels) {
+  uint32x4_t zero = vdupq_n_u32(0);
+  const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
+  if (height == 4)
+    pixels[1] =
+        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
+  else if (height == 8) {
+    pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
+        ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
+  } else {
+    pixels[1] = vld1q_u8(left);
+  }
+
+  pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
+
+  const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
+#if defined(__aarch64__)
+  pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
+#else
+  pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
+#endif  // (__aarch64__)
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
+                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
+  const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
+  const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
+  weight_h[0] = vmovl_u8(t);
+  weight_h[1] = vsubw_u8(d, t);
+#if defined(__aarch64__)
+  weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
+#else
+  weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
+#endif  // (__aarch64__)
+
+  if (height == 8) {
+    const uint8x8_t weight = vld1_u8(&weight_array[8]);
+    weight_h[0] = vmovl_u8(weight);
+    weight_h[1] = vsubw_u8(d, weight);
+  } else if (height == 16) {
+    const uint8x16_t zero = vdupq_n_u8(0);
+    const uint8x16_t weight = vld1q_u8(&weight_array[16]);
+    const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
+    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+    weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
+    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+    weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
+  }
+}
+
+static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
+                                   const uint16x8_t *wh, const uint16x8_t *ww,
+                                   int h, uint8_t *dst, ptrdiff_t stride,
+                                   int second_half) {
+  const uint16x4_t one = vdup_n_u16(1);
+  const uint16x4_t inc = vdup_n_u16(0x202);
+  uint16x4_t rep =
+      second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
+  uint16x4_t d = vdup_n_u16(0x100);
+  const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
+  const uint16x4_t v_pixel_0_hi =
+      vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
+  const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
+  const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
+  const uint16x4_t ww_0_hi =
+      vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
+  const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
+
+#if !defined(__aarch64__)
+  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[0])) } };
+  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[1])) } };
+  const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
+                                   vget_high_u8(pixel[1]) } };
+#endif  // (__aarch64__)
+
+  for (int i = 0; i < h; ++i) {
+#if defined(__aarch64__)
+    const uint8x8_t wg =
+        vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
+    const uint8x8_t sc =
+        vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
+#else
+    const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
+    const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
+#endif  // (__aarch64__)
+
+    uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
+    sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
+
+#if defined(__aarch64__)
+    uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
+#else
+    uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
+#endif  // (__aarch64__)
+
+    sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
+    sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
+    uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
+    uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
+    vst1_lane_u32((uint32_t *)dst, predsh, 0);
+
+    dst += stride;
+
+    rep = vadd_u16(rep, one);
+    d = vadd_u16(d, inc);
+  }
+}
+
+void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[3];
+  load_pixel_w4(above, left, 4, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 4, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[3];
+  load_pixel_w4(above, left, 8, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[3];
+  load_pixel_w4(above, left, 16, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w4(sm_weight_arrays, 16, wh, ww);
+
+  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+// pixels[0]: above vector, widened to u16
+// pixels[1]: below_pred vector
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: copy of pixels[0] (above), used for rows 16-31 when height == 32
+// pixels[5]: copy of pixels[1] (below_pred), used for rows 16-31
+// pixels[6]: left vector + 16
+// pixels[7]: copy of pixels[3] (right_pred)
+static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+                                 int height, uint8x16_t *pixels) {
+  pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
+  pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
+  pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
+
+  if (height == 4) {
+    const uint32x4_t zero32 = vdupq_n_u32(0);
+    pixels[2] =
+        vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
+  } else if (height == 8) {
+    const uint64x2_t zero64 = vdupq_n_u64(0);
+    pixels[2] = vreinterpretq_u8_u64(
+        vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
+  } else if (height == 16) {
+    pixels[2] = vld1q_u8(left);
+  } else {
+    pixels[2] = vld1q_u8(left);
+    pixels[4] = pixels[0];
+    pixels[5] = pixels[1];
+    pixels[6] = vld1q_u8(left + 16);
+    pixels[7] = pixels[3];
+  }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
+                                  uint16x8_t *weight_h, uint16x8_t *weight_w) {
+  const uint8x16_t zero = vdupq_n_u8(0);
+  const int we_offset = height < 8 ? 4 : 8;
+  uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
+#if defined(__aarch64__)
+  weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
+#else
+  weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
+#endif  // (__aarch64__)
+  const uint16x8_t d = vdupq_n_u16(256);
+  weight_h[1] = vsubq_u16(d, weight_h[0]);
+
+  if (height == 4) {
+    we = vextq_u8(we, zero, 4);
+#if defined(__aarch64__)
+    weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
+#else
+    weight_w[0] = vmovl_u8(vget_low_u8(we));
+#endif  // (__aarch64__)
+    weight_w[1] = vsubq_u16(d, weight_w[0]);
+  } else {
+    weight_w[0] = weight_h[0];
+    weight_w[1] = weight_h[1];
+  }
+
+  if (height == 16) {
+    we = vld1q_u8(&weight_array[16]);
+    const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
+    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+    weight_h[1] = vsubq_u16(d, weight_h[0]);
+    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+    weight_h[3] = vsubq_u16(d, weight_h[2]);
+  } else if (height == 32) {
+    const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
+    const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
+    weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+    weight_h[1] = vsubq_u16(d, weight_h[0]);
+    weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+    weight_h[3] = vsubq_u16(d, weight_h[2]);
+    const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
+    const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
+    weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
+    weight_h[5] = vsubq_u16(d, weight_h[4]);
+    weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
+    weight_h[7] = vsubq_u16(d, weight_h[6]);
+  }
+}
+
+static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
+                                   const uint16x8_t *wh, const uint16x8_t *ww,
+                                   int h, uint8_t *dst, ptrdiff_t stride,
+                                   int second_half) {
+  const uint16x8_t one = vdupq_n_u16(1);
+  const uint16x8_t inc = vdupq_n_u16(0x202);
+  uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
+                               : vdupq_n_u16((uint16_t)0x8000);
+  uint16x8_t d = vdupq_n_u16(0x100);
+
+#if !defined(__aarch64__)
+  const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[0])) } };
+  const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
+                                   vget_high_u8(
+                                       vreinterpretq_u8_u16(wh[1])) } };
+  const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
+                                   vget_high_u8(pixels[2]) } };
+#endif
+
+  for (int i = 0; i < h; ++i) {
+#if defined(__aarch64__)
+    const uint8x16_t wg_wg =
+        vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
+    const uint8x16_t sc_sc =
+        vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
+#else
+    const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
+    const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
+    const uint8x16_t wg_wg =
+        vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
+    const uint8x16_t sc_sc =
+        vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
+#endif  // (__aarch64__)
+    uint16x8_t s01 =
+        vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
+    s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
+                    vreinterpretq_u16_u8(sc_sc));
+#if defined(__aarch64__)
+    const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
+#else
+    const uint8x16_t b = vcombine_u8(
+        vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
+        vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
+#endif  // (__aarch64__)
+    uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
+    sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
+
+    uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
+#if defined(__aarch64__)
+    uint32x4_t s1 = vaddl_high_u16(s01, sum0);
+#else
+    uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
+#endif  // (__aarch64__)
+
+    sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
+    uint8x8_t predsh = vqmovn_u16(sum0);
+    vst1_u8(dst, predsh);
+
+    dst += stride;
+    rep = vaddq_u16(rep, one);
+    d = vaddq_u16(d, inc);
+  }
+}
+
+void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[4];
+  load_pixel_w8(above, left, 4, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w8(sm_weight_arrays, 4, wh, ww);
+
+  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                   const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[4];
+  load_pixel_w8(above, left, 8, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w8(sm_weight_arrays, 8, wh, ww);
+
+  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[4];
+  load_pixel_w8(above, left, 16, pixels);
+
+  uint16x8_t wh[4], ww[2];
+  load_weight_w8(sm_weight_arrays, 16, wh, ww);
+
+  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  uint8x16_t pixels[8];
+  load_pixel_w8(above, left, 32, pixels);
+
+  uint16x8_t wh[8], ww[2];
+  load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+  dst += stride << 3;
+  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+                                        const uint8_t *above,
+                                        const uint8_t *left, uint32_t bw,
+                                        uint32_t bh) {
+  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+  const uint16x8_t scale_value = vdupq_n_u16(256);
+
+  for (uint32_t y = 0; y < bh; ++y) {
+    const uint8x8_t left_y = vdup_n_u8(left[y]);
+    const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
+    const uint32x4_t pred_scaled_bl =
+        vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
+
+    for (uint32_t x = 0; x < bw; x += 8) {
+      const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
+      const uint8x8_t top_x = vld1_u8(above + x);
+
+      uint16x8_t pred_m1, pred_m2;
+      uint32x4_t pred_lo, pred_hi;
+      pred_m1 = vmull_u8(top_x, weights_y_dup);
+      pred_m2 = vmull_u8(weights_x, left_y);
+
+      pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
+#if defined(__aarch64__)
+      pred_hi = vaddl_high_u16(pred_m1, pred_m2);
+#else
+      pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
+#endif  // (__aarch64__)
+
+      const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
+
+      const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
+
+      pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
+      pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
+
+      pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
+#if defined(__aarch64__)
+      pred_hi = vaddw_high_u16(pred_hi, swxtr);
+#else
+      pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
+#endif  // (__aarch64__)
+
+      uint16x8_t pred =
+          vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
+
+      uint8x8_t predsh = vqmovn_u16(pred);
+
+      vst1_u8(dst + x, predsh);
+    }
+
+    dst += stride;
+  }
+}
+
+void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 779cf9a..cb7311d 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -97,6 +97,63 @@
     }
     ASSERT_EQ(0, error_count);
   }
+  void RunSpeedTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+                    Pixel *ref_dst) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int block_width = params_.block_width;
+    const int block_height = params_.block_height;
+    above_row_ = above_data + 16;
+    left_col_ = left_col;
+    dst_ = dst;
+    ref_dst_ = ref_dst;
+    int error_count = 0;
+    const int numIter = 100;
+
+    int c_sum_time = 0;
+    int simd_sum_time = 0;
+    for (int i = 0; i < count_test_block; ++i) {
+      // Fill edges with random data, try first with saturated values.
+      for (int x = -1; x <= block_width * 2; x++) {
+        if (i == 0) {
+          above_row_[x] = mask_;
+        } else {
+          above_row_[x] = rnd.Rand16() & mask_;
+        }
+      }
+      for (int y = 0; y < block_height; y++) {
+        if (i == 0) {
+          left_col_[y] = mask_;
+        } else {
+          left_col_[y] = rnd.Rand16() & mask_;
+        }
+      }
+
+      aom_usec_timer c_timer_;
+      aom_usec_timer_start(&c_timer_);
+
+      PredictRefSpeedTest(numIter);
+
+      aom_usec_timer_mark(&c_timer_);
+
+      aom_usec_timer simd_timer_;
+      aom_usec_timer_start(&simd_timer_);
+
+      PredictFncSpeedTest(numIter);
+
+      aom_usec_timer_mark(&simd_timer_);
+
+      c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+      simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+      CheckPrediction(i, &error_count);
+    }
+
+    printf(
+        "blockWxH = %d x %d c_time = %d \t simd_time = %d \t Gain = %4.2f \n",
+        block_width, block_height, c_sum_time, simd_sum_time,
+        (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+    ASSERT_EQ(0, error_count);
+  }
 
  protected:
   virtual void SetUp() {
@@ -107,6 +164,9 @@
 
   virtual void Predict() = 0;
 
+  virtual void PredictRefSpeedTest(int num) = 0;
+  virtual void PredictFncSpeedTest(int num) = 0;
+
   void CheckPrediction(int test_case_number, int *error_count) const {
     // For each pixel ensure that the calculated value is the same as reference.
     const int block_width = params_.block_width;
@@ -142,6 +202,18 @@
     ASM_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
   }
+  void PredictRefSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    for (int i = 0; i < num; i++) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
+  void PredictFncSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
 };
 #endif
 
@@ -152,6 +224,16 @@
     ASM_REGISTER_STATE_CHECK(
         params_.pred_fn(dst_, stride_, above_row_, left_col_));
   }
+  void PredictRefSpeedTest(int num) {
+    for (int i = 0; i < num; i++) {
+      params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+    }
+  }
+  void PredictFncSpeedTest(int num) {
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_);
+    }
+  }
 };
 
 #if CONFIG_AV1_HIGHBITDEPTH
@@ -169,19 +251,26 @@
 }
 #endif
 
-// Same issue as above but for arm.
-#if !HAVE_NEON
 TEST_P(LowbdIntraPredTest, Bitexact) {
-  // max block size is 32
-  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
-  DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
   av1_zero(left_col);
   av1_zero(above_data);
   RunTest(left_col, above_data, dst, ref_dst);
 }
-#endif  // !HAVE_NEON
+TEST_P(LowbdIntraPredTest, DISABLED_Speed) {
+  // max block size is 64
+  DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+  DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
+  av1_zero(left_col);
+  av1_zero(above_data);
+  RunSpeedTest(left_col, above_data, dst, ref_dst);
+}
 
 #if CONFIG_AV1_HIGHBITDEPTH
 // -----------------------------------------------------------------------------
@@ -229,6 +318,23 @@
 
 #endif  // HAVE_SSE2
 
+#if HAVE_NEON
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
+  lowbd_entry(smooth, 4, 4, neon),   lowbd_entry(smooth, 4, 8, neon),
+  lowbd_entry(smooth, 4, 16, neon),  lowbd_entry(smooth, 8, 4, neon),
+  lowbd_entry(smooth, 8, 8, neon),   lowbd_entry(smooth, 8, 16, neon),
+  lowbd_entry(smooth, 8, 32, neon),  lowbd_entry(smooth, 16, 4, neon),
+  lowbd_entry(smooth, 16, 8, neon),  lowbd_entry(smooth, 16, 16, neon),
+  lowbd_entry(smooth, 16, 32, neon), lowbd_entry(smooth, 16, 64, neon),
+  lowbd_entry(smooth, 32, 8, neon),  lowbd_entry(smooth, 32, 16, neon),
+  lowbd_entry(smooth, 32, 32, neon), lowbd_entry(smooth, 32, 64, neon),
+  lowbd_entry(smooth, 64, 16, neon), lowbd_entry(smooth, 64, 32, neon),
+  lowbd_entry(smooth, 64, 64, neon)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
+                         ::testing::ValuesIn(LowbdIntraPredTestVectorNeon));
+#endif  // HAVE_NEON
+
 #if HAVE_SSSE3
 const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
   lowbd_intrapred(paeth, ssse3),