Add NEON version of aom_smooth_predictor functions
Speedup (ratio of C time to NEON time):
size      gain
4 x 4 1.62
4 x 8 2.29
4 x 16 2.49
8 x 4 2.33
8 x 8 2.22
8 x 16 2.22
8 x 32 2.24
16 x 4 1.11
16 x 8 1.28
16 x 16 1.24
16 x 32 1.25
16 x 64 1.23
32 x 8 1.19
32 x 16 1.19
32 x 32 1.20
32 x 64 1.21
64 x 16 1.13
64 x 32 1.14
64 x 64 1.12
via NEON/LowbdIntraPredTest.DISABLED_Speed
Change-Id: Id2e33ddbf9443d959641fa75456cc22b6a0aa071
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 2379b75..90af7e7 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -202,25 +202,25 @@
specialize qw/aom_paeth_predictor_16x32 ssse3/;
specialize qw/aom_paeth_predictor_32x16 ssse3/;
specialize qw/aom_paeth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_4x4 ssse3/;
-specialize qw/aom_smooth_predictor_4x8 ssse3/;
-specialize qw/aom_smooth_predictor_4x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x4 ssse3/;
-specialize qw/aom_smooth_predictor_8x8 ssse3/;
-specialize qw/aom_smooth_predictor_8x16 ssse3/;
-specialize qw/aom_smooth_predictor_8x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x4 ssse3/;
-specialize qw/aom_smooth_predictor_16x8 ssse3/;
-specialize qw/aom_smooth_predictor_16x16 ssse3/;
-specialize qw/aom_smooth_predictor_16x32 ssse3/;
-specialize qw/aom_smooth_predictor_16x64 ssse3/;
-specialize qw/aom_smooth_predictor_32x8 ssse3/;
-specialize qw/aom_smooth_predictor_32x16 ssse3/;
-specialize qw/aom_smooth_predictor_32x32 ssse3/;
-specialize qw/aom_smooth_predictor_32x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x64 ssse3/;
-specialize qw/aom_smooth_predictor_64x32 ssse3/;
-specialize qw/aom_smooth_predictor_64x16 ssse3/;
+specialize qw/aom_smooth_predictor_4x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_4x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_8x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x4 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_16x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x8 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x16 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_32x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x64 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x32 neon ssse3/;
+specialize qw/aom_smooth_predictor_64x16 neon ssse3/;
specialize qw/aom_smooth_v_predictor_4x4 ssse3/;
specialize qw/aom_smooth_v_predictor_4x8 ssse3/;
diff --git a/aom_dsp/arm/intrapred_neon.c b/aom_dsp/arm/intrapred_neon.c
index 8aff784..6d41708 100644
--- a/aom_dsp/arm/intrapred_neon.c
+++ b/aom_dsp/arm/intrapred_neon.c
@@ -11,6 +11,8 @@
#include <arm_neon.h>
+#include "common/tools_common.h"
+
#include "config/aom_config.h"
#include "config/aom_dsp_rtcd.h"
@@ -2701,3 +2703,524 @@
}
}
}
+static const int sm_weight_log2_scale = 8;
+
+// max(block_size_wide[BLOCK_LARGEST], block_size_high[BLOCK_LARGEST])
+#define MAX_BLOCK_DIM 64
+
+/* clang-format off */
+static const uint8_t sm_weight_arrays[2 * MAX_BLOCK_DIM] = {
+ // Unused, because we always offset by bs, which is at least 2.
+ 0, 0,
+ // bs = 2
+ 255, 128,
+ // bs = 4
+ 255, 149, 85, 64,
+ // bs = 8
+ 255, 197, 146, 105, 73, 50, 37, 32,
+ // bs = 16
+ 255, 225, 196, 170, 145, 123, 102, 84, 68, 54, 43, 33, 26, 20, 17, 16,
+ // bs = 32
+ 255, 240, 225, 210, 196, 182, 169, 157, 145, 133, 122, 111, 101, 92, 83, 74,
+ 66, 59, 52, 45, 39, 34, 29, 25, 21, 17, 14, 12, 10, 9, 8, 8,
+ // bs = 64
+ 255, 248, 240, 233, 225, 218, 210, 203, 196, 189, 182, 176, 169, 163, 156,
+ 150, 144, 138, 133, 127, 121, 116, 111, 106, 101, 96, 91, 86, 82, 77, 73,
+ 69, 65, 61, 57, 54, 50, 47, 44, 41, 38, 35, 32, 29, 27, 25, 22, 20, 18, 16,
+ 15, 13, 12, 10, 9, 8, 7, 6, 6, 5, 5, 4, 4, 4,
+};
+/* clang-format on */
+
+// -----------------------------------------------------------------------------
+// SMOOTH_PRED
+
+// pixels[0]: above and below_pred interleave vector
+// pixels[1]: left vector
+// pixels[2]: right_pred vector
+static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
+ int height, uint8x16_t *pixels) {
+ uint32x4_t zero = vdupq_n_u32(0);
+ const uint8x8_t d = vcreate_u8(((const uint32_t *)above)[0]);
+ if (height == 4)
+ pixels[1] =
+ vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero, 0));
+ else if (height == 8) {
+ pixels[1] = vreinterpretq_u8_u64(vsetq_lane_u64(
+ ((const uint64_t *)left)[0], vreinterpretq_u64_u32(zero), 0));
+ } else {
+ pixels[1] = vld1q_u8(left);
+ }
+
+ pixels[2] = vreinterpretq_u8_u16(vdupq_n_u16(above[3]));
+
+ const uint16x8_t bp = vdupq_n_u16(left[height - 1]);
+#if defined(__aarch64__)
+ pixels[0] = vreinterpretq_u8_u16(vzip1q_u16(vmovl_u8(d), bp));
+#else
+ pixels[0] = vreinterpretq_u8_u16(vzipq_u16(vmovl_u8(d), bp).val[0]);
+#endif // (__aarch64__)
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], second half for height = 16 only
+// weight_h[3]: same as [1], second half for height = 16 only
+// weight_w[0]: weights_w and scale - weights_w interleave vector
+static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
+ uint16x8_t *weight_h, uint16x8_t *weight_w) {
+ const uint16x8_t d = vdupq_n_u16((uint16_t)(1 << sm_weight_log2_scale));
+ const uint8x8_t t = vcreate_u8(((const uint32_t *)(weight_array))[1]);
+ weight_h[0] = vmovl_u8(t);
+ weight_h[1] = vsubw_u8(d, t);
+#if defined(__aarch64__)
+ weight_w[0] = vzip1q_u16(weight_h[0], weight_h[1]);
+#else
+ weight_w[0] = vzipq_u16(weight_h[0], weight_h[1]).val[0];
+#endif // (__aarch64__)
+
+ if (height == 8) {
+ const uint8x8_t weight = vld1_u8(&weight_array[8]);
+ weight_h[0] = vmovl_u8(weight);
+ weight_h[1] = vsubw_u8(d, weight);
+ } else if (height == 16) {
+ const uint8x16_t zero = vdupq_n_u8(0);
+ const uint8x16_t weight = vld1q_u8(&weight_array[16]);
+ const uint8x16x2_t weight_h_02 = vzipq_u8(weight, zero);
+ weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+ weight_h[1] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[0]));
+ weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+ weight_h[3] = vsubq_u16(d, vreinterpretq_u16_u8(weight_h_02.val[1]));
+ }
+}
+
+static INLINE void smooth_pred_4xh(const uint8x16_t *pixel,
+ const uint16x8_t *wh, const uint16x8_t *ww,
+ int h, uint8_t *dst, ptrdiff_t stride,
+ int second_half) {
+ const uint16x4_t one = vdup_n_u16(1);
+ const uint16x4_t inc = vdup_n_u16(0x202);
+ uint16x4_t rep =
+ second_half ? vdup_n_u16((uint16_t)0x8008) : vdup_n_u16((uint16_t)0x8000);
+ uint16x4_t d = vdup_n_u16(0x100);
+ const uint16x4_t v_pixel_0_lo = vmovn_u32(vreinterpretq_u32_u8(pixel[0]));
+ const uint16x4_t v_pixel_0_hi =
+ vmovn_u32(vreinterpretq_u32_u8(vextq_u8(pixel[0], pixel[0], 2)));
+ const uint16x4_t v_pixel_2 = vget_low_u16(vreinterpretq_u16_u8(pixel[2]));
+ const uint16x4_t ww_0_lo = vmovn_u32(vreinterpretq_u32_u16(ww[0]));
+ const uint16x4_t ww_0_hi =
+ vmovn_u32(vreinterpretq_u32_u16(vextq_u16(ww[0], ww[0], 1)));
+ const uint8x8_t save_mask = vcreate_u8(0 + (2 << 8) + (4 << 16) + (6 << 24));
+
+#if !defined(__aarch64__)
+ const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
+ vget_high_u8(
+ vreinterpretq_u8_u16(wh[0])) } };
+ const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
+ vget_high_u8(
+ vreinterpretq_u8_u16(wh[1])) } };
+ const uint8x8x2_t v_split3 = { { vget_low_u8(pixel[1]),
+ vget_high_u8(pixel[1]) } };
+#endif // (__aarch64__)
+
+ for (int i = 0; i < h; ++i) {
+#if defined(__aarch64__)
+ const uint8x8_t wg =
+ vqtbl1_u8(vreinterpretq_u8_u16(wh[0]), vreinterpret_u8_u16(d));
+ const uint8x8_t sc =
+ vqtbl1_u8(vreinterpretq_u8_u16(wh[1]), vreinterpret_u8_u16(d));
+#else
+ const uint8x8_t wg = vtbl2_u8(v_split1, vreinterpret_u8_u16(d));
+ const uint8x8_t sc = vtbl2_u8(v_split2, vreinterpret_u8_u16(d));
+#endif // (__aarch64__)
+
+ uint32x4_t sum = vmull_u16(v_pixel_0_lo, vreinterpret_u16_u8(wg));
+ sum = vmlal_u16(sum, v_pixel_0_hi, vreinterpret_u16_u8(sc));
+
+#if defined(__aarch64__)
+ uint8x8_t b = vqtbl1_u8(pixel[1], vreinterpret_u8_u16(rep));
+#else
+ uint8x8_t b = vtbl2_u8(v_split3, vreinterpret_u8_u16(rep));
+#endif // (__aarch64__)
+
+ sum = vmlal_u16(sum, vreinterpret_u16_u8(b), ww_0_lo);
+ sum = vmlal_u16(sum, v_pixel_2, ww_0_hi);
+ uint8x8_t sum_l = vreinterpret_u8_u16(vqrshrn_n_u32(sum, 9));
+ uint32x2_t predsh = vreinterpret_u32_u8(vtbl1_u8(sum_l, save_mask));
+ vst1_lane_u32((uint32_t *)dst, predsh, 0);
+
+ dst += stride;
+
+ rep = vadd_u16(rep, one);
+ d = vadd_u16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[3];
+ load_pixel_w4(above, left, 4, pixels);
+
+ uint16x8_t wh[4], ww[2];
+ load_weight_w4(sm_weight_arrays, 4, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[3];
+ load_pixel_w4(above, left, 8, pixels);
+
+ uint16x8_t wh[4], ww[2];
+ load_weight_w4(sm_weight_arrays, 8, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_4x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[3];
+ load_pixel_w4(above, left, 16, pixels);
+
+ uint16x8_t wh[4], ww[2];
+ load_weight_w4(sm_weight_arrays, 16, wh, ww);
+
+ smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+// pixels[0]: above and below_pred interleave vector, first half
+// pixels[1]: above and below_pred interleave vector, second half
+// pixels[2]: left vector
+// pixels[3]: right_pred vector
+// pixels[4]: above and below_pred interleave vector, first half
+// pixels[5]: above and below_pred interleave vector, second half
+// pixels[6]: left vector + 16
+// pixels[7]: right_pred vector
+static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
+ int height, uint8x16_t *pixels) {
+ pixels[0] = vreinterpretq_u8_u16(vmovl_u8(vld1_u8(above)));
+ pixels[1] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)left[height - 1]));
+ pixels[3] = vreinterpretq_u8_u16(vdupq_n_u16((uint16_t)above[7]));
+
+ if (height == 4) {
+ const uint32x4_t zero32 = vdupq_n_u32(0);
+ pixels[2] =
+ vreinterpretq_u8_u32(vld1q_lane_u32((const uint32_t *)left, zero32, 0));
+ } else if (height == 8) {
+ const uint64x2_t zero64 = vdupq_n_u64(0);
+ pixels[2] = vreinterpretq_u8_u64(
+ vsetq_lane_u64(((const uint64_t *)left)[0], zero64, 0));
+ } else if (height == 16) {
+ pixels[2] = vld1q_u8(left);
+ } else {
+ pixels[2] = vld1q_u8(left);
+ pixels[4] = pixels[0];
+ pixels[5] = pixels[1];
+ pixels[6] = vld1q_u8(left + 16);
+ pixels[7] = pixels[3];
+ }
+}
+
+// weight_h[0]: weight_h vector
+// weight_h[1]: scale - weight_h vector
+// weight_h[2]: same as [0], offset 8
+// weight_h[3]: same as [1], offset 8
+// weight_h[4]: same as [0], offset 16
+// weight_h[5]: same as [1], offset 16
+// weight_h[6]: same as [0], offset 24
+// weight_h[7]: same as [1], offset 24
+// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
+// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
+static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
+ uint16x8_t *weight_h, uint16x8_t *weight_w) {
+ const uint8x16_t zero = vdupq_n_u8(0);
+ const int we_offset = height < 8 ? 4 : 8;
+ uint8x16_t we = vld1q_u8(&weight_array[we_offset]);
+#if defined(__aarch64__)
+ weight_h[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
+#else
+ weight_h[0] = vreinterpretq_u16_u8(vzipq_u8(we, zero).val[0]);
+#endif // (__aarch64__)
+ const uint16x8_t d = vdupq_n_u16(256);
+ weight_h[1] = vsubq_u16(d, weight_h[0]);
+
+ if (height == 4) {
+ we = vextq_u8(we, zero, 4);
+#if defined(__aarch64__)
+ weight_w[0] = vreinterpretq_u16_u8(vzip1q_u8(we, zero));
+#else
+ weight_w[0] = vmovl_u8(vget_low_u8(we));
+#endif // (__aarch64__)
+ weight_w[1] = vsubq_u16(d, weight_w[0]);
+ } else {
+ weight_w[0] = weight_h[0];
+ weight_w[1] = weight_h[1];
+ }
+
+ if (height == 16) {
+ we = vld1q_u8(&weight_array[16]);
+ const uint8x16x2_t weight_h_02 = vzipq_u8(we, zero);
+ weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+ weight_h[1] = vsubq_u16(d, weight_h[0]);
+ weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+ weight_h[3] = vsubq_u16(d, weight_h[2]);
+ } else if (height == 32) {
+ const uint8x16_t weight_lo = vld1q_u8(&weight_array[32]);
+ const uint8x16x2_t weight_h_02 = vzipq_u8(weight_lo, zero);
+ weight_h[0] = vreinterpretq_u16_u8(weight_h_02.val[0]);
+ weight_h[1] = vsubq_u16(d, weight_h[0]);
+ weight_h[2] = vreinterpretq_u16_u8(weight_h_02.val[1]);
+ weight_h[3] = vsubq_u16(d, weight_h[2]);
+ const uint8x16_t weight_hi = vld1q_u8(&weight_array[32 + 16]);
+ const uint8x16x2_t weight_h_46 = vzipq_u8(weight_hi, zero);
+ weight_h[4] = vreinterpretq_u16_u8(weight_h_46.val[0]);
+ weight_h[5] = vsubq_u16(d, weight_h[4]);
+ weight_h[6] = vreinterpretq_u16_u8(weight_h_46.val[1]);
+ weight_h[7] = vsubq_u16(d, weight_h[6]);
+ }
+}
+
+static INLINE void smooth_pred_8xh(const uint8x16_t *pixels,
+ const uint16x8_t *wh, const uint16x8_t *ww,
+ int h, uint8_t *dst, ptrdiff_t stride,
+ int second_half) {
+ const uint16x8_t one = vdupq_n_u16(1);
+ const uint16x8_t inc = vdupq_n_u16(0x202);
+ uint16x8_t rep = second_half ? vdupq_n_u16((uint16_t)0x8008)
+ : vdupq_n_u16((uint16_t)0x8000);
+ uint16x8_t d = vdupq_n_u16(0x100);
+
+#if !defined(__aarch64__)
+ const uint8x8x2_t v_split1 = { { vget_low_u8(vreinterpretq_u8_u16(wh[0])),
+ vget_high_u8(
+ vreinterpretq_u8_u16(wh[0])) } };
+ const uint8x8x2_t v_split2 = { { vget_low_u8(vreinterpretq_u8_u16(wh[1])),
+ vget_high_u8(
+ vreinterpretq_u8_u16(wh[1])) } };
+ const uint8x8x2_t v_split3 = { { vget_low_u8(pixels[2]),
+ vget_high_u8(pixels[2]) } };
+#endif
+
+ for (int i = 0; i < h; ++i) {
+#if defined(__aarch64__)
+ const uint8x16_t wg_wg =
+ vqtbl1q_u8(vreinterpretq_u8_u16(wh[0]), vreinterpretq_u8_u16(d));
+ const uint8x16_t sc_sc =
+ vqtbl1q_u8(vreinterpretq_u8_u16(wh[1]), vreinterpretq_u8_u16(d));
+#else
+ const uint8x8_t v_d_lo = vreinterpret_u8_u16(vget_low_u16(d));
+ const uint8x8_t v_d_hi = vreinterpret_u8_u16(vget_high_u16(d));
+ const uint8x16_t wg_wg =
+ vcombine_u8(vtbl2_u8(v_split1, v_d_lo), vtbl2_u8(v_split1, v_d_hi));
+ const uint8x16_t sc_sc =
+ vcombine_u8(vtbl2_u8(v_split2, v_d_lo), vtbl2_u8(v_split2, v_d_hi));
+#endif // (__aarch64__)
+ uint16x8_t s01 =
+ vmulq_u16(vreinterpretq_u16_u8(pixels[0]), vreinterpretq_u16_u8(wg_wg));
+ s01 = vmlaq_u16(s01, vreinterpretq_u16_u8(pixels[1]),
+ vreinterpretq_u16_u8(sc_sc));
+#if defined(__aarch64__)
+ const uint8x16_t b = vqtbl1q_u8(pixels[2], vreinterpretq_u8_u16(rep));
+#else
+ const uint8x16_t b = vcombine_u8(
+ vtbl2_u8(v_split3, vget_low_u8(vreinterpretq_u8_u16(rep))),
+ vtbl2_u8(v_split3, vget_high_u8(vreinterpretq_u8_u16(rep))));
+#endif // (__aarch64__)
+ uint16x8_t sum0 = vmulq_u16(vreinterpretq_u16_u8(b), ww[0]);
+ sum0 = vmlaq_u16(sum0, vreinterpretq_u16_u8(pixels[3]), ww[1]);
+
+ uint32x4_t s0 = vaddl_u16(vget_low_u16(s01), vget_low_u16(sum0));
+#if defined(__aarch64__)
+ uint32x4_t s1 = vaddl_high_u16(s01, sum0);
+#else
+ uint32x4_t s1 = vaddl_u16(vget_high_u16(s01), vget_high_u16(sum0));
+#endif // (__aarch64__)
+
+ sum0 = vcombine_u16(vqrshrn_n_u32(s0, 9), vqrshrn_n_u32(s1, 9));
+ uint8x8_t predsh = vqmovn_u16(sum0);
+ vst1_u8(dst, predsh);
+
+ dst += stride;
+ rep = vaddq_u16(rep, one);
+ d = vaddq_u16(d, inc);
+ }
+}
+
+void aom_smooth_predictor_8x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[4];
+ load_pixel_w8(above, left, 4, pixels);
+
+ uint16x8_t wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 4, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[4];
+ load_pixel_w8(above, left, 8, pixels);
+
+ uint16x8_t wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 8, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+}
+
+void aom_smooth_predictor_8x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[4];
+ load_pixel_w8(above, left, 16, pixels);
+
+ uint16x8_t wh[4], ww[2];
+ load_weight_w8(sm_weight_arrays, 16, wh, ww);
+
+ smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
+}
+
+void aom_smooth_predictor_8x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x16_t pixels[8];
+ load_pixel_w8(above, left, 32, pixels);
+
+ uint16x8_t wh[8], ww[2];
+ load_weight_w8(sm_weight_arrays, 32, wh, ww);
+
+ smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
+ dst += stride << 3;
+ smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
+}
+
+static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left, uint32_t bw,
+ uint32_t bh) {
+ const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
+ const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
+ const uint16x8_t scale_value = vdupq_n_u16(256);
+
+ for (uint32_t y = 0; y < bh; ++y) {
+ const uint8x8_t left_y = vdup_n_u8(left[y]);
+ const uint8x8_t weights_y_dup = vdup_n_u8(sm_weights_h[y]);
+ const uint32x4_t pred_scaled_bl =
+ vdupq_n_u32(256 + (256 - sm_weights_h[y]) * left[bh - 1]);
+
+ for (uint32_t x = 0; x < bw; x += 8) {
+ const uint8x8_t weights_x = vld1_u8(sm_weights_w + x);
+ const uint8x8_t top_x = vld1_u8(above + x);
+
+ uint16x8_t pred_m1, pred_m2;
+ uint32x4_t pred_lo, pred_hi;
+ pred_m1 = vmull_u8(top_x, weights_y_dup);
+ pred_m2 = vmull_u8(weights_x, left_y);
+
+ pred_lo = vaddl_u16(vget_low_u16(pred_m1), vget_low_u16(pred_m2));
+#if defined(__aarch64__)
+ pred_hi = vaddl_high_u16(pred_m1, pred_m2);
+#else
+ pred_hi = vaddl_u16(vget_high_u16(pred_m1), vget_high_u16(pred_m2));
+#endif // (__aarch64__)
+
+ const uint16x8_t scale_m_weights_x = vsubw_u8(scale_value, weights_x);
+
+ const uint16x8_t swxtr = vmulq_n_u16(scale_m_weights_x, above[bw - 1]);
+
+ pred_lo = vaddq_u32(pred_lo, pred_scaled_bl);
+ pred_hi = vaddq_u32(pred_hi, pred_scaled_bl);
+
+ pred_lo = vaddw_u16(pred_lo, vget_low_u16(swxtr));
+#if defined(__aarch64__)
+ pred_hi = vaddw_high_u16(pred_hi, swxtr);
+#else
+ pred_hi = vaddw_u16(pred_hi, vget_high_u16(swxtr));
+#endif // (__aarch64__)
+
+ uint16x8_t pred =
+ vcombine_u16(vshrn_n_u32(pred_lo, 9), vshrn_n_u32(pred_hi, 9));
+
+ uint8x8_t predsh = vqmovn_u16(pred);
+
+ vst1_u8(dst + x, predsh);
+ }
+
+ dst += stride;
+ }
+}
+
+void aom_smooth_predictor_16x4_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 4);
+}
+
+void aom_smooth_predictor_16x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 8);
+}
+
+void aom_smooth_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 16);
+}
+
+void aom_smooth_predictor_16x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 32);
+}
+
+void aom_smooth_predictor_32x8_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above, const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 8);
+}
+
+void aom_smooth_predictor_32x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 16);
+}
+
+void aom_smooth_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 32);
+}
+
+void aom_smooth_predictor_32x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 32, 64);
+}
+
+void aom_smooth_predictor_64x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 64);
+}
+
+void aom_smooth_predictor_64x32_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 32);
+}
+
+void aom_smooth_predictor_64x16_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 64, 16);
+}
+
+void aom_smooth_predictor_16x64_neon(uint8_t *dst, ptrdiff_t stride,
+ const uint8_t *above,
+ const uint8_t *left) {
+ smooth_predictor_wxh(dst, stride, above, left, 16, 64);
+}
diff --git a/test/intrapred_test.cc b/test/intrapred_test.cc
index 779cf9a..cb7311d 100644
--- a/test/intrapred_test.cc
+++ b/test/intrapred_test.cc
@@ -97,6 +97,63 @@
}
ASSERT_EQ(0, error_count);
}
+ void RunSpeedTest(Pixel *left_col, Pixel *above_data, Pixel *dst,
+ Pixel *ref_dst) {
+ ACMRandom rnd(ACMRandom::DeterministicSeed());
+ const int block_width = params_.block_width;
+ const int block_height = params_.block_height;
+ above_row_ = above_data + 16;
+ left_col_ = left_col;
+ dst_ = dst;
+ ref_dst_ = ref_dst;
+ int error_count = 0;
+ const int numIter = 100;
+
+ int c_sum_time = 0;
+ int simd_sum_time = 0;
+ for (int i = 0; i < count_test_block; ++i) {
+ // Fill edges with random data, try first with saturated values.
+ for (int x = -1; x <= block_width * 2; x++) {
+ if (i == 0) {
+ above_row_[x] = mask_;
+ } else {
+ above_row_[x] = rnd.Rand16() & mask_;
+ }
+ }
+ for (int y = 0; y < block_height; y++) {
+ if (i == 0) {
+ left_col_[y] = mask_;
+ } else {
+ left_col_[y] = rnd.Rand16() & mask_;
+ }
+ }
+
+ aom_usec_timer c_timer_;
+ aom_usec_timer_start(&c_timer_);
+
+ PredictRefSpeedTest(numIter);
+
+ aom_usec_timer_mark(&c_timer_);
+
+ aom_usec_timer simd_timer_;
+ aom_usec_timer_start(&simd_timer_);
+
+ PredictFncSpeedTest(numIter);
+
+ aom_usec_timer_mark(&simd_timer_);
+
+ c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+ simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+ CheckPrediction(i, &error_count);
+ }
+
+ printf(
+ "blockWxH = %d x %d c_time = %d \t simd_time = %d \t Gain = %4.2f \n",
+ block_width, block_height, c_sum_time, simd_sum_time,
+ (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+ ASSERT_EQ(0, error_count);
+ }
protected:
virtual void SetUp() {
@@ -107,6 +164,9 @@
virtual void Predict() = 0;
+ virtual void PredictRefSpeedTest(int num) = 0;
+ virtual void PredictFncSpeedTest(int num) = 0;
+
void CheckPrediction(int test_case_number, int *error_count) const {
// For each pixel ensure that the calculated value is the same as reference.
const int block_width = params_.block_width;
@@ -142,6 +202,18 @@
ASM_REGISTER_STATE_CHECK(
params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth));
}
+ void PredictRefSpeedTest(int num) {
+ const int bit_depth = params_.bit_depth;
+ for (int i = 0; i < num; i++) {
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_, bit_depth);
+ }
+ }
+  void PredictFncSpeedTest(int num) {
+    const int bit_depth = params_.bit_depth;
+    // Run the SIMD predictor into dst_ (not ref_dst_): ref_dst_ holds the
+    // reference output from PredictRefSpeedTest, and CheckPrediction compares
+    // dst_ against ref_dst_ after the timed loops.
+    for (int i = 0; i < num; i++) {
+      params_.pred_fn(dst_, stride_, above_row_, left_col_, bit_depth);
+    }
+  }
};
#endif
@@ -152,6 +224,16 @@
ASM_REGISTER_STATE_CHECK(
params_.pred_fn(dst_, stride_, above_row_, left_col_));
}
+ void PredictRefSpeedTest(int num) {
+ for (int i = 0; i < num; i++) {
+ params_.ref_fn(ref_dst_, stride_, above_row_, left_col_);
+ }
+ }
+ void PredictFncSpeedTest(int num) {
+ for (int i = 0; i < num; i++) {
+ params_.pred_fn(dst_, stride_, above_row_, left_col_);
+ }
+ }
};
#if CONFIG_AV1_HIGHBITDEPTH
@@ -169,19 +251,26 @@
}
#endif
-// Same issue as above but for arm.
-#if !HAVE_NEON
TEST_P(LowbdIntraPredTest, Bitexact) {
- // max block size is 32
- DECLARE_ALIGNED(16, uint8_t, left_col[2 * 32]);
- DECLARE_ALIGNED(16, uint8_t, above_data[2 * 32 + 32]);
- DECLARE_ALIGNED(16, uint8_t, dst[3 * 32 * 32]);
- DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 32 * 32]);
+ // max block size is 64
+ DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+ DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
av1_zero(left_col);
av1_zero(above_data);
RunTest(left_col, above_data, dst, ref_dst);
}
-#endif // !HAVE_NEON
+TEST_P(LowbdIntraPredTest, DISABLED_Speed) {
+ // max block size is 64
+ DECLARE_ALIGNED(16, uint8_t, left_col[2 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, above_data[2 * 64 + 64]);
+ DECLARE_ALIGNED(16, uint8_t, dst[3 * 64 * 64]);
+ DECLARE_ALIGNED(16, uint8_t, ref_dst[3 * 64 * 64]);
+ av1_zero(left_col);
+ av1_zero(above_data);
+ RunSpeedTest(left_col, above_data, dst, ref_dst);
+}
#if CONFIG_AV1_HIGHBITDEPTH
// -----------------------------------------------------------------------------
@@ -229,6 +318,23 @@
#endif // HAVE_SSE2
+#if HAVE_NEON
+const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorNeon[] = {
+ lowbd_entry(smooth, 4, 4, neon), lowbd_entry(smooth, 4, 8, neon),
+ lowbd_entry(smooth, 4, 16, neon), lowbd_entry(smooth, 8, 4, neon),
+ lowbd_entry(smooth, 8, 8, neon), lowbd_entry(smooth, 8, 16, neon),
+ lowbd_entry(smooth, 8, 32, neon), lowbd_entry(smooth, 16, 4, neon),
+ lowbd_entry(smooth, 16, 8, neon), lowbd_entry(smooth, 16, 16, neon),
+ lowbd_entry(smooth, 16, 32, neon), lowbd_entry(smooth, 16, 64, neon),
+ lowbd_entry(smooth, 32, 8, neon), lowbd_entry(smooth, 32, 16, neon),
+ lowbd_entry(smooth, 32, 32, neon), lowbd_entry(smooth, 32, 64, neon),
+ lowbd_entry(smooth, 64, 16, neon), lowbd_entry(smooth, 64, 32, neon),
+ lowbd_entry(smooth, 64, 64, neon)
+};
+INSTANTIATE_TEST_SUITE_P(NEON, LowbdIntraPredTest,
+ ::testing::ValuesIn(LowbdIntraPredTestVectorNeon));
+#endif // HAVE_NEON
+
#if HAVE_SSSE3
const IntraPredFunc<IntraPred> LowbdIntraPredTestVectorSsse3[] = {
lowbd_intrapred(paeth, ssse3),