Add NEON optimization of lpf_vertical_4 and lpf_horizontal_4
Intrinsic optimization and unit test changes of lpf_vertical_4 and
lpf_horizontal_4 functions are added.
Performance gain w.r.t. C,
lpf_vertical_4 ~2.8x
lpf_horizontal_4 ~4.7x
Change-Id: I60159f943eb5b8ed77efdafb23c02ccd9d672c60
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index d013609..1463d88 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -386,7 +386,7 @@
specialize qw/aom_lpf_vertical_8_dual sse2/;
add_proto qw/void aom_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_4 sse2/;
+specialize qw/aom_lpf_vertical_4 sse2 neon/;
add_proto qw/void aom_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_vertical_4_dual sse2/;
@@ -410,7 +410,7 @@
specialize qw/aom_lpf_horizontal_8_dual sse2/;
add_proto qw/void aom_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_4 sse2/;
+specialize qw/aom_lpf_horizontal_4 sse2 neon/;
add_proto qw/void aom_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_horizontal_4_dual sse2/;
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index c93914b..dd81007 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -52,6 +52,36 @@
return mask_8x8;
}
+static INLINE uint8x8_t lpf_mask2(uint8x8_t p1q1, uint8x8_t p0q0,
+ const uint8_t blimit, const uint8_t limit) {
+ uint32x2x2_t p0q0_p1q1;
+ uint16x8_t temp_16x8;
+ uint16x4_t temp0_16x4, temp1_16x4;
+ const uint16x4_t blimit_16x4 = vdup_n_u16(blimit);
+ const uint8x8_t limit_8x8 = vdup_n_u8(limit);
+ uint8x8_t mask_8x8, temp_8x8;
+
+ mask_8x8 = vabd_u8(p1q1, p0q0);
+ mask_8x8 = vcle_u8(mask_8x8, limit_8x8);
+
+ temp_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(mask_8x8)));
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ p0q0_p1q1 = vtrn_u32(vreinterpret_u32_u8(p0q0), vreinterpret_u32_u8(p1q1));
+ temp_8x8 = vabd_u8(vreinterpret_u8_u32(p0q0_p1q1.val[0]),
+ vreinterpret_u8_u32(p0q0_p1q1.val[1]));
+ temp_16x8 = vmovl_u8(temp_8x8);
+ temp0_16x4 = vshl_n_u16(vget_low_u16(temp_16x8), 1);
+ temp1_16x4 = vshr_n_u16(vget_high_u16(temp_16x8), 1);
+ temp0_16x4 = vadd_u16(temp0_16x4, temp1_16x4);
+ temp0_16x4 = vcle_u16(temp0_16x4, blimit_16x4);
+ temp_8x8 = vmovn_u16(vcombine_u16(temp0_16x4, temp0_16x4));
+
+ mask_8x8 = vand_u8(mask_8x8, temp_8x8);
+
+ return mask_8x8;
+}
+
static INLINE uint8x8_t lpf_flat_mask4(uint8x8_t p3q3, uint8x8_t p2q2,
uint8x8_t p1q1, uint8x8_t p0q0) {
const uint8x8_t thresh_8x8 = vdup_n_u8(1); // for bd==8 threshold is always 1
@@ -523,6 +553,68 @@
}
}
+static void lpf_4_neon(uint8x8_t *p1q1, uint8x8_t *p0q0, const uint8_t blimit,
+ const uint8_t limit, const uint8_t thresh) {
+ int32x2x2_t ps0_qs0, ps1_qs1;
+ int16x8_t filter_s16;
+ const uint8x8_t thresh_f4 = vdup_n_u8(thresh);
+ uint8x8_t mask_8x8, temp0_8x8, temp1_8x8;
+ int8x8_t ps0_s8, ps1_s8, qs0_s8, qs1_s8, temp_s8;
+ int8x8_t op0, oq0, op1, oq1;
+ int8x8_t pq_s0, pq_s1;
+ int8x8_t filter_s8, filter1_s8, filter2_s8;
+ int8x8_t hev_8x8;
+ const int8x8_t sign_mask = vdup_n_s8(0x80);
+ const int8x8_t val_4 = vdup_n_s8(4);
+ const int8x8_t val_3 = vdup_n_s8(3);
+
+ // Calculate filter mask
+ mask_8x8 = lpf_mask2(*p1q1, *p0q0, blimit, limit);
+
+ pq_s0 = veor_s8(vreinterpret_s8_u8(*p0q0), sign_mask);
+ pq_s1 = veor_s8(vreinterpret_s8_u8(*p1q1), sign_mask);
+
+ ps0_qs0 = vtrn_s32(vreinterpret_s32_s8(pq_s0), vreinterpret_s32_s8(pq_s0));
+ ps1_qs1 = vtrn_s32(vreinterpret_s32_s8(pq_s1), vreinterpret_s32_s8(pq_s1));
+ ps0_s8 = vreinterpret_s8_s32(ps0_qs0.val[0]);
+ qs0_s8 = vreinterpret_s8_s32(ps0_qs0.val[1]);
+ ps1_s8 = vreinterpret_s8_s32(ps1_qs1.val[0]);
+ qs1_s8 = vreinterpret_s8_s32(ps1_qs1.val[1]);
+
+ // hev_mask
+ temp0_8x8 = vcgt_u8(vabd_u8(*p0q0, *p1q1), thresh_f4);
+ temp1_8x8 = vreinterpret_u8_u32(vrev64_u32(vreinterpret_u32_u8(temp0_8x8)));
+ hev_8x8 = vreinterpret_s8_u8(vorr_u8(temp0_8x8, temp1_8x8));
+
+ // add outer taps if we have high edge variance
+ filter_s8 = vqsub_s8(ps1_s8, qs1_s8);
+ filter_s8 = vand_s8(filter_s8, hev_8x8);
+
+ // inner taps
+ temp_s8 = vqsub_s8(qs0_s8, ps0_s8);
+ filter_s16 = vmovl_s8(filter_s8);
+ filter_s16 = vmlal_s8(filter_s16, temp_s8, val_3);
+ filter_s8 = vqmovn_s16(filter_s16);
+ filter_s8 = vand_s8(filter_s8, vreinterpret_s8_u8(mask_8x8));
+
+ filter1_s8 = vqadd_s8(filter_s8, val_4);
+ filter2_s8 = vqadd_s8(filter_s8, val_3);
+ filter1_s8 = vshr_n_s8(filter1_s8, 3);
+ filter2_s8 = vshr_n_s8(filter2_s8, 3);
+
+ oq0 = veor_s8(vqsub_s8(qs0_s8, filter1_s8), sign_mask);
+ op0 = veor_s8(vqadd_s8(ps0_s8, filter2_s8), sign_mask);
+
+ filter_s8 = vrshr_n_s8(filter1_s8, 1);
+ filter_s8 = vbic_s8(filter_s8, hev_8x8);
+
+ oq1 = veor_s8(vqsub_s8(qs1_s8, filter_s8), sign_mask);
+ op1 = veor_s8(vqadd_s8(ps1_s8, filter_s8), sign_mask);
+
+ *p0q0 = vreinterpret_u8_s8(vext_s8(op0, oq0, 4));
+ *p1q1 = vreinterpret_u8_s8(vext_s8(op1, oq1, 4));
+}
+
void aom_lpf_vertical_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
uint8x16_t row0, row1, row2, row3;
@@ -689,6 +781,46 @@
store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
}
+void aom_lpf_vertical_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p1q0_p0q1, p1q1_p0q0, p1p0_q1q0;
+ uint32x2_t pq_rev;
+ uint8x8_t p1p0, q0q1, p0q0, p1q1;
+
+ // row0: p1 p0 | q0 q1
+ // row1: p1 p0 | q0 q1
+ // row2: p1 p0 | q0 q1
+ // row3: p1 p0 | q0 q1
+ load_u8_4x1(src - 2, &p1p0, 0);
+ load_u8_4x1((src - 2) + 1 * stride, &p1p0, 1);
+ load_u8_4x1((src - 2) + 2 * stride, &q0q1, 0);
+ load_u8_4x1((src - 2) + 3 * stride, &q0q1, 1);
+
+ transpose_u8_4x4(&p1p0, &q0q1);
+
+ p1q0_p0q1 = vtrn_u32(vreinterpret_u32_u8(p1p0), vreinterpret_u32_u8(q0q1));
+
+ pq_rev = vrev64_u32(p1q0_p0q1.val[1]);
+ p1q1_p0q0 = vtrn_u32(p1q0_p0q1.val[0], pq_rev);
+
+ p1q1 = vreinterpret_u8_u32(p1q1_p0q0.val[0]);
+ p0q0 = vreinterpret_u8_u32(p1q1_p0q0.val[1]);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ p1p0_q1q0 = vtrn_u32(vreinterpret_u32_u8(p1q1), vreinterpret_u32_u8(p0q0));
+
+ p1p0 = vreinterpret_u8_u32(p1p0_q1q0.val[0]);
+ q0q1 = vreinterpret_u8_u32(vrev64_u32(p1p0_q1q0.val[1]));
+
+ transpose_u8_4x4(&p1p0, &q0q1);
+
+ store_u8_4x1(src - 2, p1p0, 0);
+ store_u8_4x1((src - 2) + 1 * stride, q0q1, 0);
+ store_u8_4x1((src - 2) + 2 * stride, p1p0, 1);
+ store_u8_4x1((src - 2) + 3 * stride, q0q1, 1);
+}
+
void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
@@ -777,3 +909,20 @@
vst1_lane_u32((uint32_t *)(src + 1 * stride), vreinterpret_u32_u8(p1q1), 1);
vst1_lane_u32((uint32_t *)(src + 2 * stride), vreinterpret_u32_u8(p2q2), 1);
}
+
+void aom_lpf_horizontal_4_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1;
+
+ load_u8_4x1(src - 2 * stride, &p1q1, 0);
+ load_u8_4x1(src - 1 * stride, &p0q0, 0);
+ load_u8_4x1(src + 0 * stride, &p0q0, 1);
+ load_u8_4x1(src + 1 * stride, &p1q1, 1);
+
+ lpf_4_neon(&p1q1, &p0q0, *blimit, *limit, *thresh);
+
+ store_u8_4x1(src - 2 * stride, p1q1, 0);
+ store_u8_4x1(src - 1 * stride, p0q0, 0);
+ store_u8_4x1(src + 0 * stride, p0q0, 1);
+ store_u8_4x1(src + 1 * stride, p1q1, 1);
+}
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index d510df3..451bffd 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -582,9 +582,11 @@
make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),
make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8),
+ make_tuple(&aom_lpf_vertical_4_neon, &aom_lpf_vertical_4_c, 8),
make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
- make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8)
+ make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8),
+ make_tuple(&aom_lpf_horizontal_4_neon, &aom_lpf_horizontal_4_c, 8)
};
INSTANTIATE_TEST_CASE_P(NEON, Loop8Test6Param_lbd,