Add NEON optimization of lpf_vertical_6 and lpf_horizontal_14
Add NEON intrinsic implementations of the lpf_vertical_6 and
lpf_horizontal_14 functions, together with the matching unit tests.
Performance gain relative to the C implementation:
lpf_vertical_6 ~2.84x
lpf_horizontal_14 ~5.03x
Change-Id: Id9195eb4fa7f4c1f1554e7a2b8bf8164a4a0244d
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index a8ac5eb..d013609 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -377,7 +377,7 @@
specialize qw/aom_lpf_vertical_14_dual sse2/;
add_proto qw/void aom_lpf_vertical_6/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_vertical_6 sse2/;
+specialize qw/aom_lpf_vertical_6 sse2 neon/;
add_proto qw/void aom_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
specialize qw/aom_lpf_vertical_8 sse2 neon/;
@@ -392,7 +392,7 @@
specialize qw/aom_lpf_vertical_4_dual sse2/;
add_proto qw/void aom_lpf_horizontal_14/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/aom_lpf_horizontal_14 sse2/;
+specialize qw/aom_lpf_horizontal_14 sse2 neon/;
add_proto qw/void aom_lpf_horizontal_14_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
specialize qw/aom_lpf_horizontal_14_dual sse2/;
diff --git a/aom_dsp/arm/loopfilter_neon.c b/aom_dsp/arm/loopfilter_neon.c
index ee1a3c7..c93914b 100644
--- a/aom_dsp/arm/loopfilter_neon.c
+++ b/aom_dsp/arm/loopfilter_neon.c
@@ -646,6 +646,85 @@
store_u8_8x4(src - 4, stride, p3q0, p2q1, p1q2, p0q3);
}
+void aom_lpf_vertical_6_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint32x2x2_t p2q2_p1q1, pxqy_p0q0;
+ uint32x2_t pq_rev;
+ uint8x8_t pxq0, p2q1, p1q2, p0qy;
+ uint8x8_t p0q0, p1q1, p2q2, pxqy;
+ // Applies lpf_6_neon across the edge at src; four 8-pixel rows are loaded:
+ // row0: px p2 p1 p0 | q0 q1 q2 qy
+ // row1: px p2 p1 p0 | q0 q1 q2 qy
+ // row2: px p2 p1 p0 | q0 q1 q2 qy
+ // row3: px p2 p1 p0 | q0 q1 q2 qy
+ load_u8_8x4(src - 4, stride, &pxq0, &p2q1, &p1q2, &p0qy);
+
+ transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+ // Regroup 32-bit halves into pN|qN pairs, as the variable names indicate.
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0qy));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxq0), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q2));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q1), pq_rev);
+
+ p0q0 = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q1 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q2 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxqy = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+ // px/qy are not passed to the filter; pxqy is carried through unchanged.
+ lpf_6_neon(&p2q2, &p1q1, &p0q0, *blimit, *limit, *thresh);
+ // Mirror the vrev64/vtrn steps above to rebuild pxq0/p2q1/p1q2/p0qy.
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p0q0));
+ pxqy_p0q0 = vtrn_u32(vreinterpret_u32_u8(pxqy), pq_rev);
+
+ pq_rev = vrev64_u32(vreinterpret_u32_u8(p1q1));
+ p2q2_p1q1 = vtrn_u32(vreinterpret_u32_u8(p2q2), pq_rev);
+
+ p0qy = vreinterpret_u8_u32(vrev64_u32(pxqy_p0q0.val[1]));
+ p1q2 = vreinterpret_u8_u32(vrev64_u32(p2q2_p1q1.val[1]));
+ p2q1 = vreinterpret_u8_u32(p2q2_p1q1.val[0]);
+ pxq0 = vreinterpret_u8_u32(pxqy_p0q0.val[0]);
+ transpose_u8_8x4(&pxq0, &p2q1, &p1q2, &p0qy);
+
+ store_u8_8x4(src - 4, stride, pxq0, p2q1, p1q2, p0qy);
+}
+
+void aom_lpf_horizontal_14_neon(uint8_t *src, int stride, const uint8_t *blimit,
+ const uint8_t *limit, const uint8_t *thresh) {
+ uint8x8_t p0q0, p1q1, p2q2, p3q3, p4q4, p5q5, p6q6;
+ // NOTE(review): vectors start uninitialized; both lanes are filled below.
+ load_u8_4x1(src - 7 * stride, &p6q6, 0);  // lane 0 <- rows above src (p6..p0)
+ load_u8_4x1(src - 6 * stride, &p5q5, 0);
+ load_u8_4x1(src - 5 * stride, &p4q4, 0);
+ load_u8_4x1(src - 4 * stride, &p3q3, 0);
+ load_u8_4x1(src - 3 * stride, &p2q2, 0);
+ load_u8_4x1(src - 2 * stride, &p1q1, 0);
+ load_u8_4x1(src - 1 * stride, &p0q0, 0);
+ load_u8_4x1(src + 0 * stride, &p0q0, 1);  // lane 1 <- rows from src on (q0..q6)
+ load_u8_4x1(src + 1 * stride, &p1q1, 1);
+ load_u8_4x1(src + 2 * stride, &p2q2, 1);
+ load_u8_4x1(src + 3 * stride, &p3q3, 1);
+ load_u8_4x1(src + 4 * stride, &p4q4, 1);
+ load_u8_4x1(src + 5 * stride, &p5q5, 1);
+ load_u8_4x1(src + 6 * stride, &p6q6, 1);
+
+ lpf_14_neon(&p6q6, &p5q5, &p4q4, &p3q3, &p2q2, &p1q1, &p0q0, *blimit, *limit,
+ *thresh);
+ // Rows p6 and q6 are passed to the filter but are not written back.
+ store_u8_4x1(src - 6 * stride, p5q5, 0);
+ store_u8_4x1(src - 5 * stride, p4q4, 0);
+ store_u8_4x1(src - 4 * stride, p3q3, 0);
+ store_u8_4x1(src - 3 * stride, p2q2, 0);
+ store_u8_4x1(src - 2 * stride, p1q1, 0);
+ store_u8_4x1(src - 1 * stride, p0q0, 0);
+ store_u8_4x1(src + 0 * stride, p0q0, 1);
+ store_u8_4x1(src + 1 * stride, p1q1, 1);
+ store_u8_4x1(src + 2 * stride, p2q2, 1);
+ store_u8_4x1(src + 3 * stride, p3q3, 1);
+ store_u8_4x1(src + 4 * stride, p4q4, 1);
+ store_u8_4x1(src + 5 * stride, p5q5, 1);
+}
+
void aom_lpf_horizontal_8_neon(uint8_t *src, int stride, const uint8_t *blimit,
const uint8_t *limit, const uint8_t *thresh) {
uint8x8_t p0q0, p1q1, p2q2, p3q3;
diff --git a/av1/common/arm/mem_neon.h b/av1/common/arm/mem_neon.h
index 214b14b..b4463e5 100644
--- a/av1/common/arm/mem_neon.h
+++ b/av1/common/arm/mem_neon.h
@@ -22,6 +22,12 @@
s += p;
}
+static INLINE void load_u8_4x1(const uint8_t *s, uint8x8_t *const s0,
+ int lane) {  // loads 4 bytes at s into 32-bit lane `lane` of *s0
+ *s0 = vreinterpret_u8_u32(
+ vld1_lane_u32((const uint32_t *)s, vreinterpret_u32_u8(*s0), lane));
+}
+
static INLINE void load_u8_8x8(const uint8_t *s, ptrdiff_t p,
uint8x8_t *const s0, uint8x8_t *const s1,
uint8x8_t *const s2, uint8x8_t *const s3,
@@ -128,6 +134,11 @@
*s3 = vld1_s16(s);
}
+static INLINE void store_u8_4x1(uint8_t *s, uint8x8_t const s0,
+ int lane) {  // writes 32-bit lane `lane` of s0 to the 4 bytes at s
+ vst1_lane_u32((uint32_t *)s, vreinterpret_u32_u8(s0), lane);
+}
+
static INLINE void store_u8_8x8(uint8_t *s, ptrdiff_t p, const uint8x8_t s0,
const uint8x8_t s1, const uint8x8_t s2,
const uint8x8_t s3, const uint8x8_t s4,
diff --git a/test/lpf_test.cc b/test/lpf_test.cc
index 1e2862a..d510df3 100644
--- a/test/lpf_test.cc
+++ b/test/lpf_test.cc
@@ -581,6 +581,8 @@
const loop_param_t kLoop8Test6[] = {
make_tuple(&aom_lpf_vertical_14_neon, &aom_lpf_vertical_14_c, 8),
make_tuple(&aom_lpf_vertical_8_neon, &aom_lpf_vertical_8_c, 8),
+ make_tuple(&aom_lpf_vertical_6_neon, &aom_lpf_vertical_6_c, 8),
+ make_tuple(&aom_lpf_horizontal_14_neon, &aom_lpf_horizontal_14_c, 8),
make_tuple(&aom_lpf_horizontal_8_neon, &aom_lpf_horizontal_8_c, 8),
make_tuple(&aom_lpf_horizontal_6_neon, &aom_lpf_horizontal_6_c, 8)
};