VP8 encoder for ARMv8 by using NEON intrinsics 1

Add vp8_mse16x16_neon.c
- vp8_mse16x16_neon
- vp8_get4x4sse_cs_neon

Change-Id: I108952f60a9ae50613f0ce3903c2c81df19d99d0
Signed-off-by: James Yu <james.yu@linaro.org>
diff --git a/test/variance_test.cc b/test/variance_test.cc
index f76402e..a438d17 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -214,6 +214,99 @@
   EXPECT_EQ(expected, var);
 }
 
+#if CONFIG_VP8_ENCODER
+template<typename MseFunctionType>
+class MseTest
+    : public ::testing::TestWithParam<tuple<int, int, MseFunctionType> > {
+ public:
+  virtual void SetUp() {
+    const tuple<int, int, MseFunctionType>& params = this->GetParam();
+    log2width_ = get<0>(params);
+    width_ = 1 << log2width_;
+    log2height_ = get<1>(params);
+    height_ = 1 << log2height_;
+    mse_ = get<2>(params);
+
+    rnd.Reset(ACMRandom::DeterministicSeed());
+    block_size_ = width_ * height_;
+    src_ = reinterpret_cast<uint8_t *>(vpx_memalign(16, block_size_));
+    ref_ = new uint8_t[block_size_];
+    ASSERT_TRUE(src_ != NULL);
+    ASSERT_TRUE(ref_ != NULL);
+  }
+
+  virtual void TearDown() {
+    vpx_free(src_);
+    delete[] ref_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RefTest_mse();
+  void RefTest_sse();
+  void MaxTest_mse();
+  void MaxTest_sse();
+
+  ACMRandom rnd;
+  uint8_t* src_;
+  uint8_t* ref_;
+  int width_, log2width_;
+  int height_, log2height_;
+  int block_size_;
+  MseFunctionType mse_;
+};
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::RefTest_mse() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size_; j++) {
+      src_[j] = rnd.Rand8();
+      ref_[j] = rnd.Rand8();
+    }
+    unsigned int sse1, sse2;
+    ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse1));
+    variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+    EXPECT_EQ(sse1, sse2);
+  }
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::RefTest_sse() {
+  for (int i = 0; i < 10; ++i) {
+    for (int j = 0; j < block_size_; j++) {
+      src_[j] = rnd.Rand8();
+      ref_[j] = rnd.Rand8();
+    }
+    unsigned int sse2;
+    unsigned int var1;
+    ASM_REGISTER_STATE_CHECK(
+        var1 = mse_(src_, width_, ref_, width_));
+    variance_ref(src_, ref_, log2width_, log2height_, &sse2);
+    EXPECT_EQ(var1, sse2);
+  }
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::MaxTest_mse() {
+  memset(src_, 255, block_size_);
+  memset(ref_, 0, block_size_);
+  unsigned int sse;
+  ASM_REGISTER_STATE_CHECK(mse_(src_, width_, ref_, width_, &sse));
+  const unsigned int expected = block_size_ * 255 * 255;
+  EXPECT_EQ(expected, sse);
+}
+
+template<typename MseFunctionType>
+void MseTest<MseFunctionType>::MaxTest_sse() {
+  memset(src_, 255, block_size_);
+  memset(ref_, 0, block_size_);
+  unsigned int var;
+  ASM_REGISTER_STATE_CHECK(var = mse_(src_, width_, ref_, width_));
+  const unsigned int expected = block_size_ * 255 * 255;
+  EXPECT_EQ(expected, var);
+}
+#endif
+
 #if CONFIG_VP9_ENCODER
 
 unsigned int subpel_avg_variance_ref(const uint8_t *ref,
@@ -343,12 +436,31 @@
 namespace vp8 {
 
 #if CONFIG_VP8_ENCODER
+typedef unsigned int (*vp8_sse_fn_t)(const unsigned char *src_ptr,
+    int source_stride, const unsigned char *ref_ptr, int ref_stride);
+
+typedef MseTest<vp8_sse_fn_t> VP8SseTest;
+typedef MseTest<vp8_variance_fn_t> VP8MseTest;
 typedef VarianceTest<vp8_variance_fn_t> VP8VarianceTest;
 
+TEST_P(VP8SseTest, Ref_sse) { RefTest_sse(); }
+TEST_P(VP8SseTest, Max_sse) { MaxTest_sse(); }
+TEST_P(VP8MseTest, Ref_mse) { RefTest_mse(); }
+TEST_P(VP8MseTest, Max_mse) { MaxTest_mse(); }
 TEST_P(VP8VarianceTest, Zero) { ZeroTest(); }
 TEST_P(VP8VarianceTest, Ref) { RefTest(); }
 TEST_P(VP8VarianceTest, OneQuarter) { OneQuarterTest(); }
 
+const vp8_sse_fn_t get4x4sse_cs_c = vp8_get4x4sse_cs_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP8SseTest,
+    ::testing::Values(make_tuple(2, 2, get4x4sse_cs_c)));
+
+const vp8_variance_fn_t mse16x16_c = vp8_mse16x16_c;
+INSTANTIATE_TEST_CASE_P(
+    C, VP8MseTest,
+    ::testing::Values(make_tuple(4, 4, mse16x16_c)));
+
 const vp8_variance_fn_t variance4x4_c = vp8_variance4x4_c;
 const vp8_variance_fn_t variance8x8_c = vp8_variance8x8_c;
 const vp8_variance_fn_t variance8x16_c = vp8_variance8x16_c;
@@ -363,6 +475,16 @@
     make_tuple(4, 4, variance16x16_c)));
 
 #if HAVE_NEON
+const vp8_sse_fn_t get4x4sse_cs_neon = vp8_get4x4sse_cs_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP8SseTest,
+    ::testing::Values(make_tuple(2, 2, get4x4sse_cs_neon)));
+
+const vp8_variance_fn_t mse16x16_neon = vp8_mse16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+    NEON, VP8MseTest,
+    ::testing::Values(make_tuple(4, 4, mse16x16_neon)));
+
 const vp8_variance_fn_t variance8x8_neon = vp8_variance8x8_neon;
 const vp8_variance_fn_t variance8x16_neon = vp8_variance8x16_neon;
 const vp8_variance_fn_t variance16x8_neon = vp8_variance16x8_neon;
@@ -375,6 +497,7 @@
     make_tuple(4, 4, variance16x16_neon)));
 #endif
 
+
 #if HAVE_MMX
 const vp8_variance_fn_t variance4x4_mmx = vp8_variance4x4_mmx;
 const vp8_variance_fn_t variance8x8_mmx = vp8_variance8x8_mmx;
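For context on the bound the `MaxTest_*` cases rely on: with every source byte at 255 and every reference byte at 0, each pixel contributes 255 * 255 to the accumulator, and even the largest block under test (16x16) stays well inside `unsigned int` range, so the tests can compare against `block_size_ * 255 * 255` directly. A minimal standalone check of that arithmetic (illustrative, not part of the patch):

```c
#include <stdio.h>

int main(void) {
  const unsigned int block_size = 16 * 16;               /* largest block tested */
  const unsigned int expected = block_size * 255 * 255;  /* 256 * 65025 */
  printf("%u\n", expected);                              /* 16646400, well below UINT_MAX */
  return 0;
}
```

The 4x4 `VP8SseTest` case peaks at 16 * 65025 = 1040400, so overflow is not a concern there either.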
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 0070c28..c73ecf9 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -404,14 +404,12 @@
 $vp8_sub_pixel_mse16x16_sse2=vp8_sub_pixel_mse16x16_wmt;
 
 add_proto qw/unsigned int vp8_mse16x16/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp8_mse16x16 mmx sse2 media neon_asm/;
+specialize qw/vp8_mse16x16 mmx sse2 media neon/;
 $vp8_mse16x16_sse2=vp8_mse16x16_wmt;
 $vp8_mse16x16_media=vp8_mse16x16_armv6;
-$vp8_mse16x16_neon_asm=vp8_mse16x16_neon;
 
 add_proto qw/unsigned int vp8_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
-specialize qw/vp8_get4x4sse_cs mmx neon_asm/;
-$vp8_get4x4sse_cs_neon_asm=vp8_get4x4sse_cs_neon;
+specialize qw/vp8_get4x4sse_cs mmx neon/;
 
 #
 # Block copy
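With the `neon_asm` lines gone, rtcd.pl can resolve the `neon` specialization to the `vp8_mse16x16_neon` and `vp8_get4x4sse_cs_neon` symbols directly, with no explicit alias assignments. For readers unfamiliar with the RTCD machinery, the generated `vp8_rtcd.h` dispatch has roughly this shape (a paraphrased sketch, not the literal generated header):

```c
/* Paraphrased sketch of what rtcd.pl generates from the lines above. */
unsigned int vp8_mse16x16_c(const unsigned char *src_ptr, int source_stride,
                            const unsigned char *ref_ptr, int ref_stride,
                            unsigned int *sse);
unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr, int source_stride,
                               const unsigned char *ref_ptr, int ref_stride,
                               unsigned int *sse);
RTCD_EXTERN unsigned int (*vp8_mse16x16)(const unsigned char *src_ptr,
                                         int source_stride,
                                         const unsigned char *ref_ptr,
                                         int ref_stride, unsigned int *sse);

static void setup_rtcd_internal(void) {
  int flags = arm_cpu_caps();     /* run-time capability probe */
  vp8_mse16x16 = vp8_mse16x16_c;  /* portable fallback */
  if (flags & HAS_NEON) vp8_mse16x16 = vp8_mse16x16_neon;
}
```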
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm b/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
deleted file mode 100644
index f82af3e..0000000
--- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm
+++ /dev/null
@@ -1,123 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS. All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-    EXPORT  |vp8_mse16x16_neon|
-    EXPORT  |vp8_get4x4sse_cs_neon|
-
-    ARM
-    REQUIRE8
-    PRESERVE8
-
-    AREA ||.text||, CODE, READONLY, ALIGN=2
-;============================
-; r0    unsigned char *src_ptr
-; r1    int source_stride
-; r2    unsigned char *ref_ptr
-; r3    int recon_stride
-; stack unsigned int *sse
-;note: in this function, sum is never used. So, we can remove this part of calculation
-;from vp8_variance().
-
-|vp8_mse16x16_neon| PROC
-    vpush           {q7}
-
-    vmov.i8         q7, #0                      ;q7, q8, q9, q10 - sse
-    vmov.i8         q8, #0
-    vmov.i8         q9, #0
-    vmov.i8         q10, #0
-
-    mov             r12, #8
-
-mse16x16_neon_loop
-    vld1.8          {q0}, [r0], r1              ;Load up source and reference
-    vld1.8          {q2}, [r2], r3
-    vld1.8          {q1}, [r0], r1
-    vld1.8          {q3}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmlal.s16       q7, d22, d22
-    vmlal.s16       q8, d23, d23
-
-    subs            r12, r12, #1
-
-    vmlal.s16       q9, d24, d24
-    vmlal.s16       q10, d25, d25
-    vmlal.s16       q7, d26, d26
-    vmlal.s16       q8, d27, d27
-    vmlal.s16       q9, d28, d28
-    vmlal.s16       q10, d29, d29
-
-    bne             mse16x16_neon_loop
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-
-    ldr             r12, [sp, #16]              ;load *sse from stack
-
-    vadd.u32        q10, q7, q9
-    vpaddl.u32      q1, q10
-    vadd.u64        d0, d2, d3
-
-    vst1.32         {d0[0]}, [r12]
-    vmov.32         r0, d0[0]
-
-    vpop            {q7}
-    bx              lr
-
-    ENDP
-
-
-;=============================
-; r0    unsigned char *src_ptr,
-; r1    int source_stride,
-; r2    unsigned char *ref_ptr,
-; r3    int recon_stride
-|vp8_get4x4sse_cs_neon| PROC
-    vpush           {q7}
-
-    vld1.8          {d0}, [r0], r1              ;Load up source and reference
-    vld1.8          {d4}, [r2], r3
-    vld1.8          {d1}, [r0], r1
-    vld1.8          {d5}, [r2], r3
-    vld1.8          {d2}, [r0], r1
-    vld1.8          {d6}, [r2], r3
-    vld1.8          {d3}, [r0], r1
-    vld1.8          {d7}, [r2], r3
-
-    vsubl.u8        q11, d0, d4
-    vsubl.u8        q12, d1, d5
-    vsubl.u8        q13, d2, d6
-    vsubl.u8        q14, d3, d7
-
-    vmull.s16       q7, d22, d22
-    vmull.s16       q8, d24, d24
-    vmull.s16       q9, d26, d26
-    vmull.s16       q10, d28, d28
-
-    vadd.u32        q7, q7, q8
-    vadd.u32        q9, q9, q10
-    vadd.u32        q9, q7, q9
-
-    vpaddl.u32      q1, q9
-    vadd.u64        d0, d2, d3
-
-    vmov.32         r0, d0[0]
-
-    vpop            {q7}
-    bx              lr
-
-    ENDP
-
-    END
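The C file added below is a near line-for-line port of the assembly just deleted. To make the mapping concrete, here is one accumulation step of the 16x16 loop in isolation, pairing the old mnemonics with their intrinsics (an illustrative helper with an assumed name; the patch inlines this logic rather than factoring it out):

```c
#include <arm_neon.h>

/* Widen the byte difference of the low 8 lanes (vsubl.u8), then
 * square-and-accumulate into a 32-bit accumulator (vmlal.s16). */
static int32x4_t sse_accumulate_low8(int32x4_t acc, uint8x16_t src,
                                     uint8x16_t ref) {
  const uint16x8_t diff = vsubl_u8(vget_low_u8(src), vget_low_u8(ref));
  /* Differences lie in [-255, 255], so the u16 bit pattern reinterpreted
   * as s16 is the correct signed value, and d * d is exact. */
  const int16x4_t d_lo = vreinterpret_s16_u16(vget_low_u16(diff));
  return vmlal_s16(acc, d_lo, d_lo);
}
```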
diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
new file mode 100644
index 0000000..06e4f94
--- /dev/null
+++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c
@@ -0,0 +1,131 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS. All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+unsigned int vp8_mse16x16_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride,
+        unsigned int *sse) {
+    int i;
+    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
+    int64x1_t d0s64;
+    uint8x16_t q0u8, q1u8, q2u8, q3u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+
+    q7s32 = vdupq_n_s32(0);
+    q8s32 = vdupq_n_s32(0);
+    q9s32 = vdupq_n_s32(0);
+    q10s32 = vdupq_n_s32(0);
+
+    for (i = 0; i < 8; i++) {  // mse16x16_neon_loop
+        q0u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q1u8 = vld1q_u8(src_ptr);
+        src_ptr += source_stride;
+        q2u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+        q3u8 = vld1q_u8(ref_ptr);
+        ref_ptr += recon_stride;
+
+        q11u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q2u8));
+        q12u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q2u8));
+        q13u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q3u8));
+        q14u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q3u8));
+
+        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
+        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
+        q7s32 = vmlal_s16(q7s32, d22s16, d22s16);
+        q8s32 = vmlal_s16(q8s32, d23s16, d23s16);
+
+        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
+        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
+        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
+        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);
+
+        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
+        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
+        q7s32 = vmlal_s16(q7s32, d26s16, d26s16);
+        q8s32 = vmlal_s16(q8s32, d27s16, d27s16);
+
+        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
+        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
+        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
+        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
+    }
+
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q10s32 = vaddq_s32(q7s32, q9s32);
+
+    q1s64 = vpaddlq_s32(q10s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d0s64), 0);
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
+
+unsigned int vp8_get4x4sse_cs_neon(
+        const unsigned char *src_ptr,
+        int source_stride,
+        const unsigned char *ref_ptr,
+        int recon_stride) {
+    int16x4_t d22s16, d24s16, d26s16, d28s16;
+    int64x1_t d0s64;
+    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
+    int32x4_t q7s32, q8s32, q9s32, q10s32;
+    uint16x8_t q11u16, q12u16, q13u16, q14u16;
+    int64x2_t q1s64;
+
+    d0u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d4u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d1u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d5u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d2u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d6u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+    d3u8 = vld1_u8(src_ptr);
+    src_ptr += source_stride;
+    d7u8 = vld1_u8(ref_ptr);
+    ref_ptr += recon_stride;
+
+    q11u16 = vsubl_u8(d0u8, d4u8);
+    q12u16 = vsubl_u8(d1u8, d5u8);
+    q13u16 = vsubl_u8(d2u8, d6u8);
+    q14u16 = vsubl_u8(d3u8, d7u8);
+
+    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
+    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
+    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
+    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));
+
+    q7s32 = vmull_s16(d22s16, d22s16);
+    q8s32 = vmull_s16(d24s16, d24s16);
+    q9s32 = vmull_s16(d26s16, d26s16);
+    q10s32 = vmull_s16(d28s16, d28s16);
+
+    q7s32 = vaddq_s32(q7s32, q8s32);
+    q9s32 = vaddq_s32(q9s32, q10s32);
+    q9s32 = vaddq_s32(q7s32, q9s32);
+
+    q1s64 = vpaddlq_s32(q9s32);
+    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));
+
+    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
+}
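As the deleted assembly's comment noted, the sum term of the full variance is never needed here, which is why vp8_mse16x16_neon both stores the sum of squared differences through *sse and returns the same value. A scalar model of that contract, equivalent in spirit to vp8_mse16x16_c (hypothetical helper name, for illustration only):

```c
/* Hypothetical scalar reference: plain sum of squared differences over a
 * 16x16 block, with no mean/sum term. */
static unsigned int mse16x16_ref(const unsigned char *src_ptr,
                                 int source_stride,
                                 const unsigned char *ref_ptr,
                                 int recon_stride, unsigned int *sse) {
  unsigned int total = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src_ptr[c] - ref_ptr[c];
      total += (unsigned int)(d * d);
    }
    src_ptr += source_stride;
    ref_ptr += recon_stride;
  }
  *sse = total;  /* stored... */
  return total;  /* ...and returned, matching the NEON routine */
}
```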
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 551271e..ed19fd4 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -36,9 +36,9 @@
 
 #File list for neon
 # encoder
 VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/fastquantizeb_neon$(ASM)
-VP8_CX_SRCS-$(HAVE_NEON_ASM) += encoder/arm/neon/vp8_mse16x16_neon$(ASM)
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c
-VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
 VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/subtract_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_mse16x16_neon.c
+VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c