Add v1_txb_init_levels_avx2 1.optimize v1_txb_init_levels_sse4_1 2.Add v1_txb_init_levels_avx2 speed up about 0.3% without rd change test sequence: BasketballDrill_832x480_50.y4m test command line:./aomenc --cpu-used=1 --psnr -D \ -q --end-usage=vbr --target-bitrate=800 --limit=20 \ BasketballDrill_832x480_50.y4m -otest.webm Change-Id: Icb6db0c199c953bca6c8575446053e1565541c0a
diff --git a/av1/av1.cmake b/av1/av1.cmake index 5da5f0f..7220a40 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -280,6 +280,7 @@ "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm_avx2.h" "${AOM_ROOT}/av1/encoder/x86/av1_fwd_txfm2d_avx2.c" "${AOM_ROOT}/av1/encoder/x86/wedge_utils_avx2.c" + "${AOM_ROOT}/av1/encoder/x86/encodetxb_avx2.c" "${AOM_ROOT}/av1/encoder/x86/pickrst_avx2.c") list(APPEND AOM_AV1_ENCODER_INTRIN_NEON
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index 675a7cd..988a172 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -252,7 +252,7 @@ add_proto qw/void av1_get_nz_map_contexts/, "const uint8_t *const levels, const int16_t *const scan, const uint16_t eob, const TX_SIZE tx_size, const TX_CLASS tx_class, int8_t *const coeff_contexts"; specialize qw/av1_get_nz_map_contexts sse2/; add_proto qw/void av1_txb_init_levels/, "const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels"; - specialize qw/av1_txb_init_levels sse4_1/; + specialize qw/av1_txb_init_levels sse4_1 avx2/; add_proto qw/uint64_t av1_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N"; specialize qw/av1_wedge_sse_from_residuals sse2 avx2/;
diff --git a/av1/encoder/x86/encodetxb_avx2.c b/av1/encoder/x86/encodetxb_avx2.c new file mode 100644 index 0000000..7642f57 --- /dev/null +++ b/av1/encoder/x86/encodetxb_avx2.c
@@ -0,0 +1,130 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <assert.h> +#include <emmintrin.h> // SSE2 +#include <smmintrin.h> /* SSE4.1 */ +#include <immintrin.h> /* AVX2 */ + +#include "aom/aom_integer.h" +#include "aom_dsp/x86/mem_sse2.h" +#include "av1/common/onyxc_int.h" +#include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" +#include "aom_dsp/x86/synonyms_avx2.h" + +void av1_txb_init_levels_avx2(const tran_low_t *const coeff, const int width, + const int height, uint8_t *const levels) { + const int stride = width + TX_PAD_HOR; + const __m256i y_zeros = _mm256_setzero_si256(); + + const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; + uint8_t *pre_buf = levels - TX_PAD_TOP * stride; + uint8_t *pre_buf_end = pre_buf + pre_len; + do { + yy_storeu_256(pre_buf, y_zeros); + pre_buf += 32; + } while (pre_buf < pre_buf_end); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf_end = levels + (height + TX_PAD_BOTTOM) * stride; + uint8_t *bottom_buf = bottom_buf_end - ((bottom_len + 31) & (~31)); + + do { + yy_storeu_256(bottom_buf, y_zeros); + bottom_buf += 32; + } while (bottom_buf < bottom_buf_end); + + int i = 0; + uint8_t *ls = levels; + const tran_low_t *cf = coeff; + if (width == 4) { + do { + const __m256i c0 = yy_loadu_256(cf); + const __m256i c1 = yy_loadu_256(cf + 8); + const __m256i abs01 = _mm256_abs_epi16(_mm256_packs_epi32(c0, c1)); + const __m256i abs01_8 = _mm256_packs_epi16(abs01, y_zeros); + const __m256i res_ = _mm256_shuffle_epi32(abs01_8, 0xd8); + const __m256i res = _mm256_permute4x64_epi64(res_, 0xd8); + yy_storeu_256(ls, res); + ls += 32; + cf += 16; + i += 4; + } while (i < height); + } else if (width == 8) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + const __m128i res0 = _mm256_castsi256_si128(res); + const __m128i res1 = _mm256_extracti128_si256(res, 1); + xx_storel_64(ls, res0); + *(int32_t *)(ls + width) = 0; + xx_storel_64(ls + stride, _mm_srli_si128(res0, 8)); + *(int32_t *)(ls + width + stride) = 0; + xx_storel_64(ls + stride * 2, res1); + *(int32_t *)(ls + width + stride * 2) = 0; + xx_storel_64(ls + stride * 3, _mm_srli_si128(res1, 8)); + *(int32_t *)(ls + width + stride * 3) = 0; + cf += 32; + ls += stride << 2; + i += 4; + } while (i < height); + } else if (width == 16) { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + xx_storeu_128(ls, _mm256_castsi256_si128(res)); + xx_storeu_128(ls + stride, _mm256_extracti128_si256(res, 1)); + cf += 32; + *(int32_t *)(ls + width) = 0; + *(int32_t *)(ls + stride + width) = 0; + ls += stride << 1; + i += 2; + } while (i < height); + } else { + do { + const __m256i coeffA = yy_loadu_256(cf); + const __m256i coeffB = yy_loadu_256(cf + 8); + const __m256i coeffC = yy_loadu_256(cf + 16); + const __m256i coeffD = yy_loadu_256(cf + 24); + const __m256i coeffAB = _mm256_packs_epi32(coeffA, coeffB); + const __m256i coeffCD = _mm256_packs_epi32(coeffC, coeffD); + const __m256i absAB = _mm256_abs_epi16(coeffAB); + const __m256i absCD = _mm256_abs_epi16(coeffCD); + const __m256i absABCD = _mm256_packs_epi16(absAB, absCD); + const __m256i res_ = _mm256_permute4x64_epi64(absABCD, 0xd8); + const __m256i res = _mm256_shuffle_epi32(res_, 0xd8); + yy_storeu_256(ls, res); + cf += 32; + *(int32_t *)(ls + width) = 0; + ls += stride; + i += 1; + } while (i < height); + } +}
diff --git a/av1/encoder/x86/encodetxb_sse4.c b/av1/encoder/x86/encodetxb_sse4.c index b3a879b..5e0687c 100644 --- a/av1/encoder/x86/encodetxb_sse4.c +++ b/av1/encoder/x86/encodetxb_sse4.c
@@ -14,43 +14,55 @@ #include <smmintrin.h> /* SSE4.1 */ #include "aom/aom_integer.h" -#include "aom_dsp/x86/mem_sse2.h" #include "av1/common/onyxc_int.h" #include "av1/common/txb_common.h" +#include "aom_dsp/x86/synonyms.h" void av1_txb_init_levels_sse4_1(const tran_low_t *const coeff, const int width, const int height, uint8_t *const levels) { const int stride = width + TX_PAD_HOR; - memset(levels - TX_PAD_TOP * stride, 0, - sizeof(*levels) * TX_PAD_TOP * stride); - memset(levels + stride * height, 0, - sizeof(*levels) * (TX_PAD_BOTTOM * stride + TX_PAD_END)); - const __m128i zeros = _mm_setzero_si128(); + + const int32_t pre_len = sizeof(*levels) * TX_PAD_TOP * stride; + uint8_t *pre_buf = levels - TX_PAD_TOP * stride; + uint8_t *pre_buf_end = pre_buf + pre_len; + do { + _mm_storeu_si128((__m128i *)(pre_buf), zeros); + pre_buf += 16; + } while (pre_buf < pre_buf_end); + + const int32_t bottom_len = sizeof(*levels) * (TX_PAD_BOTTOM * stride); + uint8_t *bottom_buf = levels + stride * height; + uint8_t *bottom_buf_end = bottom_buf + bottom_len; + do { + _mm_storeu_si128((__m128i *)(bottom_buf), zeros); + bottom_buf += 16; + } while (bottom_buf < bottom_buf_end); + int i = 0; uint8_t *ls = levels; const tran_low_t *cf = coeff; if (width == 4) { do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + width)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); const __m128i lsAB = _mm_unpacklo_epi32(absAB8, zeros); - _mm_storeu_si128((__m128i *)ls, lsAB); + xx_storeu_128(ls, lsAB); ls += (stride << 1); cf += (width << 1); i += 2; } while (i < height); } else if (width == 8) { do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absAB8 = _mm_packs_epi16(absAB, zeros); - _mm_storeu_si128((__m128i *)ls, absAB8); + xx_storeu_128(ls, absAB8); ls += stride; cf += width; i += 1; @@ -59,16 +71,16 @@ do { int j = 0; do { - const __m128i coeffA = _mm_load_si128((__m128i *)(cf)); - const __m128i coeffB = _mm_load_si128((__m128i *)(cf + 4)); - const __m128i coeffC = _mm_load_si128((__m128i *)(cf + 8)); - const __m128i coeffD = _mm_load_si128((__m128i *)(cf + 12)); + const __m128i coeffA = xx_loadu_128(cf); + const __m128i coeffB = xx_loadu_128(cf + 4); + const __m128i coeffC = xx_loadu_128(cf + 8); + const __m128i coeffD = xx_loadu_128(cf + 12); const __m128i coeffAB = _mm_packs_epi32(coeffA, coeffB); const __m128i coeffCD = _mm_packs_epi32(coeffC, coeffD); const __m128i absAB = _mm_abs_epi16(coeffAB); const __m128i absCD = _mm_abs_epi16(coeffCD); const __m128i absABCD = _mm_packs_epi16(absAB, absCD); - _mm_storeu_si128((__m128i *)(ls + j), absABCD); + xx_storeu_128(ls + j, absABCD); j += 16; cf += 16; } while (j < width);
diff --git a/test/encodetxb_test.cc b/test/encodetxb_test.cc index ab6ec72..11cc073 100644 --- a/test/encodetxb_test.cc +++ b/test/encodetxb_test.cc
@@ -181,15 +181,22 @@ ::testing::Values(av1_get_nz_map_contexts_sse2)); #endif -#if HAVE_SSE4_1 -class EncodeTxbInitLevelTest : public ::testing::TestWithParam<int> { +typedef void (*av1_txb_init_levels_func)(const tran_low_t *const coeff, + const int width, const int height, + uint8_t *const levels); + +typedef ::testing::tuple<av1_txb_init_levels_func, int> TxbInitLevelParam; + +class EncodeTxbInitLevelTest + : public ::testing::TestWithParam<TxbInitLevelParam> { public: virtual ~EncodeTxbInitLevelTest() {} virtual void TearDown() { libaom_test::ClearSystemState(); } - void RunTest(int tx_size, int is_speed); + void RunTest(av1_txb_init_levels_func test_func, int tx_size, int is_speed); }; -void EncodeTxbInitLevelTest::RunTest(int tx_size, int is_speed) { +void EncodeTxbInitLevelTest::RunTest(av1_txb_init_levels_func test_func, + int tx_size, int is_speed) { const int width = get_txb_wide((TX_SIZE)tx_size); const int height = get_txb_high((TX_SIZE)tx_size); tran_low_t coeff[MAX_TX_SQUARE]; @@ -215,7 +222,7 @@ const double t1 = get_time_mark(&timer); aom_usec_timer_start(&timer); for (int i = 0; i < run_times; ++i) { - av1_txb_init_levels_sse4_1(coeff, width, height, levels1); + test_func(coeff, width, height, levels1); } const double t2 = get_time_mark(&timer); if (is_speed) { @@ -232,11 +239,24 @@ } } -TEST_P(EncodeTxbInitLevelTest, match) { RunTest(GetParam(), 0); } -TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) { RunTest(GetParam(), 1); } +TEST_P(EncodeTxbInitLevelTest, match) { + RunTest(GET_PARAM(0), GET_PARAM(1), 0); +} -INSTANTIATE_TEST_CASE_P(SSE4_1, EncodeTxbInitLevelTest, - ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1)); +TEST_P(EncodeTxbInitLevelTest, DISABLED_Speed) { + RunTest(GET_PARAM(0), GET_PARAM(1), 1); +} + +#if HAVE_SSE4_1 +INSTANTIATE_TEST_CASE_P( + SSE4_1, EncodeTxbInitLevelTest, + ::testing::Combine(::testing::Values(&av1_txb_init_levels_sse4_1), + ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1))); #endif - +#if HAVE_AVX2 +INSTANTIATE_TEST_CASE_P( + AVX2, EncodeTxbInitLevelTest, + ::testing::Combine(::testing::Values(&av1_txb_init_levels_avx2), + ::testing::Range(0, static_cast<int>(TX_SIZES_ALL), 1))); +#endif } // namespace