Add SSE4_2 version of the CRC hash function

1. Add SSE4_2 detection to rtcd.
2. Add av1_get_crc_value_sse4_2 and the unit test AV1CrcHashTest.
3. av1_get_crc_value_sse4_2 computes a crc32, which is longer than the CRC produced by the C version. The hash values of the SSE4_2 and C versions are therefore not the same, but the encoder output should still be bitwise identical.
4. The speed test in AV1CrcHashTest shows that the SSE4_2 version is 10x ~ 50x faster than the C version:
   hash 64x64 :1906883.00/75701.00ns(25.19)
   hash 32x32 :922948.00/38389.00ns(24.04)
   hash 8x8 :234861.00/4615.00ns(50.89)
   hash 4x4 :107561.00/9238.00ns(11.64)
5. The encoder is about 2% faster when encoding 20 frames of foreman_cif.y4m.

Change-Id: I1d3272cdb94733ac55a0f9affbb1faac3fdc78d1
diff --git a/CMakeLists.txt b/CMakeLists.txt index 0adbec6..60ccfec 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt
@@ -43,6 +43,7 @@ option(ENABLE_SSE3 "Enables SSE3 optimizations on x86/x86_64 targets." ON) option(ENABLE_SSSE3 "Enables SSSE3 optimizations on x86/x86_64 targets." ON) option(ENABLE_SSE4_1 "Enables SSE4_1 optimizations on x86/x86_64 targets." ON) +option(ENABLE_SSE4_2 "Enables SSE4_2 optimizations on x86/x86_64 targets." ON) option(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." ON) option(ENABLE_AVX2 "Enables AVX2 optimizations on x86/x86_64 targets." ON)
diff --git a/aom_ports/x86.h b/aom_ports/x86.h index e5680ca..1e3e37d 100644 --- a/aom_ports/x86.h +++ b/aom_ports/x86.h
@@ -162,6 +162,7 @@ #define HAS_SSE4_1 0x20 #define HAS_AVX 0x40 #define HAS_AVX2 0x80 +#define HAS_SSE4_2 0x100 #ifndef BIT #define BIT(n) (1 << n) #endif @@ -202,6 +203,8 @@ if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1; + if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2; + // bits 27 (OSXSAVE) & 28 (256-bit AVX) if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) { if ((xgetbv() & 0x6) == 0x6) {
diff --git a/av1/av1.cmake b/av1/av1.cmake index eb15d58..88d6e07 100644 --- a/av1/av1.cmake +++ b/av1/av1.cmake
@@ -332,6 +332,10 @@ ${AOM_AV1_ENCODER_INTRIN_SSE4_1} "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c") +set(AOM_AV1_ENCODER_INTRIN_SSE4_2 + ${AOM_AV1_ENCODER_INTRIN_SSE4_2} + "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c") + if (CONFIG_INSPECTION) set(AOM_AV1_DECODER_SOURCES ${AOM_AV1_DECODER_SOURCES} @@ -525,6 +529,16 @@ endif () endif () + if (HAVE_SSE4_2) + require_compiler_flag_nomsvc("-msse4.2" NO) + if (CONFIG_AV1_ENCODER) + if (AOM_AV1_ENCODER_INTRIN_SSE4_2) + add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder" + "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom") + endif () + endif () + endif () + if (HAVE_AVX2) require_compiler_flag_nomsvc("-mavx2" NO) add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl index b4960d8..43742ac 100755 --- a/av1/common/av1_rtcd_defs.pl +++ b/av1/common/av1_rtcd_defs.pl
@@ -441,6 +441,10 @@ add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N"; specialize qw/av1_wedge_compute_delta_squares sse2/; + # hash + add_proto qw/uint32_t av1_get_crc_value/, "void *crc_calculator, uint8_t *p, int length"; + specialize qw/av1_get_crc_value sse4_2/; + } # end encoder functions
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c index 89c5bd8..4f0bbcb 100644 --- a/av1/encoder/hash.c +++ b/av1/encoder/hash.c
@@ -22,7 +22,7 @@ } } -void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { +static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) { p_crc_calculator->remainder = 0; } @@ -61,8 +61,8 @@ crc_calculator_init_table(p_crc_calculator); } -uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, - int length) { +uint32_t av1_get_crc_value_c(void *crc_calculator, uint8_t *p, int length) { + CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator; crc_calculator_reset(p_crc_calculator); crc_calculator_process_data(p_crc_calculator, p, length); return crc_calculator_get_crc(p_crc_calculator);
diff --git a/av1/encoder/hash.h b/av1/encoder/hash.h index a0fd54f..2b77bf9 100644 --- a/av1/encoder/hash.h +++ b/av1/encoder/hash.h
@@ -32,9 +32,6 @@ void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits, uint32_t truncPoly); -uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p, - int length); - #ifdef __cplusplus } // extern "C" #endif
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c new file mode 100644 index 0000000..014d889 --- /dev/null +++ b/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <stdint.h> +#include <smmintrin.h> + +// Byte-boundary alignment issues +#define ALIGN_SIZE 8 +#define ALIGN_MASK (ALIGN_SIZE - 1) + +#define CALC_CRC(op, crc, type, buf, len) \ + while ((len) >= sizeof(type)) { \ + (crc) = op((crc), *(type *)(buf)); \ + (len) -= sizeof(type); \ + buf += sizeof(type); \ + } + +/** + * Calculates 32-bit CRC for the input buffer + * polynomial is 0x11EDC6F41 + * @return A 32-bit unsigned integer representing the CRC + */ +uint32_t av1_get_crc_value_sse4_2(void *crc_calculator, uint8_t *p, + size_t len) { + (void)crc_calculator; + const uint8_t *buf = p; + uint32_t crc = 0xFFFFFFFF; + + // Align the input to the word boundary + for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) { + crc = _mm_crc32_u8(crc, *buf); + } + +#ifdef __x86_64__ + uint64_t crc64 = crc; + CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len); + crc = (uint32_t)crc64; +#endif + CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len); + CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len); + CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len); + return (crc ^= 0xFFFFFFFF); +}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake index 8b695bd..e80693b 100644 --- a/build/cmake/aom_config_defaults.cmake +++ b/build/cmake/aom_config_defaults.cmake
@@ -39,6 +39,7 @@ set(HAVE_SSE2 0 CACHE NUMBER "Enables SSE2 optimizations.") set(HAVE_SSE3 0 CACHE NUMBER "Enables SSE3 optimizations.") set(HAVE_SSE4_1 0 CACHE NUMBER "Enables SSE 4.1 optimizations.") +set(HAVE_SSE4_2 0 CACHE NUMBER "Enables SSE 4.2 optimizations.") set(HAVE_SSSE3 0 CACHE NUMBER "Enables SSSE3 optimizations.") # Flags describing the build environment.
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake index 4225c3d..82f5e06 100644 --- a/build/cmake/cpu.cmake +++ b/build/cmake/cpu.cmake
@@ -76,7 +76,7 @@ set(RTCD_ARCH_X86_64 "yes") endif () - set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;AVX;AVX2") + set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2") foreach (flavor ${X86_FLAVORS}) if (ENABLE_${flavor} AND NOT disable_remaining_flavors) set(HAVE_${flavor} 1)
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 5c6106c..7834999 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl
@@ -369,10 +369,10 @@ &require("c"); if ($opts{arch} eq 'x86') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/); x86; } elsif ($opts{arch} eq 'x86_64') { - @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/); + @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/); @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/); &require(@REQUIRES); x86;
diff --git a/test/hash_test.cc b/test/hash_test.cc new file mode 100644 index 0000000..e9a7a8b --- /dev/null +++ b/test/hash_test.cc
@@ -0,0 +1,191 @@ +/* + * Copyright (c) 2018, Alliance for Open Media. All rights reserved + * + * This source code is subject to the terms of the BSD 2 Clause License and + * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License + * was not distributed with this source code in the LICENSE file, you can + * obtain it at www.aomedia.org/license/software. If the Alliance for Open + * Media Patent License 1.0 was not distributed with this source code in the + * PATENTS file, you can obtain it at www.aomedia.org/license/patent. + */ + +#include <cstdlib> +#include <new> + +#include "./aom_config.h" +#include "./av1_rtcd.h" +#include "aom_ports/aom_timer.h" +#include "av1/encoder/hash.h" +#include "test/acm_random.h" +#include "test/util.h" +#include "third_party/googletest/src/googletest/include/gtest/gtest.h" + +namespace AV1Hash { + +//////////////////////////////////////// +// C version reference code from +// https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software?answertab=active#tab-top +//////////////////////////////////////// + +/* CRC-32C (iSCSI) polynomial in reversed bit order. */ +#define POLY 0x82f63b78 + +/* Table for a quadword-at-a-time software crc. */ +static uint32_t crc32c_table[8][256]; + +/* Construct table for software CRC-32C calculation. */ +static void crc32c_init_sw(void) { + uint32_t n, crc, k; + + for (n = 0; n < 256; n++) { + crc = n; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1; + crc = crc & 1 ? 
(crc >> 1) ^ POLY : crc >> 1; + crc32c_table[0][n] = crc; + } + for (n = 0; n < 256; n++) { + crc = crc32c_table[0][n]; + for (k = 1; k < 8; k++) { + crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8); + crc32c_table[k][n] = crc; + } + } +} + +/* Table-driven software version as a fall-back. This is about 15 times slower + than using the hardware instructions. This assumes little-endian integers, + as is the case on Intel processors that the assembler code here is for. */ +uint32_t crc32c_sw(const void *buf, size_t len, uint32_t crci) { + const unsigned char *next = (const unsigned char *)buf; + uint64_t crc; + + crc = crci ^ 0xffffffff; + while (len && ((uintptr_t)next & 7) != 0) { + crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + while (len >= 8) { + crc ^= *(uint64_t *)next; + crc = crc32c_table[7][crc & 0xff] ^ crc32c_table[6][(crc >> 8) & 0xff] ^ + crc32c_table[5][(crc >> 16) & 0xff] ^ + crc32c_table[4][(crc >> 24) & 0xff] ^ + crc32c_table[3][(crc >> 32) & 0xff] ^ + crc32c_table[2][(crc >> 40) & 0xff] ^ + crc32c_table[1][(crc >> 48) & 0xff] ^ crc32c_table[0][crc >> 56]; + next += 8; + len -= 8; + } + while (len) { + crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8); + len--; + } + return (uint32_t)crc ^ 0xffffffff; +} + +static uint32_t get_crc32c_value_ref(void *calculator, uint8_t *p, int length) { + (void)calculator; + return crc32c_sw(p, length, 0); +} + +typedef uint32_t (*get_crc_value_func)(void *calculator, uint8_t *p, + int length); + +typedef std::tr1::tuple<get_crc_value_func, get_crc_value_func, int> HashParam; + +class AV1CrcHashTest : public ::testing::TestWithParam<HashParam> { + public: + ~AV1CrcHashTest(); + void SetUp(); + + void TearDown(); + + protected: + void RunCheckOutput(get_crc_value_func test_impl, + get_crc_value_func ref_impl); + void RunSpeedTest(get_crc_value_func test_impl); + libaom_test::ACMRandom rnd_; + CRC_CALCULATOR calc_; + uint8_t *buffer_; + int bsize_; + int length_; +}; + 
+AV1CrcHashTest::~AV1CrcHashTest() { ; } + +void AV1CrcHashTest::SetUp() { + rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed()); + av1_crc_calculator_init(&calc_, 24, 0x5D6DCB); + crc32c_init_sw(); + bsize_ = GET_PARAM(2); + length_ = bsize_ * bsize_ * sizeof(uint16_t); + buffer_ = (uint8_t *)malloc(length_); + for (int i = 0; i < length_; ++i) { + buffer_[i] = rnd_.Rand8(); + } +} + +void AV1CrcHashTest::TearDown() { free(buffer_); } + +void AV1CrcHashTest::RunCheckOutput(get_crc_value_func test_impl, + get_crc_value_func ref_impl) { + // for the same buffer crc should be the same + uint32_t crc0 = test_impl(&calc_, buffer_, length_); + uint32_t crc1 = test_impl(&calc_, buffer_, length_); + uint32_t crc2 = ref_impl(&calc_, buffer_, length_); + ASSERT_EQ(crc0, crc1); + ASSERT_EQ(crc0, crc2); // should equal to software version + // modify buffer + buffer_[0] += 1; + uint32_t crc3 = test_impl(&calc_, buffer_, length_); + uint32_t crc4 = ref_impl(&calc_, buffer_, length_); + ASSERT_NE(crc0, crc3); // crc shoud not equal to previours one + ASSERT_EQ(crc3, crc4); +} + +void AV1CrcHashTest::RunSpeedTest(get_crc_value_func test_impl) { + get_crc_value_func impls[] = { av1_get_crc_value_c, test_impl }; + const int repeat = 10000000 / (bsize_ + bsize_); + + aom_usec_timer timer; + double time[2] = { 0 }; + for (int i = 0; i < 2; ++i) { + aom_usec_timer_start(&timer); + for (int j = 0; j < repeat; ++j) { + impls[i](&calc_, buffer_, length_); + } + aom_usec_timer_mark(&timer); + time[i] = static_cast<double>(aom_usec_timer_elapsed(&timer)); + } + printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, time[0], time[1]); + printf("(%3.2f)\n", time[0] / time[1]); +} + +TEST_P(AV1CrcHashTest, CheckOutput) { + RunCheckOutput(GET_PARAM(0), GET_PARAM(1)); +} + +TEST_P(AV1CrcHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); } + +const int kValidBlockSize[] = { 64, 32, 8, 4 }; + +INSTANTIATE_TEST_CASE_P( + C, AV1CrcHashTest, + 
::testing::Combine(::testing::Values(&av1_get_crc_value_c), + ::testing::Values(&av1_get_crc_value_c), + ::testing::ValuesIn(kValidBlockSize))); + +#if HAVE_SSE4_2 +INSTANTIATE_TEST_CASE_P( + SSE4_2, AV1CrcHashTest, + ::testing::Combine(::testing::Values(&av1_get_crc_value_sse4_2), + ::testing::Values(&get_crc32c_value_ref), + ::testing::ValuesIn(kValidBlockSize))); +#endif + +} // namespace AV1Hash
diff --git a/test/test.cmake b/test/test.cmake index 708e420..8a4bccc 100644 --- a/test/test.cmake +++ b/test/test.cmake
@@ -263,6 +263,12 @@ "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc") endif () + if (HAVE_SSE4_2) + set(AOM_UNIT_TEST_ENCODER_SOURCES + ${AOM_UNIT_TEST_ENCODER_SOURCES} + "${AOM_ROOT}/test/hash_test.cc") + endif () + set(AOM_UNIT_TEST_ENCODER_SOURCES ${AOM_UNIT_TEST_ENCODER_SOURCES} "${AOM_ROOT}/test/av1_fht16x32_test.cc"