Add SSE4_2 version of crc hash function
1. Add SSE4_2 detection to rtcd
2. Add av1_get_crc_value_sse4_2 and unittest AV1CrcHashTest
3. av1_get_crc_value_sse4_2 computes a 32-bit CRC (CRC-32C), which is wider
than the CRC produced by the C version. The hash values of the SSE4_2 and C
versions therefore differ, but the encoder output should be bitwise identical.
4. The speed test in AV1CrcHashTest shows SSE4_2 version is 10x ~ 50x
faster than C version.
hash 64x64 :1906883.00/75701.00us(25.19)
hash 32x32 :922948.00/38389.00us(24.04)
hash 8x8 :234861.00/4615.00us(50.89)
hash 4x4 :107561.00/9238.00us(11.64)
5. For the encoder, encoding 20 frames of foreman_cif.y4m is about 2% faster.
Change-Id: I1d3272cdb94733ac55a0f9affbb1faac3fdc78d1
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0adbec6..60ccfec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@
option(ENABLE_SSE3 "Enables SSE3 optimizations on x86/x86_64 targets." ON)
option(ENABLE_SSSE3 "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
option(ENABLE_SSE4_1 "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE4_2 "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
option(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." ON)
option(ENABLE_AVX2 "Enables AVX2 optimizations on x86/x86_64 targets." ON)
diff --git a/aom_ports/x86.h b/aom_ports/x86.h
index e5680ca..1e3e37d 100644
--- a/aom_ports/x86.h
+++ b/aom_ports/x86.h
@@ -162,6 +162,7 @@
#define HAS_SSE4_1 0x20
#define HAS_AVX 0x40
#define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
#ifndef BIT
#define BIT(n) (1 << n)
#endif
@@ -202,6 +203,8 @@
if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
+ if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
// bits 27 (OSXSAVE) & 28 (256-bit AVX)
if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
if ((xgetbv() & 0x6) == 0x6) {
diff --git a/av1/av1.cmake b/av1/av1.cmake
index eb15d58..88d6e07 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -332,6 +332,10 @@
${AOM_AV1_ENCODER_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c")
+set(AOM_AV1_ENCODER_INTRIN_SSE4_2
+ ${AOM_AV1_ENCODER_INTRIN_SSE4_2}
+ "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
if (CONFIG_INSPECTION)
set(AOM_AV1_DECODER_SOURCES
${AOM_AV1_DECODER_SOURCES}
@@ -525,6 +529,16 @@
endif ()
endif ()
+ if (HAVE_SSE4_2)
+ require_compiler_flag_nomsvc("-msse4.2" NO)
+ if (CONFIG_AV1_ENCODER)
+ if (AOM_AV1_ENCODER_INTRIN_SSE4_2)
+ add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+ endif ()
+ endif ()
+ endif ()
+
if (HAVE_AVX2)
require_compiler_flag_nomsvc("-mavx2" NO)
add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b4960d8..43742ac 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -441,6 +441,10 @@
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
specialize qw/av1_wedge_compute_delta_squares sse2/;
+ # hash
+ add_proto qw/uint32_t av1_get_crc_value/, "void *crc_calculator, uint8_t *p, int length";
+ specialize qw/av1_get_crc_value sse4_2/;
+
}
# end encoder functions
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c
index 89c5bd8..4f0bbcb 100644
--- a/av1/encoder/hash.c
+++ b/av1/encoder/hash.c
@@ -22,7 +22,7 @@
}
}
-void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
p_crc_calculator->remainder = 0;
}
@@ -61,8 +61,8 @@
crc_calculator_init_table(p_crc_calculator);
}
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
- int length) {
+uint32_t av1_get_crc_value_c(void *crc_calculator, uint8_t *p, int length) {
+ CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
crc_calculator_reset(p_crc_calculator);
crc_calculator_process_data(p_crc_calculator, p, length);
return crc_calculator_get_crc(p_crc_calculator);
diff --git a/av1/encoder/hash.h b/av1/encoder/hash.h
index a0fd54f..2b77bf9 100644
--- a/av1/encoder/hash.h
+++ b/av1/encoder/hash.h
@@ -32,9 +32,6 @@
void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
uint32_t truncPoly);
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
- int length);
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000..014d889
--- /dev/null
+++ b/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// The input is advanced to an 8-byte boundary before the word-sized CRC loops.
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+
+#define CALC_CRC(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+/**
+ * Computes the CRC-32C (polynomial 0x11EDC6F41) of p[0..length) using the
+ * SSE4.2 crc32 instructions. crc_calculator is unused. length is an int, as in
+ * the av1_get_crc_value prototype; non-positive values hash an empty buffer.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                  int length) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  size_t len = length > 0 ? (size_t)length : 0;
+  uint32_t crc = 0xFFFFFFFF;
+  // Consume leading bytes one at a time until buf is 8-byte aligned.
+  for (; (len > 0) && ((intptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return crc ^ 0xFFFFFFFF;
+}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 8b695bd..e80693b 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -39,6 +39,7 @@
set(HAVE_SSE2 0 CACHE NUMBER "Enables SSE2 optimizations.")
set(HAVE_SSE3 0 CACHE NUMBER "Enables SSE3 optimizations.")
set(HAVE_SSE4_1 0 CACHE NUMBER "Enables SSE 4.1 optimizations.")
+set(HAVE_SSE4_2 0 CACHE NUMBER "Enables SSE 4.2 optimizations.")
set(HAVE_SSSE3 0 CACHE NUMBER "Enables SSSE3 optimizations.")
# Flags describing the build environment.
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake
index 4225c3d..82f5e06 100644
--- a/build/cmake/cpu.cmake
+++ b/build/cmake/cpu.cmake
@@ -76,7 +76,7 @@
set(RTCD_ARCH_X86_64 "yes")
endif ()
- set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;AVX;AVX2")
+ set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2")
foreach (flavor ${X86_FLAVORS})
if (ENABLE_${flavor} AND NOT disable_remaining_flavors)
set(HAVE_${flavor} 1)
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 5c6106c..7834999 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -369,10 +369,10 @@
&require("c");
if ($opts{arch} eq 'x86') {
- @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
x86;
} elsif ($opts{arch} eq 'x86_64') {
- @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+ @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
@REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
&require(@REQUIRES);
x86;
diff --git a/test/hash_test.cc b/test/hash_test.cc
new file mode 100644
index 0000000..e9a7a8b
--- /dev/null
+++ b/test/hash_test.cc
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/hash.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1Hash {
+
+////////////////////////////////////////
+// C version reference code from
+// https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software?answertab=active#tab-top
+////////////////////////////////////////
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Table for a quadword-at-a-time software crc. */
+static uint32_t crc32c_table[8][256];
+
+/* Builds crc32c_table: row 0 is byte-wise; rows 1-7 serve the 8-byte loop. */
+static void crc32c_init_sw(void) {
+ uint32_t n, crc, k;
+
+ for (n = 0; n < 256; n++) {
+ crc = n; // eight unrolled shift/xor steps follow, one per bit of n
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
+ crc32c_table[0][n] = crc;
+ }
+ for (n = 0; n < 256; n++) {
+ crc = crc32c_table[0][n];
+ for (k = 1; k < 8; k++) {
+ crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8); // row k derives from row k-1
+ crc32c_table[k][n] = crc;
+ }
+ }
+}
+
+/* Table-driven software CRC-32C, used here as the reference for the SSE4.2
+ version. crci is the starting CRC value (the caller in this file passes 0).
+ Assumes little-endian integers: the 8-byte loop loads a uint64_t directly. */
+uint32_t crc32c_sw(const void *buf, size_t len, uint32_t crci) {
+ const unsigned char *next = (const unsigned char *)buf;
+ uint64_t crc;
+
+ crc = crci ^ 0xffffffff; // invert on entry; inverted again on return
+ while (len && ((uintptr_t)next & 7) != 0) { // bytes before 8-byte alignment
+ crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ while (len >= 8) { // aligned body, eight bytes per iteration
+ crc ^= *(uint64_t *)next;
+ crc = crc32c_table[7][crc & 0xff] ^ crc32c_table[6][(crc >> 8) & 0xff] ^
+ crc32c_table[5][(crc >> 16) & 0xff] ^
+ crc32c_table[4][(crc >> 24) & 0xff] ^
+ crc32c_table[3][(crc >> 32) & 0xff] ^
+ crc32c_table[2][(crc >> 40) & 0xff] ^
+ crc32c_table[1][(crc >> 48) & 0xff] ^ crc32c_table[0][crc >> 56];
+ next += 8;
+ len -= 8;
+ }
+ while (len) { // remaining tail bytes, one at a time
+ crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
+ len--;
+ }
+ return (uint32_t)crc ^ 0xffffffff;
+}
+
+static uint32_t get_crc32c_value_ref(void *calculator, uint8_t *p, int length) {
+ (void)calculator; // unused by the software CRC-32C reference
+ return crc32c_sw(p, length, 0); // start from an initial CRC of 0
+}
+
+typedef uint32_t (*get_crc_value_func)(void *calculator, uint8_t *p,
+ int length);
+
+typedef std::tr1::tuple<get_crc_value_func, get_crc_value_func, int> HashParam;
+
+class AV1CrcHashTest : public ::testing::TestWithParam<HashParam> {
+ public:
+ ~AV1CrcHashTest();
+ void SetUp(); // allocates and randomizes buffer_
+
+ void TearDown(); // frees buffer_
+
+ protected:
+ void RunCheckOutput(get_crc_value_func test_impl,
+ get_crc_value_func ref_impl); // checks test_impl against ref_impl
+ void RunSpeedTest(get_crc_value_func test_impl); // times it against the C code
+ libaom_test::ACMRandom rnd_; // source of the random buffer contents
+ CRC_CALCULATOR calc_; // passed to every CRC call; initialised in SetUp()
+ uint8_t *buffer_; // malloc'ed input data, length_ bytes
+ int bsize_; // block edge length, taken from test parameter 2
+ int length_; // bsize_ * bsize_ * sizeof(uint16_t) bytes
+};
+
+AV1CrcHashTest::~AV1CrcHashTest() { ; }
+
+// Per-test setup: seed RNG, init CRC tables, allocate and randomize buffer_.
+void AV1CrcHashTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_crc_calculator_init(&calc_, 24, 0x5D6DCB);
+  crc32c_init_sw();
+  bsize_ = GET_PARAM(2);
+  length_ = static_cast<int>(bsize_ * bsize_ * sizeof(uint16_t));
+  buffer_ = static_cast<uint8_t *>(malloc(length_));
+  ASSERT_TRUE(buffer_ != NULL) << "malloc failed";  // never write through NULL
+  for (int i = 0; i < length_; ++i) buffer_[i] = rnd_.Rand8();
+}
+
+void AV1CrcHashTest::TearDown() { free(buffer_); } // buffer_ comes from malloc
+
+void AV1CrcHashTest::RunCheckOutput(get_crc_value_func test_impl,
+ get_crc_value_func ref_impl) {
+ // Hashing the same buffer twice must give the same CRC.
+ uint32_t crc0 = test_impl(&calc_, buffer_, length_);
+ uint32_t crc1 = test_impl(&calc_, buffer_, length_);
+ uint32_t crc2 = ref_impl(&calc_, buffer_, length_);
+ ASSERT_EQ(crc0, crc1);
+ ASSERT_EQ(crc0, crc2); // must equal the reference implementation
+ // Change one byte of the buffer and hash it again.
+ buffer_[0] += 1;
+ uint32_t crc3 = test_impl(&calc_, buffer_, length_);
+ uint32_t crc4 = ref_impl(&calc_, buffer_, length_);
+ ASSERT_NE(crc0, crc3); // CRC should differ from the previous one
+ ASSERT_EQ(crc3, crc4);
+}
+
+void AV1CrcHashTest::RunSpeedTest(get_crc_value_func test_impl) {
+ get_crc_value_func impls[] = { av1_get_crc_value_c, test_impl }; // [0] = C
+ const int repeat = 10000000 / (bsize_ + bsize_); // fewer runs for big blocks
+
+ aom_usec_timer timer;
+ double time[2] = { 0 }; // elapsed microseconds per entry of impls[]
+ for (int i = 0; i < 2; ++i) {
+ aom_usec_timer_start(&timer);
+ for (int j = 0; j < repeat; ++j) {
+ impls[i](&calc_, buffer_, length_); // result is discarded, only timed
+ }
+ aom_usec_timer_mark(&timer);
+ time[i] = static_cast<double>(aom_usec_timer_elapsed(&timer));
+ }
+ printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, time[0], time[1]);
+ printf("(%3.2f)\n", time[0] / time[1]); // ratio above 1: test_impl is faster
+}
+
+TEST_P(AV1CrcHashTest, CheckOutput) {
+ RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+
+TEST_P(AV1CrcHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+const int kValidBlockSize[] = { 64, 32, 8, 4 };
+
+INSTANTIATE_TEST_CASE_P(
+ C, AV1CrcHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc_value_c),
+ ::testing::Values(&av1_get_crc_value_c),
+ ::testing::ValuesIn(kValidBlockSize)));
+
+#if HAVE_SSE4_2
+INSTANTIATE_TEST_CASE_P(
+ SSE4_2, AV1CrcHashTest,
+ ::testing::Combine(::testing::Values(&av1_get_crc_value_sse4_2),
+ ::testing::Values(&get_crc32c_value_ref),
+ ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+} // namespace AV1Hash
diff --git a/test/test.cmake b/test/test.cmake
index 708e420..8a4bccc 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -263,6 +263,12 @@
"${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc")
endif ()
+ if (HAVE_SSE4_2)
+ set(AOM_UNIT_TEST_ENCODER_SOURCES
+ ${AOM_UNIT_TEST_ENCODER_SOURCES}
+ "${AOM_ROOT}/test/hash_test.cc")
+ endif ()
+
set(AOM_UNIT_TEST_ENCODER_SOURCES
${AOM_UNIT_TEST_ENCODER_SOURCES}
"${AOM_ROOT}/test/av1_fht16x32_test.cc"