Add SSE4_2 version of crc hash function
1. Add SSE4_2 detection to rtcd
2. Add av1_get_crc_value_sse4_2 and unittest AV1CrcHashTest
3. av1_get_crc_value_sse4_2 computes a 32-bit CRC, which is longer than the
CRC produced by the C version. The hash values of the SSE4_2 and C versions
therefore differ, but the encoder output should still be bitwise identical.
4. The speed test in AV1CrcHashTest shows that the SSE4_2 version is 10x to
50x faster than the C version.
hash 64x64 :1906883.00/75701.00ns(25.19)
hash 32x32 :922948.00/38389.00ns(24.04)
hash 8x8 :234861.00/4615.00ns(50.89)
hash 4x4 :107561.00/9238.00ns(11.64)
5. The encoder is about 2% faster when encoding 20 frames of foreman_cif.y4m.
Change-Id: I1d3272cdb94733ac55a0f9affbb1faac3fdc78d1
diff --git a/av1/av1.cmake b/av1/av1.cmake
index eb15d58..88d6e07 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -332,6 +332,10 @@
${AOM_AV1_ENCODER_INTRIN_SSE4_1}
"${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c")
+set(AOM_AV1_ENCODER_INTRIN_SSE4_2
+ ${AOM_AV1_ENCODER_INTRIN_SSE4_2}
+ "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
if (CONFIG_INSPECTION)
set(AOM_AV1_DECODER_SOURCES
${AOM_AV1_DECODER_SOURCES}
@@ -525,6 +529,16 @@
endif ()
endif ()
+ if (HAVE_SSE4_2)
+ require_compiler_flag_nomsvc("-msse4.2" NO)
+ if (CONFIG_AV1_ENCODER)
+ if (AOM_AV1_ENCODER_INTRIN_SSE4_2)
+ add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+ "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+ endif ()
+ endif ()
+ endif ()
+
if (HAVE_AVX2)
require_compiler_flag_nomsvc("-mavx2" NO)
add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b4960d8..43742ac 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -441,6 +441,10 @@
add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
specialize qw/av1_wedge_compute_delta_squares sse2/;
+ # CRC hash. Keep the C and sse4_2 definitions in sync with this argument list.
+ add_proto qw/uint32_t av1_get_crc_value/, "void *crc_calculator, uint8_t *p, int length";
+ specialize qw/av1_get_crc_value sse4_2/;
+
}
# end encoder functions
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c
index 89c5bd8..4f0bbcb 100644
--- a/av1/encoder/hash.c
+++ b/av1/encoder/hash.c
@@ -22,7 +22,7 @@
}
}
-void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
p_crc_calculator->remainder = 0;
}
@@ -61,8 +61,8 @@
crc_calculator_init_table(p_crc_calculator);
}
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
- int length) {
+uint32_t av1_get_crc_value_c(void *crc_calculator, uint8_t *p, int length) {
+ CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
crc_calculator_reset(p_crc_calculator);
crc_calculator_process_data(p_crc_calculator, p, length);
return crc_calculator_get_crc(p_crc_calculator);
diff --git a/av1/encoder/hash.h b/av1/encoder/hash.h
index a0fd54f..2b77bf9 100644
--- a/av1/encoder/hash.h
+++ b/av1/encoder/hash.h
@@ -32,9 +32,6 @@
void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
uint32_t truncPoly);
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
- int length);
-
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000..014d889
--- /dev/null
+++ b/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+/* Address granularity the input is brought to, one byte at a time, before
+ * loads wider than a byte are issued; 8 is sizeof(uint64_t), the widest type
+ * loaded. ALIGN_MASK selects the low address bits that must be zero. */
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+/*
+ * CALC_CRC: while at least sizeof(type) bytes remain, loads one `type` from
+ * buf, assigns op(crc, value) back to crc, subtracts sizeof(type) from len
+ * and adds sizeof(type) to buf. Leftover bytes (fewer than sizeof(type)) are
+ * not consumed.
+ *   op   - callable invoked as op(crc, value)
+ *   crc  - running CRC, updated in place
+ *   type - integer type read from buf on every step
+ *   buf  - read position, updated in place; it is dereferenced through a
+ *          (type *) cast, which also discards any const qualifier
+ *   len  - bytes remaining, updated in place
+ * crc, buf and len are evaluated repeatedly and written to, so pass plain
+ * variables. The expansion is a bare `while` statement, not an expression.
+ * NOTE(review): buf is the only argument not parenthesized in the body, and
+ * the load assumes buf is suitably aligned for `type` - confirm at call sites.
+ */
+#define CALC_CRC(op, crc, type, buf, len) \
+ while ((len) >= sizeof(type)) { \
+ (crc) = op((crc), *(type *)(buf)); \
+ (len) -= sizeof(type); \
+ buf += sizeof(type); \
+ }
+
+/**
+ * Computes the CRC-32C of a byte buffer with the SSE4.2 crc32 instructions.
+ *
+ * The polynomial is 0x11EDC6F41. The CRC register starts at 0xFFFFFFFF and
+ * the accumulated value is bit-inverted before it is returned.
+ *
+ * @param crc_calculator Ignored; kept so the argument list matches the
+ *                       av1_get_crc_value prototype.
+ * @param p              Start of the data to hash. The data is only read.
+ * @param length         Number of bytes to hash; nothing is hashed when it
+ *                       is zero or negative.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                  int length) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  // The av1_get_crc_value prototype passes the length as an int, so the
+  // parameter has to be an int as well. Convert it once to an unsigned byte
+  // count because CALC_CRC compares it against sizeof().
+  size_t len = (length > 0) ? (size_t)length : 0;
+  uint32_t crc = 0xFFFFFFFF;
+
+  // Hash single bytes until buf is ALIGN_SIZE-aligned, so that the wider
+  // loads below never straddle an alignment boundary.
+  for (; (len > 0) && ((uintptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+
+#ifdef __x86_64__
+  // _mm_crc32_u64 takes and returns a 64-bit accumulator, so widen crc for
+  // this stage and narrow it again afterwards.
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  // Finish the tail with progressively narrower loads.
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return crc ^ 0xFFFFFFFF;
+}