Add SSE4_2 version of crc hash function

1. Add SSE4_2 detection to rtcd
2. Add av1_get_crc_value_sse4_2 and unittest AV1CrcHashTest
3. av1_get_crc_value_sse4_2 computes a 32-bit CRC (crc32), which is wider
than the CRC produced by the C version. The hash values of the SSE4_2 and C
versions therefore differ, but the encoder output should remain bitwise
identical.
4. The speed test in AV1CrcHashTest shows the SSE4_2 version is 10x ~ 50x
faster than the C version (C time / SSE4_2 time, speedup in parentheses):
hash  64x64 :1906883.00/75701.00ns(25.19)
hash  32x32 :922948.00/38389.00ns(24.04)
hash   8x8  :234861.00/4615.00ns(50.89)
hash   4x4  :107561.00/9238.00ns(11.64)
5. The encoder is about 2% faster when encoding 20 frames of foreman_cif.y4m.

Change-Id: I1d3272cdb94733ac55a0f9affbb1faac3fdc78d1
diff --git a/av1/av1.cmake b/av1/av1.cmake
index eb15d58..88d6e07 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -332,6 +332,10 @@
     ${AOM_AV1_ENCODER_INTRIN_SSE4_1}
     "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c")
 
+set(AOM_AV1_ENCODER_INTRIN_SSE4_2
+    ${AOM_AV1_ENCODER_INTRIN_SSE4_2}
+    "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
 if (CONFIG_INSPECTION)
   set(AOM_AV1_DECODER_SOURCES
       ${AOM_AV1_DECODER_SOURCES}
@@ -525,6 +529,16 @@
     endif ()
   endif ()
 
+  if (HAVE_SSE4_2)
+    require_compiler_flag_nomsvc("-msse4.2" NO)
+    if (CONFIG_AV1_ENCODER)
+      if (AOM_AV1_ENCODER_INTRIN_SSE4_2)
+        add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+      endif ()
+    endif ()
+  endif ()
+
   if (HAVE_AVX2)
     require_compiler_flag_nomsvc("-mavx2" NO)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b4960d8..43742ac 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -441,6 +441,10 @@
   add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
   specialize qw/av1_wedge_compute_delta_squares sse2/;
 
+  # hash
+  add_proto qw/uint32_t av1_get_crc_value/, "void *crc_calculator, uint8_t *p, int length";
+  specialize qw/av1_get_crc_value sse4_2/;
+
 }
 # end encoder functions
 
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c
index 89c5bd8..4f0bbcb 100644
--- a/av1/encoder/hash.c
+++ b/av1/encoder/hash.c
@@ -22,7 +22,7 @@
   }
 }
 
-void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
   p_crc_calculator->remainder = 0;
 }
 
@@ -61,8 +61,8 @@
   crc_calculator_init_table(p_crc_calculator);
 }
 
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
-                           int length) {
+uint32_t av1_get_crc_value_c(void *crc_calculator, uint8_t *p, int length) {
+  CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
   crc_calculator_reset(p_crc_calculator);
   crc_calculator_process_data(p_crc_calculator, p, length);
   return crc_calculator_get_crc(p_crc_calculator);
diff --git a/av1/encoder/hash.h b/av1/encoder/hash.h
index a0fd54f..2b77bf9 100644
--- a/av1/encoder/hash.h
+++ b/av1/encoder/hash.h
@@ -32,9 +32,6 @@
 void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
                              uint32_t truncPoly);
 
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
-                           int length);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000..014d889
--- /dev/null
+++ b/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Alignment (in bytes) the input pointer is advanced to before wide loads.
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+// While len >= sizeof(type): crc = op(crc, next `type` word); steps buf, len.
+#define CALC_CRC(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+/**
+ * Computes a 32-bit CRC (polynomial 0x11EDC6F41) of p[0 .. length) using the
+ * SSE4.2 crc32 instructions; crc_calculator is ignored.
+ * @return The CRC (initial value 0xFFFFFFFF, final XOR 0xFFFFFFFF)
+ */
+uint32_t av1_get_crc_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                  int length) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  // int matches the av1_get_crc_value prototype; a negative length is empty.
+  size_t len = length > 0 ? (size_t)length : 0;
+  uint32_t crc = 0xFFFFFFFF;
+  // Step byte-wise until buf is ALIGN_SIZE-aligned so wide loads are aligned.
+  for (; (len > 0) && ((uintptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+#ifdef __x86_64__
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return crc ^ 0xFFFFFFFF;
+}