Add SSE4_2 version of crc hash function

1. Add SSE4_2 detection to rtcd
2. Add av1_get_crc_value_sse4_2 and unittest AV1CrcHashTest
3. av1_get_crc_value_sse4_2 computes a 32-bit CRC (CRC-32C), which is wider
than the CRC produced by the C version. The hash values of the SSE4_2 and C
versions therefore differ, but the encoder output should be bitwise identical.
4. The speed test in AV1CrcHashTest shows that the SSE4_2 version is 10x ~ 50x
faster than the C version (C time / SSE4_2 time, speedup in parentheses):
hash  64x64 :1906883.00/75701.00ns(25.19)
hash  32x32 :922948.00/38389.00ns(24.04)
hash   8x8  :234861.00/4615.00ns(50.89)
hash   4x4  :107561.00/9238.00ns(11.64)
5. The encoder is about 2% faster when encoding 20 frames of foreman_cif.y4m.

Change-Id: I1d3272cdb94733ac55a0f9affbb1faac3fdc78d1
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0adbec6..60ccfec 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -43,6 +43,7 @@
 option(ENABLE_SSE3 "Enables SSE3 optimizations on x86/x86_64 targets." ON)
 option(ENABLE_SSSE3 "Enables SSSE3 optimizations on x86/x86_64 targets." ON)
 option(ENABLE_SSE4_1 "Enables SSE4_1 optimizations on x86/x86_64 targets." ON)
+option(ENABLE_SSE4_2 "Enables SSE4_2 optimizations on x86/x86_64 targets." ON)
 option(ENABLE_AVX "Enables AVX optimizations on x86/x86_64 targets." ON)
 option(ENABLE_AVX2 "Enables AVX2 optimizations on x86/x86_64 targets." ON)
 
diff --git a/aom_ports/x86.h b/aom_ports/x86.h
index e5680ca..1e3e37d 100644
--- a/aom_ports/x86.h
+++ b/aom_ports/x86.h
@@ -162,6 +162,7 @@
 #define HAS_SSE4_1 0x20
 #define HAS_AVX 0x40
 #define HAS_AVX2 0x80
+#define HAS_SSE4_2 0x100
 #ifndef BIT
 #define BIT(n) (1 << n)
 #endif
@@ -202,6 +203,8 @@
 
   if (reg_ecx & BIT(19)) flags |= HAS_SSE4_1;
 
+  if (reg_ecx & BIT(20)) flags |= HAS_SSE4_2;
+
   // bits 27 (OSXSAVE) & 28 (256-bit AVX)
   if ((reg_ecx & (BIT(27) | BIT(28))) == (BIT(27) | BIT(28))) {
     if ((xgetbv() & 0x6) == 0x6) {
diff --git a/av1/av1.cmake b/av1/av1.cmake
index eb15d58..88d6e07 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -332,6 +332,10 @@
     ${AOM_AV1_ENCODER_INTRIN_SSE4_1}
     "${AOM_ROOT}/av1/encoder/x86/corner_match_sse4.c")
 
+set(AOM_AV1_ENCODER_INTRIN_SSE4_2
+    ${AOM_AV1_ENCODER_INTRIN_SSE4_2}
+    "${AOM_ROOT}/av1/encoder/x86/hash_sse42.c")
+
 if (CONFIG_INSPECTION)
   set(AOM_AV1_DECODER_SOURCES
       ${AOM_AV1_DECODER_SOURCES}
@@ -525,6 +529,16 @@
     endif ()
   endif ()
 
+  if (HAVE_SSE4_2)
+    require_compiler_flag_nomsvc("-msse4.2" NO)
+    if (CONFIG_AV1_ENCODER)
+      if (AOM_AV1_ENCODER_INTRIN_SSE4_2)
+        add_intrinsics_object_library("-msse4.2" "sse42" "aom_av1_encoder"
+                                      "AOM_AV1_ENCODER_INTRIN_SSE4_2" "aom")
+      endif ()
+    endif ()
+  endif ()
+
   if (HAVE_AVX2)
     require_compiler_flag_nomsvc("-mavx2" NO)
     add_intrinsics_object_library("-mavx2" "avx2" "aom_av1_common"
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index b4960d8..43742ac 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -441,6 +441,10 @@
   add_proto qw/void av1_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
   specialize qw/av1_wedge_compute_delta_squares sse2/;
 
+  # hash
+  add_proto qw/uint32_t av1_get_crc_value/, "void *crc_calculator, uint8_t *p, int length";
+  specialize qw/av1_get_crc_value sse4_2/;
+
 }
 # end encoder functions
 
diff --git a/av1/encoder/hash.c b/av1/encoder/hash.c
index 89c5bd8..4f0bbcb 100644
--- a/av1/encoder/hash.c
+++ b/av1/encoder/hash.c
@@ -22,7 +22,7 @@
   }
 }
 
-void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
+static void crc_calculator_reset(CRC_CALCULATOR *p_crc_calculator) {
   p_crc_calculator->remainder = 0;
 }
 
@@ -61,8 +61,8 @@
   crc_calculator_init_table(p_crc_calculator);
 }
 
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
-                           int length) {
+uint32_t av1_get_crc_value_c(void *crc_calculator, uint8_t *p, int length) {
+  CRC_CALCULATOR *p_crc_calculator = (CRC_CALCULATOR *)crc_calculator;
   crc_calculator_reset(p_crc_calculator);
   crc_calculator_process_data(p_crc_calculator, p, length);
   return crc_calculator_get_crc(p_crc_calculator);
diff --git a/av1/encoder/hash.h b/av1/encoder/hash.h
index a0fd54f..2b77bf9 100644
--- a/av1/encoder/hash.h
+++ b/av1/encoder/hash.h
@@ -32,9 +32,6 @@
 void av1_crc_calculator_init(CRC_CALCULATOR *p_crc_calculator, uint32_t bits,
                              uint32_t truncPoly);
 
-uint32_t av1_get_crc_value(CRC_CALCULATOR *p_crc_calculator, uint8_t *p,
-                           int length);
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/av1/encoder/x86/hash_sse42.c b/av1/encoder/x86/hash_sse42.c
new file mode 100644
index 0000000..014d889
--- /dev/null
+++ b/av1/encoder/x86/hash_sse42.c
@@ -0,0 +1,51 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <smmintrin.h>
+
+// Byte alignment the input pointer is advanced to before multi-byte loads.
+#define ALIGN_SIZE 8
+#define ALIGN_MASK (ALIGN_SIZE - 1)
+// While >= sizeof(type) bytes remain: crc = op(crc, next type), advance buf.
+#define CALC_CRC(op, crc, type, buf, len) \
+  while ((len) >= sizeof(type)) {         \
+    (crc) = op((crc), *(type *)(buf));    \
+    (len) -= sizeof(type);                \
+    buf += sizeof(type);                  \
+  }
+
+/**
+ * Computes the CRC-32C (polynomial 0x11EDC6F41) of |length| bytes at |p| with
+ * the SSE4.2 crc32 instructions. |crc_calculator| is unused. |length| is an
+ * int to match the rtcd prototype of av1_get_crc_value; <= 0 hashes no bytes.
+ * @return A 32-bit unsigned integer representing the CRC
+ */
+uint32_t av1_get_crc_value_sse4_2(void *crc_calculator, uint8_t *p,
+                                  int length) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  size_t len = length > 0 ? (size_t)length : 0;
+  uint32_t crc = 0xFFFFFFFF;
+  // Consume single bytes until buf is ALIGN_SIZE-aligned (or input runs out).
+  for (; (len > 0) && ((uintptr_t)buf & ALIGN_MASK); len--, buf++) {
+    crc = _mm_crc32_u8(crc, *buf);
+  }
+#ifdef __x86_64__  // _mm_crc32_u64 is only available on 64-bit targets
+  uint64_t crc64 = crc;
+  CALC_CRC(_mm_crc32_u64, crc64, uint64_t, buf, len);
+  crc = (uint32_t)crc64;
+#endif
+  CALC_CRC(_mm_crc32_u32, crc, uint32_t, buf, len);
+  CALC_CRC(_mm_crc32_u16, crc, uint16_t, buf, len);
+  CALC_CRC(_mm_crc32_u8, crc, uint8_t, buf, len);
+  return crc ^ 0xFFFFFFFF;  // final bit inversion
+}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 8b695bd..e80693b 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -39,6 +39,7 @@
 set(HAVE_SSE2 0 CACHE NUMBER "Enables SSE2 optimizations.")
 set(HAVE_SSE3 0 CACHE NUMBER "Enables SSE3 optimizations.")
 set(HAVE_SSE4_1 0 CACHE NUMBER "Enables SSE 4.1 optimizations.")
+set(HAVE_SSE4_2 0 CACHE NUMBER "Enables SSE 4.2 optimizations.")
 set(HAVE_SSSE3 0 CACHE NUMBER "Enables SSSE3 optimizations.")
 
 # Flags describing the build environment.
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake
index 4225c3d..82f5e06 100644
--- a/build/cmake/cpu.cmake
+++ b/build/cmake/cpu.cmake
@@ -76,7 +76,7 @@
     set(RTCD_ARCH_X86_64 "yes")
   endif ()
 
-  set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;AVX;AVX2")
+  set(X86_FLAVORS "MMX;SSE;SSE2;SSE3;SSSE3;SSE4_1;SSE4_2;AVX;AVX2")
   foreach (flavor ${X86_FLAVORS})
     if (ENABLE_${flavor} AND NOT disable_remaining_flavors)
       set(HAVE_${flavor} 1)
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl
index 5c6106c..7834999 100755
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -369,10 +369,10 @@
 
 &require("c");
 if ($opts{arch} eq 'x86') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
   x86;
 } elsif ($opts{arch} eq 'x86_64') {
-  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 avx avx2/);
+  @ALL_ARCHS = filter(qw/mmx sse sse2 sse3 ssse3 sse4_1 sse4_2 avx avx2/);
   @REQUIRES = filter(keys %required ? keys %required : qw/mmx sse sse2/);
   &require(@REQUIRES);
   x86;
diff --git a/test/hash_test.cc b/test/hash_test.cc
new file mode 100644
index 0000000..e9a7a8b
--- /dev/null
+++ b/test/hash_test.cc
@@ -0,0 +1,191 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <cstdlib>
+#include <new>
+
+#include "./aom_config.h"
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/encoder/hash.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+namespace AV1Hash {
+
+////////////////////////////////////////
+// C version reference code from
+// https://stackoverflow.com/questions/17645167/implementing-sse-4-2s-crc32c-in-software?answertab=active#tab-top
+////////////////////////////////////////
+
+/* CRC-32C (iSCSI) polynomial in reversed bit order. */
+#define POLY 0x82f63b78
+
+/* Table for a quadword-at-a-time software crc. */
+static uint32_t crc32c_table[8][256];
+
+/* Fills crc32c_table for crc32c_sw(). Row 0 is the byte-at-a-time CRC-32C
+   table; row k holds the row k-1 entry advanced by one more zero byte, which
+   lets crc32c_sw() consume eight input bytes per step. */
+static void crc32c_init_sw(void) {
+  for (uint32_t byte = 0; byte < 256; ++byte) {
+    uint32_t rem = byte;
+    /* Eight shift/xor steps with the reflected polynomial, one per bit. */
+    for (int bit = 0; bit < 8; ++bit) {
+      rem = (rem & 1) ? (rem >> 1) ^ POLY : rem >> 1;
+    }
+    crc32c_table[0][byte] = rem;
+  }
+
+  /* Derive rows 1..7 from row 0; the running value starts at the row-0 entry
+     and is stored after each further step. */
+  for (uint32_t byte = 0; byte < 256; ++byte) {
+    uint32_t rem = crc32c_table[0][byte];
+    for (uint32_t row = 1; row < 8; ++row) {
+      /* Same update as pushing a zero byte through the row-0 table. */
+      rem = crc32c_table[0][rem & 0xff] ^ (rem >> 8);
+      crc32c_table[row][byte] = rem;
+    }
+  }
+}
+
+/* Table-driven software CRC-32C of |len| bytes at |buf|, used as the reference
+   for the hardware version. |crci| is xor-ed with 0xffffffff to form the start
+   state (the tests pass 0). Assumes little-endian 64-bit loads. */
+uint32_t crc32c_sw(const void *buf, size_t len, uint32_t crci) {
+  const unsigned char *cur = (const unsigned char *)buf;
+  uint64_t state = crci ^ 0xffffffff;
+
+  /* Head: single bytes until |cur| is 8-byte aligned. */
+  for (; len && ((uintptr_t)cur & 7) != 0; --len) {
+    state = crc32c_table[0][(state ^ *cur++) & 0xff] ^ (state >> 8);
+  }
+
+  /* Body: xor in one 64-bit word, then fold byte i through row 7 - i. */
+  for (; len >= 8; len -= 8, cur += 8) {
+    state ^= *(uint64_t *)cur;
+    uint64_t folded = 0;
+    for (int i = 0; i < 8; ++i) {
+      folded ^= crc32c_table[7 - i][(state >> (8 * i)) & 0xff];
+    }
+    state = folded;
+  }
+
+  /* Tail: whatever is left, one byte at a time. */
+  for (; len; --len) {
+    state = crc32c_table[0][(state ^ *cur++) & 0xff] ^ (state >> 8);
+  }
+
+  return (uint32_t)state ^ 0xffffffff;
+}
+// Adapts crc32c_sw() to the get_crc_value_func signature, starting from 0.
+static uint32_t get_crc32c_value_ref(void *calculator, uint8_t *p, int length) {
+  static_cast<void>(calculator);
+  return crc32c_sw(p, static_cast<size_t>(length), 0u);
+}
+// Signature shared by the CRC implementations under test and the reference.
+typedef uint32_t (*get_crc_value_func)(void *calculator, uint8_t *p,
+                                       int length);
+// (implementation under test, reference implementation, block size)
+typedef std::tr1::tuple<get_crc_value_func, get_crc_value_func, int> HashParam;
+// Parameterized fixture that hashes a random buffer with a CRC implementation.
+class AV1CrcHashTest : public ::testing::TestWithParam<HashParam> {
+ public:
+  ~AV1CrcHashTest();
+  void SetUp();  // seeds rnd_, initializes calc_, allocates and fills buffer_
+
+  void TearDown();  // frees buffer_
+
+ protected:
+  void RunCheckOutput(get_crc_value_func test_impl,
+                      get_crc_value_func ref_impl);
+  void RunSpeedTest(get_crc_value_func test_impl);
+  libaom_test::ACMRandom rnd_;  // source of the random buffer contents
+  CRC_CALCULATOR calc_;         // passed as the first argument to each impl
+  uint8_t *buffer_;             // length_ random bytes, malloc'ed in SetUp()
+  int bsize_;                   // block dimension, taken from test parameter 2
+  int length_;                  // buffer size in bytes, set in SetUp()
+};
+// Nothing to release here: buffer_ is freed in TearDown().
+AV1CrcHashTest::~AV1CrcHashTest() {}
+
+void AV1CrcHashTest::SetUp() {
+  rnd_.Reset(libaom_test::ACMRandom::DeterministicSeed());
+  av1_crc_calculator_init(&calc_, 24, 0x5D6DCB);
+  crc32c_init_sw();
+  bsize_ = GET_PARAM(2);
+  // Sized for a bsize_ x bsize_ block of 16-bit samples.
+  length_ = bsize_ * bsize_ * static_cast<int>(sizeof(uint16_t));
+  buffer_ = static_cast<uint8_t *>(malloc(length_));
+  ASSERT_TRUE(buffer_ != NULL);  // fail the test instead of writing via NULL
+  for (int i = 0; i < length_; ++i) buffer_[i] = rnd_.Rand8();
+}
+// Releases the buffer allocated with malloc() in SetUp().
+void AV1CrcHashTest::TearDown() { free(buffer_); }
+// Checks that |test_impl| is repeatable and agrees with |ref_impl|.
+void AV1CrcHashTest::RunCheckOutput(get_crc_value_func test_impl,
+                                    get_crc_value_func ref_impl) {
+  // Hashing the same buffer twice must give the same CRC.
+  uint32_t crc0 = test_impl(&calc_, buffer_, length_);
+  uint32_t crc1 = test_impl(&calc_, buffer_, length_);
+  uint32_t crc2 = ref_impl(&calc_, buffer_, length_);
+  ASSERT_EQ(crc0, crc1);
+  ASSERT_EQ(crc0, crc2);  // must equal the reference implementation
+  // Modify one byte of the buffer.
+  buffer_[0] += 1;
+  uint32_t crc3 = test_impl(&calc_, buffer_, length_);
+  uint32_t crc4 = ref_impl(&calc_, buffer_, length_);
+  ASSERT_NE(crc0, crc3);  // CRC should not equal the previous one
+  ASSERT_EQ(crc3, crc4);
+}
+// Times av1_get_crc_value_c and |test_impl| on the same buffer, then prints
+// both elapsed times (as reported by aom_usec_timer) and their ratio.
+void AV1CrcHashTest::RunSpeedTest(get_crc_value_func test_impl) {
+  const get_crc_value_func candidates[2] = { av1_get_crc_value_c, test_impl };
+  const int iterations = 10000000 / (2 * bsize_);
+  double cost[2] = { 0.0, 0.0 };
+  for (int which = 0; which < 2; ++which) {
+    aom_usec_timer stopwatch;
+    aom_usec_timer_start(&stopwatch);
+    for (int n = iterations; n > 0; --n) {
+      candidates[which](&calc_, buffer_, length_);
+    }
+    aom_usec_timer_mark(&stopwatch);
+    cost[which] = static_cast<double>(aom_usec_timer_elapsed(&stopwatch));
+  }
+  printf("hash %3dx%-3d:%7.2f/%7.2fus", bsize_, bsize_, cost[0], cost[1]);
+  printf("(%3.2f)\n", cost[0] / cost[1]);
+}
+// Test parameters: (implementation under test, reference, block size).
+TEST_P(AV1CrcHashTest, CheckOutput) {
+  RunCheckOutput(GET_PARAM(0), GET_PARAM(1));
+}
+// Benchmark; named DISABLED_ so it is skipped unless explicitly requested.
+TEST_P(AV1CrcHashTest, DISABLED_Speed) { RunSpeedTest(GET_PARAM(0)); }
+
+const int kValidBlockSize[] = { 64, 32, 8, 4 };
+// The C version is compared against itself, i.e. only checked for repeatability.
+INSTANTIATE_TEST_CASE_P(
+    C, AV1CrcHashTest,
+    ::testing::Combine(::testing::Values(&av1_get_crc_value_c),
+                       ::testing::Values(&av1_get_crc_value_c),
+                       ::testing::ValuesIn(kValidBlockSize)));
+// The SSE4_2 version is compared against the software CRC-32C reference.
+#if HAVE_SSE4_2
+INSTANTIATE_TEST_CASE_P(
+    SSE4_2, AV1CrcHashTest,
+    ::testing::Combine(::testing::Values(&av1_get_crc_value_sse4_2),
+                       ::testing::Values(&get_crc32c_value_ref),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
+}  // namespace AV1Hash
diff --git a/test/test.cmake b/test/test.cmake
index 708e420..8a4bccc 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -263,6 +263,12 @@
           "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc")
     endif ()
 
+    if (HAVE_SSE4_2)
+      set(AOM_UNIT_TEST_ENCODER_SOURCES
+          ${AOM_UNIT_TEST_ENCODER_SOURCES}
+          "${AOM_ROOT}/test/hash_test.cc")
+    endif ()
+
     set(AOM_UNIT_TEST_ENCODER_SOURCES
         ${AOM_UNIT_TEST_ENCODER_SOURCES}
         "${AOM_ROOT}/test/av1_fht16x32_test.cc"