Use dedicated Arm hardware instructions to compute CRC32C hash

Add an implementation of the CRC32C hash function that uses the
dedicated CRC32C instructions in the Armv8-A architecture.

The Arm CRC32C instructions are optional in Armv8.0-A but mandatory
in all architecture revisions starting from Armv8.1-A. For the time
being, the presence of the CRC32C instructions is detected at compile
time (via the __ARM_FEATURE_CRC32 macro). Run-time feature detection
ought to be implemented for platforms like Android, where an
architecture baseline of Armv8.1-A cannot be assumed.

Change-Id: I30894ec77dd79c87eeafcdefc9e4b061cd5106d6
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 72dc3e7..fc99b25 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -367,6 +367,9 @@
             "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/wedge_utils_neon.c")
 
+list(APPEND AOM_AV1_ENCODER_INTRIN_CRC32
+            "${AOM_ROOT}/av1/encoder/arm/crc32/hash_crc32.c")
+
 list(APPEND AOM_AV1_ENCODER_INTRIN_MSA
             "${AOM_ROOT}/av1/encoder/mips/msa/error_msa.c"
             "${AOM_ROOT}/av1/encoder/mips/msa/fdct4x4_msa.c"
@@ -632,6 +635,16 @@
                                       "AOM_AV1_ENCODER_INTRIN_NEON")
       endif()
     endif()
+
+    if(HAVE_ARM_CRC32)
+      if(CONFIG_AV1_ENCODER)
+        if(AOM_AV1_ENCODER_INTRIN_CRC32)
+          add_intrinsics_object_library("${AOM_ARM_CRC32_FLAG}" "crc32"
+                                        "aom_av1_encoder"
+                                        "AOM_AV1_ENCODER_INTRIN_CRC32")
+        endif()
+      endif()
+    endif()
   endif()
 
   if(HAVE_VSX)
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 97912be..fb650a8 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -450,7 +450,7 @@
 
   # hash
   add_proto qw/uint32_t av1_get_crc32c_value/, "void *crc_calculator, uint8_t *p, size_t length";
-  specialize qw/av1_get_crc32c_value sse4_2/;
+  specialize qw/av1_get_crc32c_value sse4_2 arm_crc32/;
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
     add_proto qw/void av1_compute_stats/,  "int wiener_win, const uint8_t *dgd8, const uint8_t *src8, int h_start, int h_end, int v_start, int v_end, int dgd_stride, int src_stride, int64_t *M, int64_t *H, int use_downsampled_wiener_stats";
diff --git a/av1/encoder/arm/crc32/hash_crc32.c b/av1/encoder/arm/crc32/hash_crc32.c
new file mode 100644
index 0000000..dd8685d
--- /dev/null
+++ b/av1/encoder/arm/crc32/hash_crc32.c
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2022, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <stdint.h>
+#include <stddef.h>
+#include <string.h>
+#include <arm_acle.h>
+
+// Folds every whole 'type'-sized chunk of the remaining input into 'crc' using
+// the CRC32C intrinsic 'op', advancing 'buf' and decreasing 'len' as it goes.
+// Each chunk is loaded with memcpy() so that 'buf' does not have to be aligned
+// for 'type' and the const qualifier of the input is not cast away.
+#define CRC_LOOP(op, crc, type, buf, len)      \
+  do {                                         \
+    while ((len) >= sizeof(type)) {            \
+      type crc_chunk;                          \
+      memcpy(&crc_chunk, (buf), sizeof(type)); \
+      (crc) = op((crc), crc_chunk);            \
+      (len) -= sizeof(type);                   \
+      (buf) += sizeof(type);                   \
+    }                                          \
+  } while (0)
+
+// Same as CRC_LOOP, but folds at most one 'type'-sized chunk into 'crc'.
+#define CRC_SINGLE(op, crc, type, buf, len)    \
+  do {                                         \
+    if ((len) >= sizeof(type)) {               \
+      type crc_chunk;                          \
+      memcpy(&crc_chunk, (buf), sizeof(type)); \
+      (crc) = op((crc), crc_chunk);            \
+      (len) -= sizeof(type);                   \
+      (buf) += sizeof(type);                   \
+    }                                          \
+  } while (0)
+
+/* Returns the CRC32C (polynomial 0x1EDC6F41) of the 'len' bytes starting at
+ * 'p', computed with the Arm CRC32C instructions. The CRC is seeded with
+ * 0xFFFFFFFF and inverted before being returned. 'crc_calculator' is unused.
+ */
+uint32_t av1_get_crc32c_value_arm_crc32(void *crc_calculator, uint8_t *p,
+                                        size_t len) {
+  (void)crc_calculator;
+  const uint8_t *buf = p;
+  uint32_t crc = 0xFFFFFFFF;
+
+#if !defined(__aarch64__)
+  // Align input to an 8-byte boundary (only done for 32-bit builds).
+  while (len && ((uintptr_t)buf & 7)) {
+    crc = __crc32cb(crc, *buf++);
+    len--;
+  }
+#endif
+
+  // Consume the input in progressively smaller chunks: all whole 8-byte
+  // words first, then at most one 4-, 2- and 1-byte remainder each.
+  CRC_LOOP(__crc32cd, crc, uint64_t, buf, len);
+  CRC_SINGLE(__crc32cw, crc, uint32_t, buf, len);
+  CRC_SINGLE(__crc32ch, crc, uint16_t, buf, len);
+  CRC_SINGLE(__crc32cb, crc, uint8_t, buf, len);
+
+  return ~crc;
+}
diff --git a/build/cmake/aom_config_defaults.cmake b/build/cmake/aom_config_defaults.cmake
index 84159fb..b8e68f8 100644
--- a/build/cmake/aom_config_defaults.cmake
+++ b/build/cmake/aom_config_defaults.cmake
@@ -31,6 +31,7 @@
 
 # ARM feature flags.
 set_aom_detect_var(HAVE_NEON 0 "Enables NEON intrinsics optimizations.")
+set_aom_detect_var(HAVE_ARM_CRC32 0 "Enables Arm CRC32 optimizations.")
 
 # MIPS feature flags.
 set_aom_detect_var(HAVE_DSPR2 0 "Enables DSPR2 optimizations.")
diff --git a/build/cmake/cpu.cmake b/build/cmake/cpu.cmake
index ef2d755..c818ec6 100644
--- a/build/cmake/cpu.cmake
+++ b/build/cmake/cpu.cmake
@@ -20,6 +20,19 @@
     set(HAVE_NEON 0)
     set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-neon)
   endif()
+
+  check_c_source_compiles("
+    #if !defined(__ARM_FEATURE_CRC32) || __ARM_FEATURE_CRC32 != 1
+    #error \"CRC32 is unavailable.\"
+    #endif
+    int main(void) { return 0; }" HAVE_CRC32)
+  if(HAVE_CRC32)
+    set(HAVE_ARM_CRC32 1)
+  else()
+    set(HAVE_ARM_CRC32 0)
+    set(AOM_RTCD_FLAGS ${AOM_RTCD_FLAGS} --disable-arm_crc32)
+  endif()
+
 elseif("${AOM_TARGET_CPU}" MATCHES "^mips")
   set(ARCH_MIPS 1)
   set(RTCD_ARCH_MIPS "yes")
diff --git a/build/cmake/rtcd.pl b/build/cmake/rtcd.pl
index e9f75dd..7f961ca 100755
--- a/build/cmake/rtcd.pl
+++ b/build/cmake/rtcd.pl
@@ -432,8 +432,8 @@
   @ALL_ARCHS = filter(qw/neon/);
   arm;
 } elsif ($opts{arch} eq 'arm64' ) {
-  @ALL_ARCHS = filter(qw/neon/);
-  &require("neon");
+  @ALL_ARCHS = filter(qw/neon arm_crc32/);
+  &require(@ALL_ARCHS);
   arm;
 } elsif ($opts{arch} eq 'ppc') {
   @ALL_ARCHS = filter(qw/vsx/);
diff --git a/test/hash_test.cc b/test/hash_test.cc
index 5ce0fbb..61e0b51 100644
--- a/test/hash_test.cc
+++ b/test/hash_test.cc
@@ -131,4 +131,11 @@
                        ::testing::ValuesIn(kValidBlockSize)));
 #endif
 
+#if HAVE_ARM_CRC32
+INSTANTIATE_TEST_SUITE_P(
+    ARM_CRC32, AV1Crc32cHashTest,
+    ::testing::Combine(::testing::Values(&av1_get_crc32c_value_arm_crc32),
+                       ::testing::ValuesIn(kValidBlockSize)));
+#endif
+
 }  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index ac8681d..ea99a3a 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -322,7 +322,7 @@
 
   endif()
 
-  if(HAVE_SSE4_2)
+  if(HAVE_SSE4_2 OR HAVE_ARM_CRC32)
     list(APPEND AOM_UNIT_TEST_ENCODER_SOURCES "${AOM_ROOT}/test/hash_test.cc")
   endif()
 
@@ -498,6 +498,10 @@
     add_intrinsics_source_to_target("${AOM_NEON_INTRIN_FLAG}" "test_libaom"
                                     "AOM_UNIT_TEST_COMMON_INTRIN_NEON")
   endif()
+  if(HAVE_ARM_CRC32)
+    add_intrinsics_source_to_target("${AOM_ARM_CRC32_FLAG}" "test_libaom"
+                                    "AOM_UNIT_TEST_COMMON_INTRIN_CRC32")
+  endif()
 
   if(ENABLE_TESTDATA)
     make_test_data_lists("${AOM_UNIT_TEST_DATA_LIST_FILE}" test_files