Add NEON version of av1_fwht4x4 function

AVG gain = 1.4 via  NEON/Trans4x4WHT.DISABLED_Speed

Change-Id: I778bc07eb9d5a74a697d77e56c8e95092ad50012
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 4bef55e..f280ccf 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -383,6 +383,7 @@
             "${AOM_ROOT}/av1/encoder/arm/neon/rdopt_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/av1_error_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/encodetxb_neon.c"
+            "${AOM_ROOT}/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/av1_fwd_txfm2d_neon.c"
             "${AOM_ROOT}/av1/encoder/arm/neon/highbd_fwd_txfm_neon.c")
 
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 2a94ef3..901203e 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -298,6 +298,7 @@
   # fdct functions
 
   add_proto qw/void av1_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_fwht4x4 neon/;
 
   #fwd txfm
   add_proto qw/void av1_lowbd_fwd_txfm/, "const int16_t *src_diff, tran_low_t *coeff, int diff_stride, TxfmParam *txfm_param";
@@ -364,6 +365,7 @@
   }
 
   add_proto qw/void av1_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/av1_highbd_fwht4x4 neon/;
 
   # End av1_high encoder functions
 
diff --git a/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c b/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
new file mode 100644
index 0000000..0ad1131
--- /dev/null
+++ b/av1/encoder/arm/neon/hybrid_fwd_txfm_neon.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2020, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+
+#include "aom_dsp/txfm_common.h"
+
+static void transpose4x4(int16x8_t in[2], int16x4_t out[4]) {
+  int32x4x2_t b0 =
+      vtrnq_s32(vreinterpretq_s32_s16(in[0]), vreinterpretq_s32_s16(in[1]));
+  int16x4x2_t c0 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[0])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[0])));
+  int16x4x2_t c1 = vtrn_s16(vreinterpret_s16_s32(vget_low_s32(b0.val[1])),
+                            vreinterpret_s16_s32(vget_high_s32(b0.val[1])));
+  out[0] = c0.val[0];
+  out[1] = c0.val[1];
+  out[2] = c1.val[0];
+  out[3] = c1.val[1];
+}
+
+void av1_fwht4x4_neon(const int16_t *input, tran_low_t *output, int stride) {
+  // Load the 4x4 source in transposed form.
+  int16x4_t a1, b1, c1, d1, e;
+  a1 = vld1_s16(&input[0]);
+  b1 = vld1_s16(&input[1 * stride]);
+  c1 = vld1_s16(&input[2 * stride]);
+  d1 = vld1_s16(&input[3 * stride]);
+
+  // WHT.
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  int16x8_t x[2];
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  int16x4_t s[4];
+  transpose4x4(x, s);
+
+  a1 = s[0];
+  b1 = s[1];
+  c1 = s[2];
+  d1 = s[3];
+
+  // Row transforms.
+  a1 = vadd_s16(a1, b1);
+  d1 = vsub_s16(d1, c1);
+  e = vhsub_s16(a1, d1);
+  b1 = vsub_s16(e, b1);
+  c1 = vsub_s16(e, c1);
+  a1 = vsub_s16(a1, c1);
+  d1 = vadd_s16(d1, b1);
+
+  x[0] = vcombine_s16(a1, c1);
+  x[1] = vcombine_s16(d1, b1);
+
+  transpose4x4(x, s);
+
+  vst1q_s32(&output[0], vshll_n_s16(s[0], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[4], vshll_n_s16(s[1], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[8], vshll_n_s16(s[2], UNIT_QUANT_SHIFT));
+  vst1q_s32(&output[12], vshll_n_s16(s[3], UNIT_QUANT_SHIFT));
+}
+
+void av1_highbd_fwht4x4_neon(const int16_t *input, tran_low_t *output,
+                             int stride) {
+  av1_fwht4x4_neon(input, output, stride);
+}
diff --git a/test/fwht4x4_test.cc b/test/fwht4x4_test.cc
index d2f77b8..b600d26 100644
--- a/test/fwht4x4_test.cc
+++ b/test/fwht4x4_test.cc
@@ -37,7 +37,7 @@
 
 using libaom_test::FhtFunc;
 
-typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int>
+typedef std::tuple<FdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t, int, FdctFunc>
     Dct4x4Param;
 
 void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
@@ -67,6 +67,7 @@
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
     num_coeffs_ = GET_PARAM(4);
+    fwd_txfm_c_ = GET_PARAM(5);
   }
   virtual void TearDown() { libaom_test::ClearSystemState(); }
 
@@ -77,9 +78,89 @@
   void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
     inv_txfm_(out, dst, stride);
   }
+  void RunSpeedTest() {
+    if (!fwd_txfm_c_) {
+      GTEST_SKIP();
+    } else {
+      ACMRandom rnd(ACMRandom::DeterministicSeed());
+      const int count_test_block = 10;
+      const int numIter = 5000;
+
+      int c_sum_time = 0;
+      int simd_sum_time = 0;
+
+      int stride = 96;
+
+      int16_t *input_block = reinterpret_cast<int16_t *>(
+          aom_memalign(16, sizeof(int16_t) * stride * height_));
+      tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>(
+          aom_memalign(16, sizeof(output_ref_block[0]) * num_coeffs_));
+      tran_low_t *output_block = reinterpret_cast<tran_low_t *>(
+          aom_memalign(16, sizeof(output_block[0]) * num_coeffs_));
+
+      for (int i = 0; i < count_test_block; ++i) {
+        int j, k;
+        for (j = 0; j < height_; ++j) {
+          for (k = 0; k < pitch_; ++k) {
+            int in_idx = j * stride + k;
+            int out_idx = j * pitch_ + k;
+            input_block[in_idx] =
+                (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+            if (bit_depth_ == AOM_BITS_8) {
+              output_block[out_idx] = output_ref_block[out_idx] = rnd.Rand8();
+            } else {
+              output_block[out_idx] = output_ref_block[out_idx] =
+                  rnd.Rand16() & mask_;
+            }
+          }
+        }
+
+        aom_usec_timer c_timer_;
+        aom_usec_timer_start(&c_timer_);
+        for (int i = 0; i < numIter; i++) {
+          ASM_REGISTER_STATE_CHECK(
+              fwd_txfm_c_(input_block, output_ref_block, stride));
+        }
+        aom_usec_timer_mark(&c_timer_);
+
+        aom_usec_timer simd_timer_;
+        aom_usec_timer_start(&simd_timer_);
+
+        for (int i = 0; i < numIter; i++) {
+          ASM_REGISTER_STATE_CHECK(
+              fwd_txfm_(input_block, output_block, stride));
+        }
+        aom_usec_timer_mark(&simd_timer_);
+
+        c_sum_time += static_cast<int>(aom_usec_timer_elapsed(&c_timer_));
+        simd_sum_time += static_cast<int>(aom_usec_timer_elapsed(&simd_timer_));
+
+        // The minimum quant value is 4.
+        for (j = 0; j < height_; ++j) {
+          for (k = 0; k < pitch_; ++k) {
+            int out_idx = j * pitch_ + k;
+            ASSERT_EQ(output_block[out_idx], output_ref_block[out_idx])
+                << "Error: not bit-exact result at index: " << out_idx
+                << " at test block: " << i;
+          }
+        }
+      }
+
+      printf(
+          "c_time = %d \t simd_time = %d \t Gain = %4.2f \n", c_sum_time,
+          simd_sum_time,
+          (static_cast<float>(c_sum_time) / static_cast<float>(simd_sum_time)));
+
+      aom_free(input_block);
+      aom_free(output_ref_block);
+      aom_free(output_block);
+    }
+  }
 
   FdctFunc fwd_txfm_;
   IdctFunc inv_txfm_;
+
+  FdctFunc fwd_txfm_c_;  // C version of forward transform for speed test.
 };
 
 TEST_P(Trans4x4WHT, AccuracyCheck) { RunAccuracyCheck(0, 0.00001); }
@@ -89,12 +170,27 @@
 TEST_P(Trans4x4WHT, MemCheck) { RunMemCheck(); }
 
 TEST_P(Trans4x4WHT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
+
+TEST_P(Trans4x4WHT, DISABLED_Speed) { RunSpeedTest(); }
+
 using std::make_tuple;
 
 INSTANTIATE_TEST_SUITE_P(
     C, Trans4x4WHT,
     ::testing::Values(make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_10, DCT_DCT,
-                                 AOM_BITS_10, 16),
+                                 AOM_BITS_10, 16, static_cast<FdctFunc>(NULL)),
                       make_tuple(&av1_highbd_fwht4x4_c, &iwht4x4_12, DCT_DCT,
-                                 AOM_BITS_12, 16)));
+                                 AOM_BITS_12, 16,
+                                 static_cast<FdctFunc>(NULL))));
+#if HAVE_NEON
+
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Trans4x4WHT,
+    ::testing::Values(make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_10, DCT_DCT,
+                                 AOM_BITS_10, 16, &av1_highbd_fwht4x4_c),
+                      make_tuple(&av1_highbd_fwht4x4_neon, &iwht4x4_12, DCT_DCT,
+                                 AOM_BITS_12, 16, &av1_highbd_fwht4x4_c)));
+
+#endif  // HAVE_NEON
+
 }  // namespace