Add ARM Neon optimization of compound_diffwtd_mask_d16

Speed-up over the C implementation (c/neon) per block size:

Block size  c/neon
8x4         5.04x
8x8         5.99x
16x8        6.37x
16x16       6.32x
32x16       3.14x
32x32       3.22x
64x32       3.01x
64x64       3.03x
128x64      2.57x
128x128     2.80x

Change-Id: I9d70e451c3d9ca365bb8879d10c62e726f519f49
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 4b8acd5..4b48418 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -292,6 +292,7 @@
             "${AOM_ROOT}/av1/common/arm/transpose_neon.h"
             "${AOM_ROOT}/av1/common/arm/blend_a64_hmask_neon.c"
             "${AOM_ROOT}/av1/common/arm/blend_a64_vmask_neon.c"
+            "${AOM_ROOT}/av1/common/arm/reconinter_neon.c"
             "${AOM_ROOT}/av1/common/arm/wiener_convolve_neon.c"
             "${AOM_ROOT}/av1/common/cdef_block_neon.c")
 
diff --git a/av1/common/arm/reconinter_neon.c b/av1/common/arm/reconinter_neon.c
new file mode 100644
index 0000000..44e0641
--- /dev/null
+++ b/av1/common/arm/reconinter_neon.c
@@ -0,0 +1,86 @@
+/*
+ *
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "aom/aom_integer.h"
+#include "aom_dsp/blend.h"
+#include "aom_ports/mem.h"
+#include "av1/common/arm/mem_neon.h"
+#include "av1/common/blockd.h"
+#include "config/av1_rtcd.h"
+
+void av1_build_compound_diffwtd_mask_d16_neon(  // Neon: builds the diff-weighted compound mask from |src0 - src1|.
+    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,  // mask: output, written back to back (no stride).
+    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,  // Strides count CONV_BUF_TYPE elements.
+    ConvolveParams *conv_params, int bd) {  // round_0, round_1 and bd only feed the shift amount below.
+  assert(h >= 4);  // Preconditions are checked only when asserts are enabled.
+  assert(w >= 4);
+  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));  // Only these two mask types are handled.
+  const int round =  // Right-shift amount applied to every absolute difference.
+      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
+  uint16x8_t diff_q, tmp0, tmp1;
+  uint8x8_t diff_d, diff_select;
+  const CONV_BUF_TYPE *src0_1, *src1_1;  // Read cursors; NOTE(review): loaded with vld1*_u16, so assumes 16-bit elements - confirm.
+  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));  // Negated: vrshlq_u16 with a negative count shifts right with rounding.
+  const uint8x8_t dup_38 = vdup_n_u8(38);  // Base weight added to every scaled difference.
+  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);  // Upper clamp; also the value the mask is subtracted from when inverted.
+  if (mask_type == DIFFWTD_38) {
+    diff_select = vdup_n_u8(255);  // All ones: vbsl_u8 keeps the direct mask.
+  } else {
+    diff_select = vdup_n_u8(0);  // All zeros: vbsl_u8 keeps (max alpha - mask).
+  }
+  if (w >= 8) {  // One row per pass, 8 lanes per step; NOTE(review): assumes w % 8 == 0 - confirm.
+    for (int i = 0; i < h; ++i) {
+      src0_1 = src0;
+      src1_1 = src1;
+      for (int j = 0; j < w; j += 8) {
+        __builtin_prefetch(src0_1);  // Cache hint only; no effect on the result.
+        __builtin_prefetch(src1_1);
+        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));  // |src0 - src1| per 16-bit lane.
+        diff_q = vrshlq_u16(diff_q, dup_round);  // Rounding right shift by `round`.
+        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);  // Scale down and narrow to 8 bits (truncating, not saturating).
+        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);  // min(38 + scaled diff, max alpha); NOTE(review): add can wrap if the scaled diff exceeds 217 - confirm input range.
+        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));  // Direct or inverted mask, per diff_select.
+        vst1_u8(mask, diff_d);  // Store 8 mask bytes.
+        src0_1 += 8;
+        src1_1 += 8;
+        mask += 8;  // Mask rows follow each other without padding.
+      }
+      src0 += src0_stride;
+      src1 += src1_stride;
+    }
+  } else if (w == 4) {  // Two 4-wide rows share one 8-lane vector; w in 5..7 writes nothing.
+    for (int i = 0; i < h; i += 2) {  // NOTE(review): touches rows i and i + 1, so assumes h is even - confirm.
+      src0_1 = src0;
+      src1_1 = src1;
+      __builtin_prefetch(src0_1 + 0 * src0_stride);  // Cache hints for both rows of each source.
+      __builtin_prefetch(src0_1 + 1 * src0_stride);
+      __builtin_prefetch(src1_1 + 0 * src1_stride);
+      __builtin_prefetch(src1_1 + 1 * src1_stride);
+      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),  // Low half: row i of src0.
+                          vld1_u16(src0_1 + (1 * src0_stride)));  // High half: row i + 1 of src0.
+      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),  // Same layout for src1.
+                          vld1_u16(src1_1 + (1 * src1_stride)));
+      diff_q = vabdq_u16(tmp0, tmp1);  // Same per-lane pipeline as the w >= 8 path.
+      diff_q = vrshlq_u16(diff_q, dup_round);
+      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
+      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
+      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
+      vst1_u8(mask, diff_d);  // 8 bytes = two consecutive 4-byte mask rows.
+      src0 += src0_stride * 2;
+      src1 += src1_stride * 2;
+      mask += w * 2;  // Advance past the two rows just written.
+    }
+  }
+}
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 373e292..6aa9255 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -152,7 +152,7 @@
 specialize qw/av1_build_compound_diffwtd_mask_highbd ssse3 avx2/;
 
 add_proto qw/void av1_build_compound_diffwtd_mask_d16/, "uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0, int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w, ConvolveParams *conv_params, int bd";
-specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1/;
+specialize qw/av1_build_compound_diffwtd_mask_d16 sse4_1 neon/;
 
 #
 # Encoder functions below this point.
diff --git a/test/reconinter_test.cc b/test/reconinter_test.cc
index 2f4dc3a..4f74c81 100644
--- a/test/reconinter_test.cc
+++ b/test/reconinter_test.cc
@@ -48,13 +48,14 @@
 typedef ::testing::tuple<int, buildcompdiffwtdmaskd16_func, BLOCK_SIZE>
     BuildCompDiffwtdMaskD16Param;
 
+#if HAVE_SSE4_1 || HAVE_NEON
 ::testing::internal::ParamGenerator<BuildCompDiffwtdMaskD16Param> BuildParams(
     buildcompdiffwtdmaskd16_func filter) {
   return ::testing::Combine(::testing::Range(8, 13, 2),
                             ::testing::Values(filter),
                             ::testing::Range(BLOCK_4X4, BLOCK_SIZES_ALL));
 }
-
+#endif
 class BuildCompDiffwtdMaskD16Test
     : public ::testing::TestWithParam<BuildCompDiffwtdMaskD16Param> {
  public:
@@ -157,7 +158,7 @@
   printf("av1_build_compound_diffwtd_mask_d16 test_code %3dx%-3d: %7.2f us\n",
          width, height, 1000.0 * elapsed_time1 / num_loops);
 }
-
+#if HAVE_SSE4_1
 void BuildCompDiffwtdMaskTest::RunTest(const int sb_type, const int is_speed,
                                        const DIFFWTD_MASK_TYPE type) {
   const int width = block_size_wide[sb_type];
@@ -206,7 +207,7 @@
   RunTest(GetParam(), 1, DIFFWTD_38);
   RunTest(GetParam(), 1, DIFFWTD_38_INV);
 }
-
+#endif
 TEST_P(BuildCompDiffwtdMaskD16Test, CheckOutput) {
   RunCheckOutput(GET_PARAM(1));
 }
@@ -225,4 +226,9 @@
     BuildParams(av1_build_compound_diffwtd_mask_d16_sse4_1));
 #endif
 
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, BuildCompDiffwtdMaskD16Test,
+                        BuildParams(av1_build_compound_diffwtd_mask_d16_neon));
+#endif
+
 }  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 52fe380..8594d05 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -186,6 +186,7 @@
               "${AOM_ROOT}/test/obmc_variance_test.cc"
               "${AOM_ROOT}/test/sad_test.cc"
               "${AOM_ROOT}/test/subtract_test.cc"
+              "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/sum_squares_test.cc"
               "${AOM_ROOT}/test/variance_test.cc")
 
@@ -194,7 +195,6 @@
               "${AOM_ROOT}/test/av1_quantize_test.cc"
               "${AOM_ROOT}/test/corner_match_test.cc"
               "${AOM_ROOT}/test/quantize_func_test.cc"
-              "${AOM_ROOT}/test/reconinter_test.cc"
               "${AOM_ROOT}/test/simd_cmp_sse4.cc")
 
   if(HAVE_SSE4_1)