Add 2-tap path for aom_convolve8_horiz_neon

Add a specialized Neon implementation for 2-tap filters and use it
instead of the 4-tap implementation in all 3 Neon versions of
aom_convolve8_horiz. This provides between 20% and 60% uplift depending
on the architecture extension.

Change-Id: I48da9553cd391dd801affdc1c62995d1f1a48f15
diff --git a/aom_dsp/arm/aom_convolve8_neon.c b/aom_dsp/arm/aom_convolve8_neon.c
index 6a177b2..5665b5e 100644
--- a/aom_dsp/arm/aom_convolve8_neon.c
+++ b/aom_dsp/arm/aom_convolve8_neon.c
@@ -20,6 +20,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/aom_convolve8_neon.h"
 #include "aom_dsp/arm/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
@@ -354,7 +355,12 @@
 
   src -= ((SUBPEL_TAPS / 2) - 1);
 
-  if (get_filter_taps_convolve8(filter_x) <= 4) {
+  int filter_taps = get_filter_taps_convolve8(filter_x);
+
+  if (filter_taps == 2) {
+    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
+                              h);
+  } else if (filter_taps == 4) {
     convolve8_horiz_4tap_neon(src + 2, src_stride, dst, dst_stride, filter_x, w,
                               h);
   } else {
diff --git a/aom_dsp/arm/aom_convolve8_neon.h b/aom_dsp/arm/aom_convolve8_neon.h
new file mode 100644
index 0000000..0aebc6d
--- /dev/null
+++ b/aom_dsp/arm/aom_convolve8_neon.h
@@ -0,0 +1,106 @@
+/*
+ *  Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
+#define AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
+
+#include <arm_neon.h>
+
+#include "config/aom_config.h"
+#include "aom_dsp/arm/mem_neon.h"
+
+static INLINE void convolve8_horiz_2tap_neon(const uint8_t *src,
+                                             ptrdiff_t src_stride, uint8_t *dst,
+                                             ptrdiff_t dst_stride,
+                                             const int16_t *filter_x, int w,
+                                             int h) {
+  // Bilinear filter values are all positive.
+  const uint8x8_t f0 = vdup_n_u8((uint8_t)filter_x[3]);
+  const uint8x8_t f1 = vdup_n_u8((uint8_t)filter_x[4]);
+
+  if (w == 4) {
+    do {
+      uint8x8_t s0 =
+          load_unaligned_u8(src + 0 * src_stride + 0, (int)src_stride);
+      uint8x8_t s1 =
+          load_unaligned_u8(src + 0 * src_stride + 1, (int)src_stride);
+      uint8x8_t s2 =
+          load_unaligned_u8(src + 2 * src_stride + 0, (int)src_stride);
+      uint8x8_t s3 =
+          load_unaligned_u8(src + 2 * src_stride + 1, (int)src_stride);
+
+      uint16x8_t sum0 = vmull_u8(s0, f0);
+      sum0 = vmlal_u8(sum0, s1, f1);
+      uint16x8_t sum1 = vmull_u8(s2, f0);
+      sum1 = vmlal_u8(sum1, s3, f1);
+
+      uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
+      uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
+
+      store_u8x4_strided_x2(dst + 0 * dst_stride, dst_stride, d0);
+      store_u8x4_strided_x2(dst + 2 * dst_stride, dst_stride, d1);
+
+      src += 4 * src_stride;
+      dst += 4 * dst_stride;
+      h -= 4;
+    } while (h > 0);
+  } else if (w == 8) {
+    do {
+      uint8x8_t s0 = vld1_u8(src + 0 * src_stride + 0);
+      uint8x8_t s1 = vld1_u8(src + 0 * src_stride + 1);
+      uint8x8_t s2 = vld1_u8(src + 1 * src_stride + 0);
+      uint8x8_t s3 = vld1_u8(src + 1 * src_stride + 1);
+
+      uint16x8_t sum0 = vmull_u8(s0, f0);
+      sum0 = vmlal_u8(sum0, s1, f1);
+      uint16x8_t sum1 = vmull_u8(s2, f0);
+      sum1 = vmlal_u8(sum1, s3, f1);
+
+      uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
+      uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
+
+      vst1_u8(dst + 0 * dst_stride, d0);
+      vst1_u8(dst + 1 * dst_stride, d1);
+
+      src += 2 * src_stride;
+      dst += 2 * dst_stride;
+      h -= 2;
+    } while (h > 0);
+  } else {
+    do {
+      int width = w;
+      const uint8_t *s = src;
+      uint8_t *d = dst;
+
+      do {
+        uint8x16_t s0 = vld1q_u8(s + 0);
+        uint8x16_t s1 = vld1q_u8(s + 1);
+
+        uint16x8_t sum0 = vmull_u8(vget_low_u8(s0), f0);
+        sum0 = vmlal_u8(sum0, vget_low_u8(s1), f1);
+        uint16x8_t sum1 = vmull_u8(vget_high_u8(s0), f0);
+        sum1 = vmlal_u8(sum1, vget_high_u8(s1), f1);
+
+        uint8x8_t d0 = vqrshrn_n_u16(sum0, FILTER_BITS);
+        uint8x8_t d1 = vqrshrn_n_u16(sum1, FILTER_BITS);
+
+        vst1q_u8(d, vcombine_u8(d0, d1));
+
+        s += 16;
+        d += 16;
+        width -= 16;
+      } while (width != 0);
+      src += src_stride;
+      dst += dst_stride;
+    } while (--h > 0);
+  }
+}
+
+#endif  // AOM_AOM_DSP_ARM_AOM_CONVOLVE8_NEON_H_
diff --git a/aom_dsp/arm/aom_convolve8_neon_dotprod.c b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
index 576db8e..f49d33f 100644
--- a/aom_dsp/arm/aom_convolve8_neon_dotprod.c
+++ b/aom_dsp/arm/aom_convolve8_neon_dotprod.c
@@ -20,6 +20,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/aom_convolve8_neon.h"
 #include "aom_dsp/arm/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
@@ -269,7 +270,12 @@
 
   src -= ((SUBPEL_TAPS / 2) - 1);
 
-  if (get_filter_taps_convolve8(filter_x) <= 4) {
+  int filter_taps = get_filter_taps_convolve8(filter_x);
+
+  if (filter_taps == 2) {
+    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
+                              h);
+  } else if (filter_taps == 4) {
     convolve8_horiz_4tap_neon_dotprod(src + 2, src_stride, dst, dst_stride,
                                       filter_x, w, h);
   } else {
diff --git a/aom_dsp/arm/aom_convolve8_neon_i8mm.c b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
index da0210a..9727639 100644
--- a/aom_dsp/arm/aom_convolve8_neon_i8mm.c
+++ b/aom_dsp/arm/aom_convolve8_neon_i8mm.c
@@ -19,6 +19,7 @@
 #include "aom/aom_integer.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_dsp/aom_filter.h"
+#include "aom_dsp/arm/aom_convolve8_neon.h"
 #include "aom_dsp/arm/aom_filter.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
@@ -243,7 +244,12 @@
 
   src -= ((SUBPEL_TAPS / 2) - 1);
 
-  if (get_filter_taps_convolve8(filter_x) <= 4) {
+  int filter_taps = get_filter_taps_convolve8(filter_x);
+
+  if (filter_taps == 2) {
+    convolve8_horiz_2tap_neon(src + 3, src_stride, dst, dst_stride, filter_x, w,
+                              h);
+  } else if (filter_taps == 4) {
     convolve8_horiz_4tap_neon_i8mm(src + 2, src_stride, dst, dst_stride,
                                    filter_x, w, h);
   } else {