highway avx2: sad 64 and above

Microbenchmark (ns):
Speedup is measured relative to the baseline avx2 implementation.
Positive values mean hwy avx2 is faster; negative values mean it is slower.

| Block size | baseline avx2 | hwy avx2 | hwy avx2 speedup |
|------------|---------------|----------|------------------|
| 128x128    | 548           | 450      | 17.88%           |
| 128x64     | 143           | 152      | -6.29%           |
| 64x128     | 141           | 135      | 4.26%            |
| 64x64      | 71.5          | 65.1     | 8.95%            |
| 64x32      | 40.5          | 31.1     | 23.21%           |

Change-Id: I3169d8c04287d32913bb4cd56d5077ac88f8f00d
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 33d9713..9ceb109 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -181,6 +181,11 @@
               "${AOM_ROOT}/aom_dsp/variance.c"
               "${AOM_ROOT}/aom_dsp/variance.h")
 
+  if(CONFIG_HIGHWAY)
+    list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/reduce_sum_hwy.h"
+                "${AOM_ROOT}/aom_dsp/sad_hwy.h")
+  endif()
+
   # Flow estimation library and grain/noise table/model.
   if(NOT CONFIG_REALTIME_ONLY)
     list(APPEND AOM_DSP_ENCODER_SOURCES
@@ -259,6 +264,11 @@
               "${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
               "${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
 
+  if(CONFIG_HIGHWAY)
+    list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+                "${AOM_ROOT}/aom_dsp/x86/sad_hwy_avx2.cc")
+  endif()
+
   list(APPEND AOM_DSP_ENCODER_INTRIN_AVX
               "${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c")
 
diff --git a/aom_dsp/reduce_sum_hwy.h b/aom_dsp/reduce_sum_hwy.h
new file mode 100644
index 0000000..9f4c005
--- /dev/null
+++ b/aom_dsp/reduce_sum_hwy.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_REDUCE_SUM_HWY_H_
+#define AOM_AOM_DSP_REDUCE_SUM_HWY_H_
+
+#include <type_traits>
+#include "third_party/highway/hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+
+namespace {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+template <size_t NumBlocks>
+struct BlockReduceTraits;
+
+template <>
+struct BlockReduceTraits<1> {
+  template <typename D>
+  HWY_ATTR HWY_INLINE static hn::VFromD<D> ReduceSum(D d, hn::VFromD<D> v) {
+    (void)d;
+    return v;
+  }
+};
+
+template <size_t NumBlocks>
+struct BlockReduceTraits {
+  static_assert(NumBlocks > 1,
+                "Primary template BlockReduceTraits assumes NumBlocks > 1");
+  static_assert((NumBlocks & (NumBlocks - 1)) == 0,
+                "BlockReduceTraits requires NumBlocks to be a power of 2.");
+
+  template <typename D>
+  HWY_ATTR HWY_INLINE static hn::VFromD<hn::BlockDFromD<D>> ReduceSum(
+      D d, hn::VFromD<D> v) {
+    (void)d;
+    constexpr hn::Half<D> half_d;
+    auto v_half = hn::Add(hn::LowerHalf(half_d, v), hn::UpperHalf(half_d, v));
+    return BlockReduceTraits<NumBlocks / 2>::ReduceSum(half_d, v_half);
+  }
+};
+
+// Reduces a multi-block vector to a single block by summing across blocks.
+// For example, given a 4-block vector with 16 lanes of uint32_t:
+// [a3 b3 c3 d3 a2 b2 c2 d2 a1 b1 c1 d1 a0 b0 c0 d0]
+// this returns a one-block vector with 4 lanes:
+// [a3+a2+a1+a0 b3+b2+b1+b0 c3+c2+c1+c0 d3+d2+d1+d0]
+template <typename D>
+HWY_ATTR HWY_INLINE hn::Vec<hn::BlockDFromD<D>> BlockReduceSum(
+    D int_tag, hn::VFromD<D> v) {
+  return BlockReduceTraits<int_tag.MaxBlocks()>::ReduceSum(int_tag, v);
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace
+
+HWY_AFTER_NAMESPACE();
+
+#endif  // AOM_AOM_DSP_REDUCE_SUM_HWY_H_
diff --git a/aom_dsp/sad_hwy.h b/aom_dsp/sad_hwy.h
new file mode 100644
index 0000000..b142425
--- /dev/null
+++ b/aom_dsp/sad_hwy.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_SAD_HWY_H_
+#define AOM_AOM_DSP_SAD_HWY_H_
+
+#include "aom_dsp/reduce_sum_hwy.h"
+#include "third_party/highway/hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+
+namespace {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+template <int BlockWidth>
+HWY_MAYBE_UNUSED unsigned int SumOfAbsoluteDiff(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred = nullptr) {
+  constexpr hn::CappedTag<uint8_t, BlockWidth> pixel_tag;
+  constexpr hn::Repartition<uint64_t, decltype(pixel_tag)> intermediate_sum_tag;
+  const int vw = hn::Lanes(pixel_tag);
+  auto sum_sad = hn::Zero(intermediate_sum_tag);
+  const bool is_sad_avg = second_pred != nullptr;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < BlockWidth; j += vw) {
+      auto src_vec = hn::LoadU(pixel_tag, &src_ptr[j]);
+      auto ref_vec = hn::LoadU(pixel_tag, &ref_ptr[j]);
+      if (is_sad_avg) {
+        auto sec_pred_vec = hn::LoadU(pixel_tag, &second_pred[j]);
+        ref_vec = hn::AverageRound(ref_vec, sec_pred_vec);
+      }
+      auto sad = hn::SumsOf8AbsDiff(src_vec, ref_vec);
+      sum_sad = hn::Add(sum_sad, sad);
+    }
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    if (is_sad_avg) {
+      second_pred += BlockWidth;
+    }
+  }
+  return static_cast<unsigned int>(
+      hn::ReduceSum(intermediate_sum_tag, sum_sad));
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace
+
+#define FSAD(w, h, suffix)                                                   \
+  extern "C" unsigned int aom_sad##w##x##h##_##suffix(                       \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride);                                                       \
+  HWY_ATTR unsigned int aom_sad##w##x##h##_##suffix(                         \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return HWY_NAMESPACE::SumOfAbsoluteDiff<w>(src_ptr, src_stride, ref_ptr, \
+                                               ref_stride, h);               \
+  }
+
+#define FOR_EACH_SAD_BLOCK_SIZE(X, suffix) \
+  X(128, 128, suffix)                      \
+  X(128, 64, suffix)                       \
+  X(64, 128, suffix)                       \
+  X(64, 64, suffix)                        \
+  X(64, 32, suffix)
+
+HWY_AFTER_NAMESPACE();
+
+#endif  // AOM_AOM_DSP_SAD_HWY_H_
diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c
index f19ff05..ee02380 100644
--- a/aom_dsp/x86/sad_avx2.c
+++ b/aom_dsp/x86/sad_avx2.c
@@ -101,11 +101,17 @@
                             h / 2);                                           \
   }
 
+#if CONFIG_HIGHWAY
+#define FSAD64  \
+  FSADS64_H(64) \
+  FSADS64_H(32)
+#else
 #define FSAD64  \
   FSAD64_H(64)  \
   FSAD64_H(32)  \
   FSADS64_H(64) \
   FSADS64_H(32)
+#endif
 
 #define FSAD32  \
   FSAD32_H(64)  \
diff --git a/aom_dsp/x86/sad_hwy_avx2.cc b/aom_dsp/x86/sad_hwy_avx2.cc
new file mode 100644
index 0000000..2df2646
--- /dev/null
+++ b/aom_dsp/x86/sad_hwy_avx2.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX2
+#define HWY_BROKEN_32BIT 0
+
+#include "aom_dsp/sad_hwy.h"
+
+FOR_EACH_SAD_BLOCK_SIZE(FSAD, avx2)
diff --git a/aom_dsp/x86/sad_impl_avx2.c b/aom_dsp/x86/sad_impl_avx2.c
index 0d1b5ab..2c6fa24 100644
--- a/aom_dsp/x86/sad_impl_avx2.c
+++ b/aom_dsp/x86/sad_impl_avx2.c
@@ -56,6 +56,7 @@
   return sum;
 }
 
+#if !CONFIG_HIGHWAY
 unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride) {
   unsigned int half_width = 64;
@@ -83,6 +84,7 @@
   sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
   return sum;
 }
+#endif
 
 unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride) {