highway avx2: sad 64 and above
Microbenchmark (ns):
Speedup is measured relative to the baseline AVX2 implementation.
Positive means faster, negative means slower.
| Block size | baseline avx2 | hwy avx2 | hwy avx2 speedup |
|------------|---------------|----------|------------------|
| 128x128 | 548 | 450 | 17.88% |
| 128x64 | 143 | 152 | -6.29% |
| 64x128 | 141 | 135 | 4.26% |
| 64x64 | 71.5 | 65.1 | 8.95% |
| 64x32 | 40.5 | 31.1 | 23.21% |
Change-Id: I3169d8c04287d32913bb4cd56d5077ac88f8f00d
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index 33d9713..9ceb109 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -181,6 +181,11 @@
"${AOM_ROOT}/aom_dsp/variance.c"
"${AOM_ROOT}/aom_dsp/variance.h")
+ if(CONFIG_HIGHWAY)  # Highway helper headers are listed only with Highway on.
+ list(APPEND AOM_DSP_ENCODER_SOURCES "${AOM_ROOT}/aom_dsp/reduce_sum_hwy.h"
+ "${AOM_ROOT}/aom_dsp/sad_hwy.h")
+ endif()
+
# Flow estimation library and grain/noise table/model.
if(NOT CONFIG_REALTIME_ONLY)
list(APPEND AOM_DSP_ENCODER_SOURCES
@@ -259,6 +264,11 @@
"${AOM_ROOT}/aom_dsp/x86/blk_sse_sum_avx2.c"
"${AOM_ROOT}/aom_dsp/x86/sum_squares_avx2.c")
+ if(CONFIG_HIGHWAY)  # Highway AVX2 SAD kernels are built only with Highway on.
+ list(APPEND AOM_DSP_ENCODER_INTRIN_AVX2
+ "${AOM_ROOT}/aom_dsp/x86/sad_hwy_avx2.cc")
+ endif()
+
list(APPEND AOM_DSP_ENCODER_INTRIN_AVX
"${AOM_ROOT}/aom_dsp/x86/aom_quantize_avx.c")
diff --git a/aom_dsp/reduce_sum_hwy.h b/aom_dsp/reduce_sum_hwy.h
new file mode 100644
index 0000000..9f4c005
--- /dev/null
+++ b/aom_dsp/reduce_sum_hwy.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_REDUCE_SUM_HWY_H_
+#define AOM_AOM_DSP_REDUCE_SUM_HWY_H_
+
+#include <cstddef>
+#include <type_traits>
+#include "third_party/highway/hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+
+namespace {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Folds a vector made of NumBlocks blocks down to a single block by repeatedly
+// adding its upper half to its lower half. NumBlocks must be a power of two.
+template <size_t NumBlocks>
+struct BlockReduceTraits;
+
+// Recursion terminator: a one-block vector is returned unchanged.
+template <>
+struct BlockReduceTraits<1> {
+  template <typename D>
+  HWY_ATTR HWY_INLINE static hn::VFromD<D> ReduceSum(D d, hn::VFromD<D> v) {
+    (void)d;
+    return v;
+  }
+};
+
+// General case: halve the vector, then recurse with NumBlocks / 2.
+template <size_t NumBlocks>
+struct BlockReduceTraits {
+  static_assert(NumBlocks > 1,
+                "Primary template BlockReduceTraits assumes NumBlocks > 1");
+  static_assert((NumBlocks & (NumBlocks - 1)) == 0,
+                "BlockReduceTraits requires NumBlocks to be a power of 2.");
+
+  template <typename D>
+  HWY_ATTR HWY_INLINE static hn::VFromD<hn::BlockDFromD<D>> ReduceSum(
+      D d, hn::VFromD<D> v) {
+    (void)d;
+    constexpr hn::Half<D> half_d;
+    // Lane-wise sum of the two halves; the result has half as many blocks.
+    auto v_half = hn::Add(hn::LowerHalf(half_d, v), hn::UpperHalf(half_d, v));
+    return BlockReduceTraits<NumBlocks / 2>::ReduceSum(half_d, v_half);
+  }
+};
+
+// ReduceSum across blocks.
+// For example, with a 4-block vector with 16 lanes of uint32_t:
+// [a3 b3 c3 d3 a2 b2 c2 d2 a1 b1 c1 d1 a0 b0 c0 d0]
+// returns a vector with 4 lanes:
+// [a3+a2+a1+a0 b3+b2+b1+b0 c3+c2+c1+c0 d3+d2+d1+d0]
+//
+// The number of blocks to fold is taken from int_tag.MaxBlocks().
+template <typename D>
+HWY_ATTR HWY_INLINE hn::VFromD<hn::BlockDFromD<D>> BlockReduceSum(
+    D int_tag, hn::VFromD<D> v) {
+  return BlockReduceTraits<int_tag.MaxBlocks()>::ReduceSum(int_tag, v);
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace
+
+HWY_AFTER_NAMESPACE();
+
+#endif  // AOM_AOM_DSP_REDUCE_SUM_HWY_H_
diff --git a/aom_dsp/sad_hwy.h b/aom_dsp/sad_hwy.h
new file mode 100644
index 0000000..b142425
--- /dev/null
+++ b/aom_dsp/sad_hwy.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+#ifndef AOM_AOM_DSP_SAD_HWY_H_
+#define AOM_AOM_DSP_SAD_HWY_H_
+
+#include <cstdint>
+#include "aom_dsp/reduce_sum_hwy.h"
+#include "third_party/highway/hwy/highway.h"
+
+HWY_BEFORE_NAMESPACE();
+
+namespace {
+namespace HWY_NAMESPACE {
+
+namespace hn = hwy::HWY_NAMESPACE;
+
+// Returns the sum of absolute differences (SAD) between the BlockWidth x h
+// block at src_ptr and the block at ref_ptr, whose rows are src_stride and
+// ref_stride bytes apart respectively.
+//
+// When second_pred is non-null, every reference row is replaced by
+// AverageRound(reference row, second_pred row) before the comparison;
+// second_pred is read as consecutive rows of BlockWidth bytes.
+template <int BlockWidth>
+HWY_MAYBE_UNUSED unsigned int SumOfAbsoluteDiff(
+    const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,
+    int ref_stride, int h, const uint8_t *second_pred = nullptr) {
+  // Rows are consumed in whole vectors, so BlockWidth must be a multiple of
+  // the vector length. Assuming power-of-two vector lengths, a power-of-two
+  // width of at least 8 bytes (one SumsOf8AbsDiff group) guarantees that.
+  static_assert(BlockWidth >= 8 && (BlockWidth & (BlockWidth - 1)) == 0,
+                "BlockWidth must be a power of 2 and at least 8.");
+  constexpr hn::CappedTag<uint8_t, BlockWidth> pixel_tag;
+  constexpr hn::Repartition<uint64_t, decltype(pixel_tag)> intermediate_sum_tag;
+  // Lanes() returns size_t; cast explicitly instead of narrowing implicitly.
+  const int vw = static_cast<int>(hn::Lanes(pixel_tag));
+  auto sum_sad = hn::Zero(intermediate_sum_tag);
+  const bool is_sad_avg = second_pred != nullptr;
+  for (int i = 0; i < h; ++i) {
+    for (int j = 0; j < BlockWidth; j += vw) {
+      auto src_vec = hn::LoadU(pixel_tag, &src_ptr[j]);
+      auto ref_vec = hn::LoadU(pixel_tag, &ref_ptr[j]);
+      if (is_sad_avg) {
+        auto sec_pred_vec = hn::LoadU(pixel_tag, &second_pred[j]);
+        ref_vec = hn::AverageRound(ref_vec, sec_pred_vec);
+      }
+      // Each 64-bit lane receives the SAD of one group of 8 pixels.
+      auto sad = hn::SumsOf8AbsDiff(src_vec, ref_vec);
+      sum_sad = hn::Add(sum_sad, sad);
+    }
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    if (is_sad_avg) {
+      second_pred += BlockWidth;
+    }
+  }
+  return static_cast<unsigned int>(
+      hn::ReduceSum(intermediate_sum_tag, sum_sad));
+}
+
+}  // namespace HWY_NAMESPACE
+}  // namespace
+
+// Declares aom_sad<w>x<h>_<suffix>() with C linkage and defines it as a thin
+// wrapper around HWY_NAMESPACE::SumOfAbsoluteDiff<w>.
+#define FSAD(w, h, suffix)                                                   \
+  extern "C" unsigned int aom_sad##w##x##h##_##suffix(                       \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride);                                                       \
+  HWY_ATTR unsigned int aom_sad##w##x##h##_##suffix(                         \
+      const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
+      int ref_stride) {                                                      \
+    return HWY_NAMESPACE::SumOfAbsoluteDiff<w>(src_ptr, src_stride, ref_ptr, \
+                                               ref_stride, h);               \
+  }
+
+// Expands X(width, height, suffix) once for each block size listed below.
+#define FOR_EACH_SAD_BLOCK_SIZE(X, suffix) \
+  X(128, 128, suffix)                      \
+  X(128, 64, suffix)                       \
+  X(64, 128, suffix)                       \
+  X(64, 64, suffix)                        \
+  X(64, 32, suffix)
+
+HWY_AFTER_NAMESPACE();
+
+#endif  // AOM_AOM_DSP_SAD_HWY_H_
diff --git a/aom_dsp/x86/sad_avx2.c b/aom_dsp/x86/sad_avx2.c
index f19ff05..ee02380 100644
--- a/aom_dsp/x86/sad_avx2.c
+++ b/aom_dsp/x86/sad_avx2.c
@@ -101,11 +101,17 @@
h / 2); \
}
+#if CONFIG_HIGHWAY  // Omits FSAD64_H; presumably sad_hwy_avx2.cc covers it.
+#define FSAD64 \
+ FSADS64_H(64) \
+ FSADS64_H(32)
+#else  // !CONFIG_HIGHWAY
#define FSAD64 \
FSAD64_H(64) \
FSAD64_H(32) \
FSADS64_H(64) \
FSADS64_H(32)
+#endif  // CONFIG_HIGHWAY
#define FSAD32 \
FSAD32_H(64) \
diff --git a/aom_dsp/x86/sad_hwy_avx2.cc b/aom_dsp/x86/sad_hwy_avx2.cc
new file mode 100644
index 0000000..2df2646
--- /dev/null
+++ b/aom_dsp/x86/sad_hwy_avx2.cc
@@ -0,0 +1,17 @@
+/*
+ * Copyright (c) 2025, Alliance for Open Media. All rights reserved.
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#define HWY_BASELINE_TARGETS HWY_AVX2  // Highway baseline for this file: AVX2.
+#define HWY_BROKEN_32BIT 0  // Presumably: no target disabled on 32-bit (TODO).
+
+#include "aom_dsp/sad_hwy.h"
+
+FOR_EACH_SAD_BLOCK_SIZE(FSAD, avx2)  // Defines aom_sad<w>x<h>_avx2() per size.
diff --git a/aom_dsp/x86/sad_impl_avx2.c b/aom_dsp/x86/sad_impl_avx2.c
index 0d1b5ab..2c6fa24 100644
--- a/aom_dsp/x86/sad_impl_avx2.c
+++ b/aom_dsp/x86/sad_impl_avx2.c
@@ -56,6 +56,7 @@
return sum;
}
+#if !CONFIG_HIGHWAY
unsigned int aom_sad128x64_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {
unsigned int half_width = 64;
@@ -83,6 +84,7 @@
sum += aom_sad128x64_avx2(src_ptr, src_stride, ref_ptr, ref_stride);
return sum;
}
+#endif
unsigned int aom_sad_skip_128x64_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride) {