blob: 58e18b0f729300e0b5fdaa7582bb953d7ee80010 [file]
/*
* Copyright (c) 2026, Alliance for Open Media. All rights reserved.
*
* This source code is subject to the terms of the BSD 2 Clause License and
* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
* was not distributed with this source code in the LICENSE file, you can
* obtain it at www.aomedia.org/license/software. If the Alliance for Open
* Media Patent License 1.0 was not distributed with this source code in the
* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
*/
#ifndef AOM_AOM_DSP_CONVOLVE_HWY_H_
#define AOM_AOM_DSP_CONVOLVE_HWY_H_
#include <cassert>
#include "aom_dsp/arm/aom_filter.h"
#include "third_party/highway/hwy/highway.h"
HWY_BEFORE_NAMESPACE();
namespace {
namespace HWY_NAMESPACE {
namespace hn = hwy::HWY_NAMESPACE;
// Gathers a 4x4 block of 8-bit pixels (four rows, `stride` bytes apart) into
// one vector of tag16's lane type. Each pixel is widened to int16 and the
// block is laid out row-major in a 16-entry staging array: row 0 fills
// entries 0..3, row 1 entries 4..7, and so on. The array is then loaded with
// an aligned hn::Load, so the whole block is returned only when tag16
// describes 16 lanes.
template <typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x4(D tag16, const uint8_t *buf,
                                                   ptrdiff_t stride) {
  HWY_ALIGN int16_t widened[16];
  int16_t *next_lane = widened;
  for (int row = 0; row < 4; ++row, buf += stride) {
    const uint8_t *const row_ptr = buf;
    for (int col = 0; col < 4; ++col) {
      *next_lane++ = row_ptr[col];
    }
  }
  return hn::Load(tag16, widened);
}
// Narrows `vec` to 8-bit pixels (hn::DemoteTo) and writes it back as four
// rows that are `stride` bytes apart. The narrowed vector is split into
// quarters in lane order: the lowest quarter goes to `buf`, the next one to
// `buf + stride`, and so on. This is the inverse of the row-major layout
// produced by LoadUnaligned4x4.
//
// `tag16` only drives template deduction. `vec` is read-only and therefore
// taken by const reference, so both lvalues and temporaries can be passed.
template <typename D>
HWY_ATTR HWY_INLINE void StoreUnaligned4x4(D tag16, uint8_t *buf,
                                           ptrdiff_t stride,
                                           const hn::VFromD<D> &vec) {
  (void)tag16;
  hn::Rebind<uint8_t, D> tag8;
  constexpr hn::Half<decltype(tag8)> half_tag;
  constexpr hn::Half<decltype(half_tag)> quarter_tag;
  const auto pixels = hn::DemoteTo(tag8, vec);
  // Lower half holds rows 0 and 1, upper half rows 2 and 3.
  const auto rows01 = hn::LowerHalf(half_tag, pixels);
  const auto rows23 = hn::UpperHalf(half_tag, pixels);
  hn::StoreU(hn::LowerHalf(quarter_tag, rows01), quarter_tag, buf);
  hn::StoreU(hn::UpperHalf(quarter_tag, rows01), quarter_tag, buf + stride);
  hn::StoreU(hn::LowerHalf(quarter_tag, rows23), quarter_tag,
             buf + 2 * stride);
  hn::StoreU(hn::UpperHalf(quarter_tag, rows23), quarter_tag,
             buf + 3 * stride);
}
// Reads two rows (`buf` and `buf + stride`), each half as wide as the 8-bit
// vector that corresponds to tag16, and widens them into a single vector of
// tag16's lane type. hn::Combine puts its first vector argument in the upper
// half, so row 0 ends up in the upper lanes and row 1 in the lower lanes;
// StoreUnaligned2x8 reverses exactly this arrangement.
template <typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned2x8(D tag16, const uint8_t *buf,
                                                   ptrdiff_t stride) {
  hn::Rebind<uint8_t, D> tag8;
  constexpr hn::Half<decltype(tag8)> row_tag;
  const uint8_t *const next_row = buf + stride;
  const auto row0 = hn::LoadU(row_tag, buf);
  const auto row1 = hn::LoadU(row_tag, next_row);
  const auto both_rows = hn::Combine(tag8, row0, row1);
  return hn::PromoteTo(tag16, both_rows);
}
// Narrows `vec` to 8-bit pixels (hn::DemoteTo) and writes it as two rows: the
// upper half of the narrowed vector goes to `buf`, the lower half to
// `buf + stride`. This matches LoadUnaligned2x8, which places row 0 in the
// upper half.
//
// `tag` only drives template deduction. `vec` is read-only and therefore
// taken by const reference, so both lvalues and temporaries can be passed.
template <typename D>
HWY_ATTR HWY_INLINE void StoreUnaligned2x8(D tag, uint8_t *buf,
                                           ptrdiff_t stride,
                                           const hn::VFromD<D> &vec) {
  (void)tag;
  hn::Rebind<uint8_t, D> tag8;
  constexpr hn::Half<decltype(tag8)> half_tag8;
  const auto pixels = hn::DemoteTo(tag8, vec);
  hn::StoreU(hn::UpperHalf(half_tag8, pixels), half_tag8, buf);
  hn::StoreU(hn::LowerHalf(half_tag8, pixels), half_tag8, buf + stride);
}
// Reads four rows (`stride` bytes apart), each a quarter as wide as the 8-bit
// vector that corresponds to scalable_tag, and widens them into one vector of
// scalable_tag's lane type. Rows are paired with hn::Combine, whose first
// vector argument becomes the upper half: rows 0/1 form the upper half of the
// result (row 0 topmost) and rows 2/3 the lower half. StoreUnaligned4x8
// reverses exactly this arrangement.
template <typename D>
HWY_ATTR HWY_INLINE hn::VFromD<D> LoadUnaligned4x8(D scalable_tag,
                                                   const uint8_t *buf,
                                                   ptrdiff_t stride) {
  hn::Rebind<uint8_t, D> tag8;
  constexpr hn::Half<decltype(tag8)> pair_tag;
  constexpr hn::Half<decltype(pair_tag)> row_tag;
  const auto row0 = hn::LoadU(row_tag, buf);
  const auto row1 = hn::LoadU(row_tag, buf + stride);
  const auto row2 = hn::LoadU(row_tag, buf + 2 * stride);
  const auto row3 = hn::LoadU(row_tag, buf + 3 * stride);
  const auto rows01 = hn::Combine(pair_tag, row0, row1);
  const auto rows23 = hn::Combine(pair_tag, row2, row3);
  return hn::PromoteTo(scalable_tag, hn::Combine(tag8, rows01, rows23));
}
// Narrows `vec` to 8-bit pixels (hn::DemoteTo) and writes it as four rows
// that are `stride` bytes apart. Quarters are written from the top of the
// vector downwards: the upper quarter of the upper half goes to `buf`, the
// lower quarter of the upper half to `buf + stride`, then the upper and lower
// quarters of the lower half. This matches the layout built by
// LoadUnaligned4x8.
//
// `tag` only drives template deduction. `vec` is read-only and therefore
// taken by const reference, so both lvalues and temporaries can be passed.
template <typename D>
HWY_ATTR HWY_INLINE void StoreUnaligned4x8(D tag, uint8_t *buf,
                                           ptrdiff_t stride,
                                           const hn::VFromD<D> &vec) {
  (void)tag;
  hn::Rebind<uint8_t, D> tag8;
  constexpr hn::Half<decltype(tag8)> half_tag8;
  constexpr hn::Half<decltype(half_tag8)> quarter_tag8;
  const auto pixels = hn::DemoteTo(tag8, vec);
  // Upper half holds rows 0 and 1, lower half rows 2 and 3.
  const auto rows01 = hn::UpperHalf(half_tag8, pixels);
  const auto rows23 = hn::LowerHalf(half_tag8, pixels);
  hn::StoreU(hn::UpperHalf(quarter_tag8, rows01), quarter_tag8, buf);
  hn::StoreU(hn::LowerHalf(quarter_tag8, rows01), quarter_tag8, buf + stride);
  hn::StoreU(hn::UpperHalf(quarter_tag8, rows23), quarter_tag8,
             buf + 2 * stride);
  hn::StoreU(hn::LowerHalf(quarter_tag8, rows23), quarter_tag8,
             buf + 3 * stride);
}
// Horizontal 2-tap filter over a w x h block.
//
// Only filter_x[3] and filter_x[4] are read. Every output pixel is
//   (src[x] * filter_x[3] + src[x + 1] * filter_x[4]), rounded and shifted
//   right by FILTER_BITS, then narrowed to 8 bits with hn::DemoteTo.
// Rows are processed one full vector at a time; the last, partial vector of a
// row is written with hn::StoreN so no byte beyond column w - 1 is stored.
// NOTE(review): the source loads are always full-width, so the final vector
// of a row reads past column w; presumably callers provide readable padding —
// confirm.
HWY_ATTR inline void ConvolveHoriz2Tap(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int w, int h) {
  hn::ScalableTag<int16_t> mul_tag;
  hn::Rebind<uint8_t, decltype(mul_tag)> pixel_tag;
  const auto tap_left = hn::Set(mul_tag, filter_x[3]);
  const auto tap_right = hn::Set(mul_tag, filter_x[4]);
  const int lanes = static_cast<int>(hn::Lanes(mul_tag));
  for (int row = 0; row < h; ++row, src += src_stride, dst += dst_stride) {
    for (int col = 0; col < w; col += lanes) {
      const auto left =
          hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, src + col));
      const auto right =
          hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, src + col + 1));
      const auto weighted =
          hn::Add(hn::Mul(left, tap_left), hn::Mul(right, tap_right));
      const auto pixels = hn::DemoteTo(
          pixel_tag, hn::RoundingShiftRight<FILTER_BITS>(weighted));
      const int remaining = w - col;
      if (remaining < lanes) {
        // Partial vector at the end of the row.
        hn::StoreN(pixels, pixel_tag, dst + col, remaining);
      } else {
        hn::StoreU(pixels, pixel_tag, dst + col);
      }
    }
  }
}
// 4-tap multiply-accumulate on already-widened samples.
//
// s0..s3 are the source vectors for taps 0..3, and lanes 0..3 of `filter`
// hold the (pre-halved) tap values. Each tap is broadcast across tag16, the
// four products are summed in tag16's lane type, and the sum is rounded and
// shifted right by FILTER_BITS - 1.
//
// `tag_filter` only drives template deduction. All vector arguments are
// read-only and therefore taken by const reference, so both lvalues and
// temporaries can be passed.
template <typename D, typename DFilter>
HWY_ATTR HWY_INLINE hn::VFromD<D> Convolve4_8(
    D tag16, DFilter tag_filter, const hn::VFromD<D> &s0,
    const hn::VFromD<D> &s1, const hn::VFromD<D> &s2, const hn::VFromD<D> &s3,
    const hn::VFromD<DFilter> &filter) {
  (void)tag_filter;
  const auto mul0 = hn::Mul(s0, hn::Set(tag16, hn::ExtractLane(filter, 0)));
  const auto mul1 = hn::Mul(s1, hn::Set(tag16, hn::ExtractLane(filter, 1)));
  const auto mul2 = hn::Mul(s2, hn::Set(tag16, hn::ExtractLane(filter, 2)));
  const auto mul3 = hn::Mul(s3, hn::Set(tag16, hn::ExtractLane(filter, 3)));
  const auto res = mul0 + mul1 + mul2 + mul3;
  // Shift (FILTER_BITS - 1) because filter values were halved.
  return hn::RoundingShiftRight<FILTER_BITS - 1>(res);
}
// Horizontal 4-tap filter over a w x h block.
//
// Reads taps filter_x[2..5]; `src` must point at the sample that filter_x[2]
// multiplies. The taps are halved up front and Convolve4_8 compensates by
// shifting one bit less.
//
// Fast paths (all written for exactly 16 int16 lanes):
//   w == 4  : one 4x4 block per iteration, h a positive multiple of 4
//   w == 8  : one 2x8 block per iteration, h a positive multiple of 2
//   w == 16 : one row per iteration, h positive
// The fast paths use do/while loops that step a fixed number of rows, so they
// are only entered when those preconditions hold; otherwise they would write
// rows beyond the requested block. Every other combination goes through the
// generic path, which computes the same per-pixel result one full vector at
// a time and uses hn::StoreN for the partial vector at the end of a row.
// NOTE(review): the generic path always loads full vectors from src, so it
// reads past column w + 3 on the last vector of a row; presumably callers
// provide readable padding — confirm.
HWY_ATTR inline void ConvolveHoriz4Tap(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int w, int h) {
  hn::CappedTag<int16_t, 16> tag16;
  hn::CappedTag<int16_t, 4> filter_tag;
  auto f_vec = hn::LoadU(filter_tag, filter_x + 2);
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  f_vec = hn::ShiftRight<1>(f_vec);
  // The block helpers and the 1x16 path assume a 16-lane int16 vector.
  const bool has_16_lanes = hn::Lanes(tag16) == 16;
  if (has_16_lanes && w == 4 && h > 0 && h % 4 == 0) {
    // Each iteration processes a 4x4 block
    do {
      auto src0 = LoadUnaligned4x4(tag16, src, src_stride);
      auto src1 = LoadUnaligned4x4(tag16, src + 1, src_stride);
      auto src2 = LoadUnaligned4x4(tag16, src + 2, src_stride);
      auto src3 = LoadUnaligned4x4(tag16, src + 3, src_stride);
      auto result =
          Convolve4_8(tag16, filter_tag, src0, src1, src2, src3, f_vec);
      StoreUnaligned4x4(tag16, dst, dst_stride, result);
      h -= 4;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
    } while (h > 0);
  } else if (has_16_lanes && w == 8 && h > 0 && h % 2 == 0) {
    // Each iteration processes a 2x8 block
    do {
      auto src0 = LoadUnaligned2x8(tag16, src, src_stride);
      auto src1 = LoadUnaligned2x8(tag16, src + 1, src_stride);
      auto src2 = LoadUnaligned2x8(tag16, src + 2, src_stride);
      auto src3 = LoadUnaligned2x8(tag16, src + 3, src_stride);
      auto result =
          Convolve4_8(tag16, filter_tag, src0, src1, src2, src3, f_vec);
      StoreUnaligned2x8(tag16, dst, dst_stride, result);
      h -= 2;
      src += 2 * src_stride;
      dst += 2 * dst_stride;
    } while (h > 0);
  } else if (has_16_lanes && w == 16 && h > 0) {
    // One 1x16 block a time
    do {
      hn::Rebind<uint8_t, decltype(tag16)> tag8;
      auto src0 = hn::PromoteTo(tag16, hn::LoadU(tag8, src));
      auto src1 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 1));
      auto src2 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 2));
      auto src3 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 3));
      auto result =
          Convolve4_8(tag16, filter_tag, src0, src1, src2, src3, f_vec);
      hn::StoreU(hn::DemoteTo(tag8, result), tag8, dst);
      h--;
      src += src_stride;
      dst += dst_stride;
    } while (h > 0);
  } else {
    // Generic path: any width and height, one full vector per step.
    hn::ScalableTag<int16_t> mul_tag;
    hn::Rebind<uint8_t, decltype(mul_tag)> pixel_tag;
    const int vw = static_cast<int>(hn::Lanes(mul_tag));
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; j += vw) {
        auto src0 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j]));
        auto src1 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 1]));
        auto src2 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 2]));
        auto src3 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 3]));
        auto result =
            Convolve4_8(mul_tag, filter_tag, src0, src1, src2, src3, f_vec);
        auto result_demoted = hn::DemoteTo(pixel_tag, result);
        if (j + vw > w) {
          // Partial vector at the end of the row.
          hn::StoreN(result_demoted, pixel_tag, &dst[j], w - j);
        } else {
          hn::StoreU(result_demoted, pixel_tag, &dst[j]);
        }
      }
      src += src_stride;
      dst += dst_stride;
    }
  }
}
// 8-tap multiply-accumulate on already-widened samples.
//
// s0..s7 are the source vectors for taps 0..7, and lanes 0..7 of `filter`
// hold the (pre-halved) tap values. Each tap is broadcast across tag16, the
// eight products are summed in tag16's lane type, and the sum is rounded and
// shifted right by FILTER_BITS - 1.
//
// `tag_filter` only drives template deduction. All vector arguments are
// read-only and therefore taken by const reference, so both lvalues and
// temporaries can be passed.
template <typename D, typename DFilter>
HWY_ATTR HWY_INLINE hn::VFromD<D> Convolve8_8(
    D tag16, DFilter tag_filter, const hn::VFromD<D> &s0,
    const hn::VFromD<D> &s1, const hn::VFromD<D> &s2, const hn::VFromD<D> &s3,
    const hn::VFromD<D> &s4, const hn::VFromD<D> &s5, const hn::VFromD<D> &s6,
    const hn::VFromD<D> &s7, const hn::VFromD<DFilter> &filter) {
  (void)tag_filter;
  const auto filter_0 = hn::ExtractLane(filter, 0);
  const auto filter_1 = hn::ExtractLane(filter, 1);
  const auto filter_2 = hn::ExtractLane(filter, 2);
  const auto filter_3 = hn::ExtractLane(filter, 3);
  const auto filter_4 = hn::ExtractLane(filter, 4);
  const auto filter_5 = hn::ExtractLane(filter, 5);
  const auto filter_6 = hn::ExtractLane(filter, 6);
  const auto filter_7 = hn::ExtractLane(filter, 7);
  const auto mul0 = hn::Mul(s0, hn::Set(tag16, filter_0));
  const auto mul1 = hn::Mul(s1, hn::Set(tag16, filter_1));
  const auto mul2 = hn::Mul(s2, hn::Set(tag16, filter_2));
  const auto mul3 = hn::Mul(s3, hn::Set(tag16, filter_3));
  const auto mul4 = hn::Mul(s4, hn::Set(tag16, filter_4));
  const auto mul5 = hn::Mul(s5, hn::Set(tag16, filter_5));
  const auto mul6 = hn::Mul(s6, hn::Set(tag16, filter_6));
  const auto mul7 = hn::Mul(s7, hn::Set(tag16, filter_7));
  const auto res = mul0 + mul1 + mul2 + mul3 + mul4 + mul5 + mul6 + mul7;
  // Shift (FILTER_BITS - 1) because filter values were halved.
  return hn::RoundingShiftRight<FILTER_BITS - 1>(res);
}
// 32-byte-aligned table of byte indices: eight rows of 16 entries. Each row
// lists the overlapping pairs (k, k + 1) for eight consecutive k, starting at
// k = 0, 2, 4 and 6, and every row appears twice in a row. Nothing else in
// this header refers to the table.
// NOTE(review): the layout looks like byte-shuffle masks for a pairwise
// horizontal filter — confirm the intended use before relying on it.
DECLARE_ALIGNED(32, static const uint8_t, filt_global[]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,           //
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,           //
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,          //
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,          //
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,      //
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12,      //
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14,  //
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
// Horizontal 8-tap filter over a w x h block.
//
// Reads taps filter_x[0..7]; `src` must point at the sample that filter_x[0]
// multiplies. The taps are halved up front and Convolve8_8 compensates by
// shifting one bit less.
//
// Fast paths (all written for exactly 16 int16 lanes):
//   w == 4  : one 4x4 block per iteration, h a positive multiple of 4
//   w == 8  : one 2x8 block per iteration, h a positive multiple of 2
//   w == 16 : one row per iteration, h positive
// The fast paths use do/while loops that step a fixed number of rows, so they
// are only entered when those preconditions hold; otherwise they would write
// rows beyond the requested block. Every other combination goes through the
// generic path, which computes the same per-pixel result one full vector at
// a time and uses hn::StoreN for the partial vector at the end of a row.
// NOTE(review): the generic path always loads full vectors from src, so it
// reads past column w + 7 on the last vector of a row; presumably callers
// provide readable padding — confirm.
HWY_ATTR inline void ConvolveHoriz8Tap(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int w, int h) {
  hn::CappedTag<int16_t, 16> tag16;
  hn::CappedTag<int16_t, 8> filter_tag;
  auto f_vec = hn::LoadU(filter_tag, filter_x);
  // All filter values are even, halve to reduce intermediate precision
  // requirements.
  f_vec = hn::ShiftRight<1>(f_vec);
  // The block helpers and the 1x16 path assume a 16-lane int16 vector.
  const bool has_16_lanes = hn::Lanes(tag16) == 16;
  if (has_16_lanes && w == 4 && h > 0 && h % 4 == 0) {
    // Each iteration processes a 4x4 block
    do {
      auto src0 = LoadUnaligned4x4(tag16, src, src_stride);
      auto src1 = LoadUnaligned4x4(tag16, src + 1, src_stride);
      auto src2 = LoadUnaligned4x4(tag16, src + 2, src_stride);
      auto src3 = LoadUnaligned4x4(tag16, src + 3, src_stride);
      auto src4 = LoadUnaligned4x4(tag16, src + 4, src_stride);
      auto src5 = LoadUnaligned4x4(tag16, src + 5, src_stride);
      auto src6 = LoadUnaligned4x4(tag16, src + 6, src_stride);
      auto src7 = LoadUnaligned4x4(tag16, src + 7, src_stride);
      auto result = Convolve8_8(tag16, filter_tag, src0, src1, src2, src3, src4,
                                src5, src6, src7, f_vec);
      StoreUnaligned4x4(tag16, dst, dst_stride, result);
      h -= 4;
      src += 4 * src_stride;
      dst += 4 * dst_stride;
    } while (h > 0);
  } else if (has_16_lanes && w == 8 && h > 0 && h % 2 == 0) {
    // Each iteration processes a 2x8 block
    do {
      auto src0 = LoadUnaligned2x8(tag16, src, src_stride);
      auto src1 = LoadUnaligned2x8(tag16, src + 1, src_stride);
      auto src2 = LoadUnaligned2x8(tag16, src + 2, src_stride);
      auto src3 = LoadUnaligned2x8(tag16, src + 3, src_stride);
      auto src4 = LoadUnaligned2x8(tag16, src + 4, src_stride);
      auto src5 = LoadUnaligned2x8(tag16, src + 5, src_stride);
      auto src6 = LoadUnaligned2x8(tag16, src + 6, src_stride);
      auto src7 = LoadUnaligned2x8(tag16, src + 7, src_stride);
      auto result = Convolve8_8(tag16, filter_tag, src0, src1, src2, src3, src4,
                                src5, src6, src7, f_vec);
      StoreUnaligned2x8(tag16, dst, dst_stride, result);
      h -= 2;
      src += 2 * src_stride;
      dst += 2 * dst_stride;
    } while (h > 0);
  } else if (has_16_lanes && w == 16 && h > 0) {
    // One 1x16 block a time
    do {
      hn::Rebind<uint8_t, decltype(tag16)> tag8;
      auto src0 = hn::PromoteTo(tag16, hn::LoadU(tag8, src));
      auto src1 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 1));
      auto src2 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 2));
      auto src3 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 3));
      auto src4 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 4));
      auto src5 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 5));
      auto src6 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 6));
      auto src7 = hn::PromoteTo(tag16, hn::LoadU(tag8, src + 7));
      auto result = Convolve8_8(tag16, filter_tag, src0, src1, src2, src3, src4,
                                src5, src6, src7, f_vec);
      hn::StoreU(hn::DemoteTo(tag8, result), tag8, dst);
      h--;
      src += src_stride;
      dst += dst_stride;
    } while (h > 0);
  } else {
    // Generic path: any width and height, one full vector per step.
    // This tag will have 32 lanes (for avx512) or 16 lanes (for avx2)
    hn::ScalableTag<int16_t> mul_tag;
    hn::Rebind<uint8_t, decltype(mul_tag)> pixel_tag;
    const int vw = static_cast<int>(hn::Lanes(mul_tag));
    for (int i = 0; i < h; ++i) {
      for (int j = 0; j < w; j += vw) {
        auto src0 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j]));
        auto src1 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 1]));
        auto src2 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 2]));
        auto src3 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 3]));
        auto src4 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 4]));
        auto src5 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 5]));
        auto src6 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 6]));
        auto src7 = hn::PromoteTo(mul_tag, hn::LoadU(pixel_tag, &src[j + 7]));
        auto result = Convolve8_8(mul_tag, filter_tag, src0, src1, src2, src3,
                                  src4, src5, src6, src7, f_vec);
        auto result_demoted = hn::DemoteTo(pixel_tag, result);
        if (j + vw > w) {
          // Partial vector at the end of the row.
          hn::StoreN(result_demoted, pixel_tag, &dst[j], w - j);
        } else {
          hn::StoreU(result_demoted, pixel_tag, &dst[j]);
        }
      }
      src += src_stride;
      dst += dst_stride;
    }
  }
}
// Entry point for the horizontal convolution: selects a kernel from the tap
// count reported by get_filter_taps_convolve8(filter_x).
//
// x_step_q4, filter_y and y_step_q4 are accepted for signature compatibility
// and ignored. In debug builds, dst and dst_stride are asserted to be
// multiples of 4.
//
// The source pointer is first moved back by (SUBPEL_TAPS / 2) - 1 samples.
// The 2-tap and 4-tap kernels then receive it advanced by 3 and 2 samples
// respectively, matching the filter_x[3..4] and filter_x[2..5] taps they
// read; any other tap count uses the 8-tap kernel on the unadjusted window.
HWY_MAYBE_UNUSED void Convolve8Horiz(const uint8_t *src, ptrdiff_t src_stride,
                                     uint8_t *dst, ptrdiff_t dst_stride,
                                     const int16_t *filter_x, int x_step_q4,
                                     const int16_t *filter_y, int y_step_q4,
                                     int w, int h) {
  assert((intptr_t)dst % 4 == 0);
  assert(dst_stride % 4 == 0);
  (void)x_step_q4;
  (void)filter_y;
  (void)y_step_q4;
  const uint8_t *const window = src - ((SUBPEL_TAPS / 2) - 1);
  switch (get_filter_taps_convolve8(filter_x)) {
    case 2:
      ConvolveHoriz2Tap(window + 3, src_stride, dst, dst_stride, filter_x, w,
                        h);
      break;
    case 4:
      ConvolveHoriz4Tap(window + 2, src_stride, dst, dst_stride, filter_x, w,
                        h);
      break;
    default:
      // Every remaining tap count is served by the full 8-tap kernel.
      ConvolveHoriz8Tap(window, src_stride, dst, dst_stride, filter_x, w, h);
      break;
  }
}
} // namespace HWY_NAMESPACE
} // namespace
// CONVOLVE8HORIZ(suffix) emits the C-linkage function
// aom_convolve8_horiz_<suffix>: first an extern "C" declaration, then a
// HWY_ATTR definition that forwards all ten arguments unchanged to
// HWY_NAMESPACE::Convolve8Horiz. HWY_NAMESPACE is resolved where the macro is
// expanded, so the expansion binds to whichever target namespace is active
// there.
#define CONVOLVE8HORIZ(suffix) \
extern "C" void aom_convolve8_horiz_##suffix( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h); \
HWY_ATTR void aom_convolve8_horiz_##suffix( \
const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \
ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \
const int16_t *filter_y, int y_step_q4, int w, int h) { \
HWY_NAMESPACE::Convolve8Horiz(src, src_stride, dst, dst_stride, filter_x, \
x_step_q4, filter_y, y_step_q4, w, h); \
}
HWY_AFTER_NAMESPACE();
#endif // AOM_AOM_DSP_CONVOLVE_HWY_H_