| /* |
| * Copyright (c) 2025, Alliance for Open Media. All rights reserved. |
| * |
| * This source code is subject to the terms of the BSD 2 Clause License and |
| * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| * was not distributed with this source code in the LICENSE file, you can |
| * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| * Media Patent License 1.0 was not distributed with this source code in the |
| * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
| */ |
| |
| #ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ |
| #define AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ |
| |
| #include <stdint.h> |
| |
| #include "config/aom_config.h" |
| #include "config/av1_rtcd.h" |
| #include "third_party/highway/hwy/highway.h" |
| #include "aom_dsp/txfm_common.h" |
| #include "av1/common/av1_txfm.h" |
| #include "av1/common/enums.h" |
| #include "av1/encoder/av1_fwd_txfm1d.h" |
| #include "av1/encoder/av1_fwd_txfm1d_cfg.h" |
| |
// X-macro over every transform block size handled by this file. Expands to
// EMIT(a, b, suffix) once per size, with the (a, b) pairs listed in the same
// order as the kTxSizeWide / kTxSizeHigh tables below.
#define FOR_EACH_TXFM2D(EMIT, suffix) \
  /* Square sizes. */                 \
  EMIT(4, 4, suffix)                  \
  EMIT(8, 8, suffix)                  \
  EMIT(16, 16, suffix)                \
  EMIT(32, 32, suffix)                \
  EMIT(64, 64, suffix)                \
  /* 1:2 and 2:1 rectangles. */       \
  EMIT(4, 8, suffix)                  \
  EMIT(8, 4, suffix)                  \
  EMIT(8, 16, suffix)                 \
  EMIT(16, 8, suffix)                 \
  EMIT(16, 32, suffix)                \
  EMIT(32, 16, suffix)                \
  EMIT(32, 64, suffix)                \
  EMIT(64, 32, suffix)                \
  /* 1:4 and 4:1 rectangles. */       \
  EMIT(4, 16, suffix)                 \
  EMIT(16, 4, suffix)                 \
  EMIT(8, 32, suffix)                 \
  EMIT(32, 8, suffix)                 \
  EMIT(16, 64, suffix)                \
  EMIT(64, 16, suffix)
| |
// `if CONSTEXPR_IF (cond)` becomes `if constexpr (cond)` when the language
// level reported by HWY_CXX_LANG is C++17 or newer, and a plain `if (cond)`
// otherwise.
#if HWY_CXX_LANG < 201703L
#define CONSTEXPR_IF
#else
#define CONSTEXPR_IF constexpr
#endif
| |
| HWY_BEFORE_NAMESPACE(); |
| |
| namespace { |
| namespace HWY_NAMESPACE { |
| |
| namespace hn = hwy::HWY_NAMESPACE; |
| |
| constexpr int8_t kForwardTransformShift[TX_SIZES_ALL][3] = { |
| { 2, 0, 0 }, // |
| { 2, -1, 0 }, // |
| { 2, -2, 0 }, // |
| { 2, -4, 0 }, // |
| { 0, -2, -2 }, // |
| { 2, -1, 0 }, // |
| { 2, -1, 0 }, // |
| { 2, -2, 0 }, // |
| { 2, -2, 0 }, // |
| { 2, -4, 0 }, // |
| { 2, -4, 0 }, // |
| { 0, -2, -2 }, // |
| { 2, -4, -2 }, // |
| { 2, -1, 0 }, // |
| { 2, -1, 0 }, // |
| { 2, -2, 0 }, // |
| { 2, -2, 0 }, // |
| { 0, -2, 0 }, // |
| { 2, -4, 0 }, // |
| }; |
| |
| constexpr int kTxSizeWideLog2[TX_SIZES_ALL] = { |
| 2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, |
| }; |
| |
| // Transform block height in log2 |
| constexpr int kTxSizeHighLog2[TX_SIZES_ALL] = { |
| 2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, |
| }; |
| |
| constexpr bool kApplyRectScaleList[TX_SIZES_ALL] = { |
| false, false, false, false, false, true, true, true, true, true, |
| true, true, true, false, false, false, false, false, false, |
| }; |
| |
| constexpr int8_t kForwardCosBitCol[MAX_TXWH_IDX /*txw_idx*/] |
| [MAX_TXWH_IDX /*txh_idx*/] = { |
| { 13, 13, 13, 0, 0 }, |
| { 13, 13, 13, 12, 0 }, |
| { 13, 13, 13, 12, 13 }, |
| { 0, 13, 13, 12, 13 }, |
| { 0, 0, 13, 12, 13 } |
| }; |
| |
| constexpr int8_t kForwardCosBitRow[MAX_TXWH_IDX /*txw_idx*/] |
| [MAX_TXWH_IDX /*txh_idx*/] = { |
| { 13, 13, 12, 0, 0 }, |
| { 13, 13, 13, 12, 0 }, |
| { 13, 13, 12, 13, 12 }, |
| { 0, 12, 13, 12, 11 }, |
| { 0, 0, 12, 11, 10 } |
| }; |
| |
| // Transform block width in pixels |
| constexpr int8_t kTxSizeWide[TX_SIZES_ALL] = { |
| 4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, |
| }; |
| |
| // Transform block height in pixels |
| constexpr int8_t kTxSizeHigh[TX_SIZES_ALL] = { |
| 4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, |
| }; |
| |
| constexpr int GetTxwIndex(TX_SIZE tx_size) { |
| return kTxSizeWideLog2[tx_size] - kTxSizeWideLog2[0]; |
| } |
| |
| constexpr int GetTxhIndex(TX_SIZE tx_size) { |
| return kTxSizeHighLog2[tx_size] - kTxSizeHighLog2[0]; |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> SetPair(D int_tag, int a, int b) { |
| return hn::BitCast( |
| int_tag, |
| hn::Set(hn::RepartitionToWide<D>(), |
| static_cast<int32_t>( |
| static_cast<uint16_t>(a) | |
| (static_cast<uint32_t>(static_cast<uint16_t>(b)) << 16)))); |
| } |
| |
// Butterfly kernels selected by lane size in bytes. The primary template is
// intentionally empty; only the specializations provide Whole() and Half().
template <size_t LaneBytes>
struct ButterflyTraits {};
| |
| template <> |
| struct ButterflyTraits<2> { |
| template <typename D> |
| HWY_ATTR HWY_INLINE static void Whole( |
| D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, |
| const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0, |
| hn::TFromD<D> *HWY_RESTRICT out1, int bit, |
| hn::VFromD<hn::Repartition<int32_t, D>> round) { |
| constexpr hn::RepartitionToWide<D> int32_tag; |
| const auto ww0 = SetPair(int_tag, w0, w1); |
| const auto ww1 = SetPair(int_tag, w1, -w0); |
| const auto i0 = hn::Load(int_tag, in0); |
| const auto i1 = hn::Load(int_tag, in1); |
| const auto t0 = hn::InterleaveLower(int_tag, i0, i1); |
| const auto t1 = hn::InterleaveUpper(int_tag, i0, i1); |
| const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0); |
| const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0); |
| const auto v0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww1); |
| const auto v1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww1); |
| const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit); |
| const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit); |
| const auto d0 = hn::ShiftRightSame(hn::Add(v0, round), bit); |
| const auto d1 = hn::ShiftRightSame(hn::Add(v1, round), bit); |
| hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out0); |
| hn::Store(hn::ReorderDemote2To(int_tag, d0, d1), int_tag, out1); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE static void Half( |
| D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, |
| const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out, |
| int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) { |
| constexpr hn::RepartitionToWide<D> int32_tag; |
| const auto i0 = hn::Load(int_tag, in0); |
| const auto i1 = hn::Load(int_tag, in1); |
| const auto t0 = hn::InterleaveLower(int_tag, i0, i1); |
| const auto t1 = hn::InterleaveUpper(int_tag, i0, i1); |
| const auto ww0 = SetPair(int_tag, w0, w1); |
| const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0); |
| const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0); |
| const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit); |
| const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit); |
| hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out); |
| } |
| }; |
| |
| template <> |
| struct ButterflyTraits<4> { |
| template <typename D> |
| HWY_ATTR HWY_INLINE static void Whole( |
| D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, |
| const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0, |
| hn::TFromD<D> *HWY_RESTRICT out1, int bit, |
| hn::VFromD<hn::Repartition<int32_t, D>> round) { |
| const auto i0 = hn::Load(int_tag, in0); |
| const auto i1 = hn::Load(int_tag, in1); |
| const auto ww0 = hn::Set(int_tag, w0); |
| const auto ww1 = hn::Set(int_tag, w1); |
| const auto in1_w1 = hn::Mul(i1, ww1); |
| const auto o0 = hn::MulAdd(i0, ww0, in1_w1); |
| hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out0); |
| const auto in1_w0 = hn::Mul(i1, ww0); |
| const auto o1 = hn::MulSub(i0, ww1, in1_w0); |
| hn::Store(hn::ShiftRightSame(hn::Add(o1, round), bit), int_tag, out1); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE static void Half( |
| D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, |
| const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out, |
| int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) { |
| const auto i0 = hn::Load(int_tag, in0); |
| const auto i1 = hn::Load(int_tag, in1); |
| const auto ww0 = hn::Set(int_tag, w0); |
| const auto ww1 = hn::Set(int_tag, w1); |
| const auto in1_w1 = hn::Mul(i1, ww1); |
| const auto o0 = hn::MulAdd(i0, ww0, in1_w1); |
| hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out); |
| } |
| }; |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void Butterfly( |
| D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, |
| const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0, |
| hn::TFromD<D> *HWY_RESTRICT out1, int bit, |
| hn::VFromD<hn::Repartition<int32_t, D>> round) { |
| ButterflyTraits<sizeof(hn::TFromD<D>)>::Whole(int_tag, w0, w1, in0, in1, out0, |
| out1, bit, round); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void HalfButterfly( |
| D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, |
| const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out, |
| int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) { |
| ButterflyTraits<sizeof(hn::TFromD<D>)>::Half(int_tag, w0, w1, in0, in1, out, |
| bit, round); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void AddSub(D int_tag, const hn::TFromD<D> *in0, |
| const hn::TFromD<D> *in1, |
| hn::TFromD<D> *out_add, |
| hn::TFromD<D> *out_sub) { |
| const auto i0 = hn::Load(int_tag, in0); |
| const auto i1 = hn::Load(int_tag, in1); |
| if CONSTEXPR_IF (sizeof(hn::TFromD<D>) == 2) { |
| hn::Store(hn::SaturatedAdd(i0, i1), int_tag, out_add); |
| hn::Store(hn::SaturatedSub(i0, i1), int_tag, out_sub); |
| } else { |
| hn::Store(hn::Add(i0, i1), int_tag, out_add); |
| hn::Store(hn::Sub(i0, i1), int_tag, out_sub); |
| } |
| } |
| |
| template <size_t LaneSize, size_t NumLanes> |
| struct Fdct4Traits { |
| template <typename D> |
| HWY_ATTR HWY_INLINE static void Fdct4(D int_tag, |
| hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, size_t instride) { |
| using T = hn::TFromD<D>; |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| HWY_ALIGN_MAX T buf0[4 * kNumLanes]; |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| constexpr hn::Repartition<int32_t, D> int32_tag; |
| const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); |
| AddSub(int_tag, &in[0 * instride], &in[3 * instride], &buf0[0 * kNumLanes], |
| &buf0[3 * kNumLanes]); |
| AddSub(int_tag, &in[1 * instride], &in[2 * instride], &buf0[1 * kNumLanes], |
| &buf0[2 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &in[0 * instride], &in[2 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], |
| &buf0[2 * kNumLanes], &in[1 * instride], &in[3 * instride], |
| cos_bit, round); |
| } |
| }; |
| |
| template <> |
| struct Fdct4Traits<2, 4> { |
| template <typename D> |
| HWY_ATTR HWY_INLINE static void Fdct4(D int_tag, |
| hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, size_t instride) { |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag; |
| constexpr hn::Repartition<int32_t, decltype(demote_tag)> int32_tag; |
| const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); |
| const auto cospi_p32_p32 = SetPair(int_tag, cospi[32], cospi[32]); |
| const auto cospi_p32_m32 = SetPair(int_tag, cospi[32], -cospi[32]); |
| const auto cospi_p16_p48 = SetPair(int_tag, cospi[16], cospi[48]); |
| const auto cospi_p48_m16 = SetPair(int_tag, cospi[48], -cospi[16]); |
| const auto i0 = hn::Load(int_tag, &in[0 * instride]); |
| const auto i1 = hn::Load(int_tag, &in[1 * instride]); |
| const auto i2 = hn::Load(int_tag, &in[2 * instride]); |
| const auto i3 = hn::Load(int_tag, &in[3 * instride]); |
| const auto u0 = hn::InterleaveLower(int_tag, i0, i1); |
| const auto u1 = hn::InterleaveLower(int_tag, i3, i2); |
| const auto v0 = hn::Add(u0, u1); |
| const auto v1 = hn::Sub(u0, u1); |
| const auto x0 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_p32); |
| const auto x1 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_m32); |
| const auto x2 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p16_p48); |
| const auto x3 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p48_m16); |
| const auto v0w0 = hn::ShiftRightSame(hn::Add(x0, round), cos_bit); |
| const auto v0w1 = hn::ShiftRightSame(hn::Add(x1, round), cos_bit); |
| const auto v1w0 = hn::ShiftRightSame(hn::Add(x2, round), cos_bit); |
| const auto v1w1 = hn::ShiftRightSame(hn::Add(x3, round), cos_bit); |
| const auto o0 = hn::ReorderDemote2To(demote_tag, v0w0, v0w1); |
| const auto o1 = hn::ReorderDemote2To(demote_tag, v1w0, v1w1); |
| hn::Store(o0, demote_tag, &in[0 * instride]); |
| hn::Store(o1, demote_tag, &in[1 * instride]); |
| hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag, |
| &in[2 * instride]); |
| hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag, |
| &in[3 * instride]); |
| } |
| }; |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void Fdct4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, size_t instride) { |
| Fdct4Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(int_tag)>::Fdct4( |
| int_tag, in, cos_bit, instride); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void Fdct8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, size_t instride) { |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes]; |
| HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes]; |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); |
| |
| // Even 8 points 0, 2, ..., 14 |
| // stage 0 |
| // stage 1 |
| // buf0/buf1 |
| AddSub(int_tag, &in[0 * instride], &in[7 * instride], &buf0[0 * kNumLanes], |
| &buf1[7 * kNumLanes]); |
| // buf0/buf0 |
| AddSub(int_tag, &in[1 * instride], &in[6 * instride], &buf0[1 * kNumLanes], |
| &buf0[6 * kNumLanes]); |
| // buf0/buf0 |
| AddSub(int_tag, &in[2 * instride], &in[5 * instride], &buf0[2 * kNumLanes], |
| &buf0[5 * kNumLanes]); |
| // buf0/buf1 |
| AddSub(int_tag, &in[3 * instride], &in[4 * instride], &buf0[3 * kNumLanes], |
| &buf1[4 * kNumLanes]); |
| |
| // stage 2 |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes], |
| &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[5 * kNumLanes], |
| &buf0[6 * kNumLanes], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 3 |
| // type 0 |
| Butterfly(int_tag, cospi[32], cospi[32], &buf1[0 * kNumLanes], |
| &buf1[1 * kNumLanes], &in[0 * instride], &in[4 * instride], cos_bit, |
| round); |
| |
| // type 1 |
| Butterfly(int_tag, cospi[16], cospi[48], &buf1[3 * kNumLanes], |
| &buf1[2 * kNumLanes], &in[2 * instride], &in[6 * instride], cos_bit, |
| round); |
| |
| AddSub(int_tag, &buf1[4 * kNumLanes], &buf1[5 * kNumLanes], |
| &buf0[4 * kNumLanes], &buf0[5 * kNumLanes]); |
| AddSub(int_tag, &buf1[7 * kNumLanes], &buf1[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf0[6 * kNumLanes]); |
| |
| // stage 4 |
| // stage 5 |
| Butterfly(int_tag, cospi[8], cospi[56], &buf0[7 * kNumLanes], |
| &buf0[4 * kNumLanes], &in[1 * instride], &in[7 * instride], cos_bit, |
| round); |
| Butterfly(int_tag, cospi[40], cospi[24], &buf0[6 * kNumLanes], |
| &buf0[5 * kNumLanes], &in[5 * instride], &in[3 * instride], cos_bit, |
| round); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void Fdct16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, size_t instride) { |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes]; |
| HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes]; |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); |
| |
| // Calculate the column 0, 1, 2, 3 |
| // stage 0 |
| // stage 1 |
| for (size_t i = 0; i < 8; ++i) { |
| AddSub(int_tag, &in[i * instride], &in[(15 - i) * instride], |
| &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); |
| } |
| |
| // stage 2 |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes], |
| &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes], |
| &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 3 |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes], |
| &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], |
| &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], |
| cos_bit, round); |
| |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], |
| &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], |
| &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); |
| } |
| |
| // stage 4 |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &in[0 * instride], &in[8 * instride], cos_bit, |
| round); |
| |
| Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], |
| &buf0[2 * kNumLanes], &in[4 * instride], &in[12 * instride], |
| cos_bit, round); |
| |
| AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], |
| &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); |
| AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], |
| &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); |
| |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], |
| &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 5 |
| Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], |
| &buf1[4 * kNumLanes], &in[2 * instride], &in[14 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], |
| &buf1[5 * kNumLanes], &in[10 * instride], &in[6 * instride], |
| cos_bit, round); |
| |
| AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], |
| &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); |
| AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); |
| AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], |
| &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); |
| AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); |
| |
| // stage 6 |
| Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], |
| &buf0[8 * kNumLanes], &in[1 * instride], &in[15 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], |
| &buf0[9 * kNumLanes], &in[9 * instride], &in[7 * instride], cos_bit, |
| round); |
| Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], |
| &buf0[10 * kNumLanes], &in[5 * instride], &in[11 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], |
| &buf0[11 * kNumLanes], &in[13 * instride], &in[3 * instride], |
| cos_bit, round); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE void Fdct32(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, size_t instride) { |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| HWY_ALIGN_MAX hn::TFromD<D> buf0[32 * kNumLanes]; |
| HWY_ALIGN_MAX hn::TFromD<D> buf1[32 * kNumLanes]; |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); |
| // stage 0 |
| // stage 1 |
| for (size_t i = 0; i < 16; ++i) { |
| AddSub(int_tag, &in[i * instride], &in[(31 - i) * instride], |
| &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); |
| } |
| |
| // stage 2 |
| for (size_t i = 0; i < 8; ++i) { |
| AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], |
| &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[20 * kNumLanes], |
| &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[21 * kNumLanes], |
| &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[22 * kNumLanes], |
| &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[23 * kNumLanes], |
| &buf1[24 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 3 |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes], |
| &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes], |
| &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], |
| cos_bit, round); |
| |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], |
| &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], |
| &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); |
| } |
| |
| // stage 4 |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes], |
| &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], |
| &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], |
| cos_bit, round); |
| |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], |
| &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], |
| &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); |
| } |
| |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf1[18 * kNumLanes], |
| &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf1[19 * kNumLanes], |
| &buf1[28 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[20 * kNumLanes], |
| &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[21 * kNumLanes], |
| &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 5 |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &in[0 * instride], &in[16 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], |
| &buf0[2 * kNumLanes], &in[8 * instride], &in[24 * instride], |
| cos_bit, round); |
| AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], |
| &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); |
| AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], |
| &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], |
| &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], |
| cos_bit, round); |
| |
| AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[19 * kNumLanes], |
| &buf1[16 * kNumLanes], &buf1[19 * kNumLanes]); |
| AddSub(int_tag, &buf1[17 * kNumLanes], &buf0[18 * kNumLanes], |
| &buf1[17 * kNumLanes], &buf1[18 * kNumLanes]); |
| AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[20 * kNumLanes], |
| &buf1[23 * kNumLanes], &buf1[20 * kNumLanes]); |
| AddSub(int_tag, &buf1[22 * kNumLanes], &buf0[21 * kNumLanes], |
| &buf1[22 * kNumLanes], &buf1[21 * kNumLanes]); |
| AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[27 * kNumLanes], |
| &buf1[24 * kNumLanes], &buf1[27 * kNumLanes]); |
| AddSub(int_tag, &buf1[25 * kNumLanes], &buf0[26 * kNumLanes], |
| &buf1[25 * kNumLanes], &buf1[26 * kNumLanes]); |
| AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[28 * kNumLanes], |
| &buf1[31 * kNumLanes], &buf1[28 * kNumLanes]); |
| AddSub(int_tag, &buf1[30 * kNumLanes], &buf0[29 * kNumLanes], |
| &buf1[30 * kNumLanes], &buf1[29 * kNumLanes]); |
| |
| // stage 6 |
| Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], |
| &buf1[4 * kNumLanes], &in[4 * instride], &in[28 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], |
| &buf1[5 * kNumLanes], &in[20 * instride], &in[12 * instride], |
| cos_bit, round); |
| AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], |
| &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); |
| AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); |
| AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], |
| &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); |
| AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); |
| Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], |
| &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes], |
| &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], |
| &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], |
| &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 7 |
| Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], |
| &buf0[8 * kNumLanes], &in[2 * instride], &in[30 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], |
| &buf0[9 * kNumLanes], &in[18 * instride], &in[14 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], |
| &buf0[10 * kNumLanes], &in[10 * instride], &in[22 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], |
| &buf0[11 * kNumLanes], &in[26 * instride], &in[6 * instride], |
| cos_bit, round); |
| AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], |
| &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); |
| AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], |
| &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]); |
| AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], |
| &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); |
| AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], |
| &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); |
| AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], |
| &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]); |
| AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], |
| &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); |
| AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], |
| &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); |
| AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], |
| &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); |
| |
| // stage 8 & 9 |
| Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], |
| &buf1[16 * kNumLanes], &in[1 * instride], &in[31 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], |
| &buf1[17 * kNumLanes], &in[17 * instride], &in[15 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], |
| &buf1[18 * kNumLanes], &in[9 * instride], &in[23 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], |
| &buf1[19 * kNumLanes], &in[25 * instride], &in[7 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], |
| &buf1[20 * kNumLanes], &in[5 * instride], &in[27 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], |
| &buf1[21 * kNumLanes], &in[21 * instride], &in[11 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], |
| &buf1[22 * kNumLanes], &in[13 * instride], &in[19 * instride], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], |
| &buf1[23 * kNumLanes], &in[29 * instride], &in[3 * instride], |
| cos_bit, round); |
| |
| // stage 9 was fused with prior stages. |
| } |
| |
| template <size_t InStride, size_t OutStride, typename D> |
| HWY_ATTR HWY_NOINLINE void Fdct64(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit) { |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); |
| HWY_ALIGN_MAX hn::TFromD<D> buf0[64 * kNumLanes]; |
| HWY_ALIGN_MAX hn::TFromD<D> buf1[64 * kNumLanes]; |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); |
| |
| // stage 1 |
| #if HWY_TARGET == HWY_SSE4 |
| // For whatever reason, some compilers don't unroll this when building for |
| // SSE4; help them along. |
| HWY_UNROLL(32) |
| #endif |
| for (size_t i = 0; i < 32; ++i) { |
| AddSub(int_tag, &in[i * InStride], &in[(63 - i) * InStride], |
| &buf0[i * kNumLanes], &buf0[(63 - i) * kNumLanes]); |
| } |
| |
| // stage 2 |
| for (size_t i = 0; i < 16; ++i) { |
| AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(31 - i) * kNumLanes], |
| &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 8; ++i) { |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(40 + i) * kNumLanes], |
| &buf0[(55 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], |
| &buf1[(55 - i) * kNumLanes], cos_bit, round); |
| } |
| |
| // stage 3 |
| for (size_t i = 0; i < 8; ++i) { |
| AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], |
| &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[(20 + i) * kNumLanes], |
| &buf1[(27 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], |
| &buf0[(27 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 0; i < 8; ++i) { |
| AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(47 - i) * kNumLanes], |
| &buf0[(32 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 8; ++i) { |
| AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(48 + i) * kNumLanes], |
| &buf0[(63 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes]); |
| } |
| // stage 4 |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[(0 + i) * kNumLanes], &buf0[(7 - i) * kNumLanes], |
| &buf1[(0 + i) * kNumLanes], &buf1[(7 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(10 + i) * kNumLanes], |
| &buf0[(13 - i) * kNumLanes], &buf1[(10 + i) * kNumLanes], |
| &buf1[(13 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], |
| &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], |
| &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf0[(36 + i) * kNumLanes], |
| &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], |
| &buf1[(59 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 4; i < 8; ++i) { |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[(36 + i) * kNumLanes], |
| &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], |
| &buf1[(59 - i) * kNumLanes], cos_bit, round); |
| } |
| // stage 5 |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(3 - i) * kNumLanes], |
| &buf0[(0 + i) * kNumLanes], &buf0[(3 - i) * kNumLanes]); |
| } |
| Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], |
| &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], |
| cos_bit, round); |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], |
| &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], |
| &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf1[(18 + i) * kNumLanes], |
| &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], |
| &buf0[(29 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 2; i < 4; ++i) { |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[(18 + i) * kNumLanes], |
| &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], |
| &buf0[(29 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(39 - i) * kNumLanes], |
| &buf0[(32 + i) * kNumLanes], &buf0[(39 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], |
| &buf0[(47 - i) * kNumLanes], &buf0[(40 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(55 - i) * kNumLanes], |
| &buf0[(48 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(56 + i) * kNumLanes], |
| &buf0[(63 - i) * kNumLanes], &buf0[(56 + i) * kNumLanes]); |
| } |
| // stage 6 |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], &buf1[1 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], |
| &buf0[2 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], |
| cos_bit, round); |
| AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], |
| &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); |
| AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], |
| &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); |
| Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], |
| &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], |
| cos_bit, round); |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(19 - i) * kNumLanes], |
| &buf1[(16 + i) * kNumLanes], &buf1[(19 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(23 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], |
| &buf1[(23 - i) * kNumLanes], &buf1[(20 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(24 + i) * kNumLanes], &buf0[(27 - i) * kNumLanes], |
| &buf1[(24 + i) * kNumLanes], &buf1[(27 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(28 + i) * kNumLanes], |
| &buf1[(31 - i) * kNumLanes], &buf1[(28 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| Butterfly(int_tag, -cospi[8], cospi[56], &buf0[(34 + i) * kNumLanes], |
| &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], |
| &buf1[(61 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 2; i < 4; ++i) { |
| Butterfly(int_tag, -cospi[56], -cospi[8], &buf0[(34 + i) * kNumLanes], |
| &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], |
| &buf1[(61 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| Butterfly(int_tag, -cospi[40], cospi[24], &buf0[(42 + i) * kNumLanes], |
| &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], |
| &buf1[(53 - i) * kNumLanes], cos_bit, round); |
| } |
| for (size_t i = 2; i < 4; ++i) { |
| Butterfly(int_tag, -cospi[24], -cospi[40], &buf0[(42 + i) * kNumLanes], |
| &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], |
| &buf1[(53 - i) * kNumLanes], cos_bit, round); |
| } |
| // stage 7 |
| Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], |
| &buf1[4 * kNumLanes], &buf0[4 * kNumLanes], &buf0[7 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], |
| &buf1[5 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], |
| cos_bit, round); |
| AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], |
| &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); |
| AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); |
| AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], |
| &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); |
| AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); |
| Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], |
| &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes], |
| &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], |
| &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], |
| &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], |
| cos_bit, round); |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(35 - i) * kNumLanes], |
| &buf0[(32 + i) * kNumLanes], &buf0[(35 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(39 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], |
| &buf0[(39 - i) * kNumLanes], &buf0[(36 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(40 + i) * kNumLanes], &buf1[(43 - i) * kNumLanes], |
| &buf0[(40 + i) * kNumLanes], &buf0[(43 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(44 + i) * kNumLanes], |
| &buf0[(47 - i) * kNumLanes], &buf0[(44 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(51 - i) * kNumLanes], |
| &buf0[(48 + i) * kNumLanes], &buf0[(51 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(55 - i) * kNumLanes], &buf1[(52 + i) * kNumLanes], |
| &buf0[(55 - i) * kNumLanes], &buf0[(52 + i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(56 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes], |
| &buf0[(56 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes]); |
| } |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(60 + i) * kNumLanes], |
| &buf0[(63 - i) * kNumLanes], &buf0[(60 + i) * kNumLanes]); |
| } |
| // stage 8 |
| Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], |
| &buf0[8 * kNumLanes], &buf1[8 * kNumLanes], &buf1[15 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], |
| &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], |
| &buf0[10 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], |
| cos_bit, round); |
| AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], |
| &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); |
| AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], |
| &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]); |
| AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], |
| &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); |
| AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], |
| &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); |
| AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], |
| &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]); |
| AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], |
| &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); |
| AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], |
| &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); |
| AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], |
| &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); |
| Butterfly(int_tag, -cospi[4], cospi[60], &buf0[33 * kNumLanes], |
| &buf0[62 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[60], -cospi[4], &buf0[34 * kNumLanes], |
| &buf0[61 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[36], cospi[28], &buf0[37 * kNumLanes], |
| &buf0[58 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[28], -cospi[36], &buf0[38 * kNumLanes], |
| &buf0[57 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[20], cospi[44], &buf0[41 * kNumLanes], |
| &buf0[54 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[44], -cospi[20], &buf0[42 * kNumLanes], |
| &buf0[53 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[52], cospi[12], &buf0[45 * kNumLanes], |
| &buf0[50 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, -cospi[12], -cospi[52], &buf0[46 * kNumLanes], |
| &buf0[49 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], |
| cos_bit, round); |
| // stage 9 |
| Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], |
| &buf1[16 * kNumLanes], &buf0[16 * kNumLanes], &buf0[31 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], |
| &buf1[17 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], |
| &buf1[18 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], |
| &buf1[19 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], |
| &buf1[20 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], |
| &buf1[21 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], |
| &buf1[22 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], |
| &buf1[23 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], |
| cos_bit, round); |
| AddSub(int_tag, &buf0[32 * kNumLanes], &buf1[33 * kNumLanes], |
| &buf0[32 * kNumLanes], &buf0[33 * kNumLanes]); |
| AddSub(int_tag, &buf0[35 * kNumLanes], &buf1[34 * kNumLanes], |
| &buf0[35 * kNumLanes], &buf0[34 * kNumLanes]); |
| AddSub(int_tag, &buf0[36 * kNumLanes], &buf1[37 * kNumLanes], |
| &buf0[36 * kNumLanes], &buf0[37 * kNumLanes]); |
| AddSub(int_tag, &buf0[39 * kNumLanes], &buf1[38 * kNumLanes], |
| &buf0[39 * kNumLanes], &buf0[38 * kNumLanes]); |
| AddSub(int_tag, &buf0[40 * kNumLanes], &buf1[41 * kNumLanes], |
| &buf0[40 * kNumLanes], &buf0[41 * kNumLanes]); |
| AddSub(int_tag, &buf0[43 * kNumLanes], &buf1[42 * kNumLanes], |
| &buf0[43 * kNumLanes], &buf0[42 * kNumLanes]); |
| AddSub(int_tag, &buf0[44 * kNumLanes], &buf1[45 * kNumLanes], |
| &buf0[44 * kNumLanes], &buf0[45 * kNumLanes]); |
| AddSub(int_tag, &buf0[47 * kNumLanes], &buf1[46 * kNumLanes], |
| &buf0[47 * kNumLanes], &buf0[46 * kNumLanes]); |
| AddSub(int_tag, &buf0[48 * kNumLanes], &buf1[49 * kNumLanes], |
| &buf0[48 * kNumLanes], &buf0[49 * kNumLanes]); |
| AddSub(int_tag, &buf0[51 * kNumLanes], &buf1[50 * kNumLanes], |
| &buf0[51 * kNumLanes], &buf0[50 * kNumLanes]); |
| AddSub(int_tag, &buf0[52 * kNumLanes], &buf1[53 * kNumLanes], |
| &buf0[52 * kNumLanes], &buf0[53 * kNumLanes]); |
| AddSub(int_tag, &buf0[55 * kNumLanes], &buf1[54 * kNumLanes], |
| &buf0[55 * kNumLanes], &buf0[54 * kNumLanes]); |
| AddSub(int_tag, &buf0[56 * kNumLanes], &buf1[57 * kNumLanes], |
| &buf0[56 * kNumLanes], &buf0[57 * kNumLanes]); |
| AddSub(int_tag, &buf0[59 * kNumLanes], &buf1[58 * kNumLanes], |
| &buf0[59 * kNumLanes], &buf0[58 * kNumLanes]); |
| AddSub(int_tag, &buf0[60 * kNumLanes], &buf1[61 * kNumLanes], |
| &buf0[60 * kNumLanes], &buf0[61 * kNumLanes]); |
| AddSub(int_tag, &buf0[63 * kNumLanes], &buf1[62 * kNumLanes], |
| &buf0[63 * kNumLanes], &buf0[62 * kNumLanes]); |
| // stage 10 |
| Butterfly(int_tag, cospi[1], cospi[63], &buf0[63 * kNumLanes], |
| &buf0[32 * kNumLanes], &buf1[32 * kNumLanes], &buf1[63 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[33], cospi[31], &buf0[62 * kNumLanes], |
| &buf0[33 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[17], cospi[47], &buf0[61 * kNumLanes], |
| &buf0[34 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[49], cospi[15], &buf0[60 * kNumLanes], |
| &buf0[35 * kNumLanes], &buf1[35 * kNumLanes], &buf1[60 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[9], cospi[55], &buf0[59 * kNumLanes], |
| &buf0[36 * kNumLanes], &buf1[36 * kNumLanes], &buf1[59 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[41], cospi[23], &buf0[58 * kNumLanes], |
| &buf0[37 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[25], cospi[39], &buf0[57 * kNumLanes], |
| &buf0[38 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[57], cospi[7], &buf0[56 * kNumLanes], |
| &buf0[39 * kNumLanes], &buf1[39 * kNumLanes], &buf1[56 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[05], cospi[59], &buf0[55 * kNumLanes], |
| &buf0[40 * kNumLanes], &buf1[40 * kNumLanes], &buf1[55 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[37], cospi[27], &buf0[54 * kNumLanes], |
| &buf0[41 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[21], cospi[43], &buf0[53 * kNumLanes], |
| &buf0[42 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[53], cospi[11], &buf0[52 * kNumLanes], |
| &buf0[43 * kNumLanes], &buf1[43 * kNumLanes], &buf1[52 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[13], cospi[51], &buf0[51 * kNumLanes], |
| &buf0[44 * kNumLanes], &buf1[44 * kNumLanes], &buf1[51 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[45], cospi[19], &buf0[50 * kNumLanes], |
| &buf0[45 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[29], cospi[35], &buf0[49 * kNumLanes], |
| &buf0[46 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], |
| cos_bit, round); |
| Butterfly(int_tag, cospi[61], cospi[3], &buf0[48 * kNumLanes], |
| &buf0[47 * kNumLanes], &buf1[47 * kNumLanes], &buf1[48 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 11 |
| hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[0 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[63 * kNumLanes], &in[63 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[32 * kNumLanes], &in[1 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[31 * kNumLanes], &in[62 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[16 * kNumLanes], &in[2 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[47 * kNumLanes], &in[61 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[48 * kNumLanes], &in[3 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[60 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[4 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[55 * kNumLanes], &in[59 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[40 * kNumLanes], &in[5 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[23 * kNumLanes], &in[58 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[24 * kNumLanes], &in[6 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[39 * kNumLanes], &in[57 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[56 * kNumLanes], &in[7 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[56 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[4 * kNumLanes], &in[8 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[59 * kNumLanes], &in[55 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[36 * kNumLanes], &in[9 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[27 * kNumLanes], &in[54 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[20 * kNumLanes], &in[10 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[43 * kNumLanes], &in[53 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[52 * kNumLanes], &in[11 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[52 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[12 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[51 * kNumLanes], &in[51 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[44 * kNumLanes], &in[13 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[19 * kNumLanes], &in[50 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[28 * kNumLanes], &in[14 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[35 * kNumLanes], &in[49 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[60 * kNumLanes], &in[15 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[48 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[16 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[61 * kNumLanes], &in[47 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[34 * kNumLanes], &in[17 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[29 * kNumLanes], &in[46 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[18 * kNumLanes], &in[18 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[45 * kNumLanes], &in[45 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[50 * kNumLanes], &in[19 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[44 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[20 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[53 * kNumLanes], &in[43 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[42 * kNumLanes], &in[21 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[21 * kNumLanes], &in[42 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[26 * kNumLanes], &in[22 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[37 * kNumLanes], &in[41 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[58 * kNumLanes], &in[23 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[40 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[6 * kNumLanes], &in[24 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[57 * kNumLanes], &in[39 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[38 * kNumLanes], &in[25 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[25 * kNumLanes], &in[38 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[22 * kNumLanes], &in[26 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[41 * kNumLanes], &in[37 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[54 * kNumLanes], &in[27 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[36 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[28 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[49 * kNumLanes], &in[35 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[46 * kNumLanes], &in[29 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[17 * kNumLanes], &in[34 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf0[30 * kNumLanes], &in[30 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[33 * kNumLanes], &in[33 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[62 * kNumLanes], &in[31 * OutStride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[32 * OutStride]); |
| } |
| |
| // Primary 4-point forward ADST kernel. It is selected whenever none of the |
| // Fadst4Traits specializations match, and performs every multiply directly |
| // in the lane type of the caller's tag. |
| template <size_t LaneSize, size_t NumLanes> |
| struct Fadst4Traits { |
|   // Transforms, in place, the four rows located at in[0 * instride], |
|   // in[1 * instride], in[2 * instride] and in[3 * instride]. Every output |
|   // has 1 << (cos_bit - 1) added to it and is then shifted right by cos_bit. |
|   template <size_t Width, typename D> |
|   HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, |
|                                          hn::TFromD<D> *HWY_RESTRICT in, |
|                                          const int8_t cos_bit, |
|                                          const size_t instride) { |
|     const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); |
|     const auto k_sin1 = hn::Set(int_tag, sinpi[1]); |
|     const auto k_sin2 = hn::Set(int_tag, sinpi[2]); |
|     const auto k_sin3 = hn::Set(int_tag, sinpi[3]); |
|     const auto k_sin4 = hn::Set(int_tag, sinpi[4]); |
|     const auto rounding = hn::Set(int_tag, 1 << (cos_bit - 1)); |
| |
|     // All four rows are read before anything is written back. |
|     const auto row0 = hn::Load(int_tag, &in[0 * instride]); |
|     const auto row1 = hn::Load(int_tag, &in[1 * instride]); |
|     const auto row2 = hn::Load(int_tag, &in[2 * instride]); |
|     const auto row3 = hn::Load(int_tag, &in[3 * instride]); |
| |
|     // acc_a = row0 * sinpi1 + row1 * sinpi2 + row3 * sinpi4 |
|     const auto acc_a = |
|         hn::Add(hn::Add(hn::Mul(row0, k_sin1), hn::Mul(row1, k_sin2)), |
|                 hn::Mul(row3, k_sin4)); |
|     // acc_b = (row0 + row1 - row3) * sinpi3 |
|     const auto acc_b = hn::Mul(hn::Sub(hn::Add(row0, row1), row3), k_sin3); |
|     // acc_c = row0 * sinpi4 - row1 * sinpi1 + row3 * sinpi2 |
|     const auto acc_c = |
|         hn::Add(hn::Sub(hn::Mul(row0, k_sin4), hn::Mul(row1, k_sin1)), |
|                 hn::Mul(row3, k_sin2)); |
|     // acc_d = row2 * sinpi3 |
|     const auto acc_d = hn::Mul(row2, k_sin3); |
| |
|     // Combine the partial sums into the four unrounded outputs. |
|     const auto pre0 = hn::Add(acc_a, acc_d); |
|     const auto pre1 = acc_b; |
|     const auto pre2 = hn::Sub(acc_c, acc_d); |
|     const auto pre3 = hn::Add(hn::Sub(acc_c, acc_a), acc_d); |
| |
|     // Round, shift and write each row back over its input. |
|     hn::Store(hn::ShiftRightSame(hn::Add(pre0, rounding), cos_bit), int_tag, |
|               &in[0 * instride]); |
|     hn::Store(hn::ShiftRightSame(hn::Add(pre1, rounding), cos_bit), int_tag, |
|               &in[1 * instride]); |
|     hn::Store(hn::ShiftRightSame(hn::Add(pre2, rounding), cos_bit), int_tag, |
|               &in[2 * instride]); |
|     hn::Store(hn::ShiftRightSame(hn::Add(pre3, rounding), cos_bit), int_tag, |
|               &in[3 * instride]); |
|   } |
| }; |
| |
| // 4-point forward ADST for 2-byte lanes when the caller's tag has 4 lanes. |
| // The work is done on fixed 8-lane vectors of the same lane type, with the |
| // products accumulated in the corresponding wide (repartitioned) lanes. |
| template <> |
| struct Fadst4Traits<2, 4> { |
|   // Transforms, in place, the rows at in[0 * instride] .. in[3 * instride]. |
|   // int_tag is only used to deduce the lane type. |
|   template <size_t Width, typename D> |
|   HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, |
|                                          hn::TFromD<D> *HWY_RESTRICT in, |
|                                          const int8_t cos_bit, |
|                                          const size_t instride) { |
|     (void)int_tag; |
|     constexpr hn::FixedTag<hn::TFromD<D>, 8> narrow_tag; |
|     constexpr hn::RepartitionToWide<decltype(narrow_tag)> wide_tag; |
|     const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); |
| |
|     const auto zero = hn::Zero(narrow_tag); |
|     const auto rounding = hn::Set(wide_tag, 1 << (cos_bit - 1)); |
|     // Coefficient pairs matching the interleaved operands below. |
|     // NOTE(review): SetPair is defined elsewhere; presumably it alternates |
|     // its two arguments across lanes - confirm against its definition. |
|     const auto k_p01_p02 = SetPair(narrow_tag, sinpi[1], sinpi[2]); |
|     const auto k_p03_p04 = SetPair(narrow_tag, sinpi[3], sinpi[4]); |
|     const auto k_p04_m01 = SetPair(narrow_tag, sinpi[4], -sinpi[1]); |
|     const auto k_m03_p02 = SetPair(narrow_tag, -sinpi[3], sinpi[2]); |
|     const auto k_p03_p03 = hn::Set(narrow_tag, sinpi[3]); |
| |
|     // Each load fetches eight lanes; InterleaveLower then draws only on the |
|     // lower half of every vector. |
|     const auto row0 = hn::Load(narrow_tag, &in[0 * instride]); |
|     const auto row1 = hn::Load(narrow_tag, &in[1 * instride]); |
|     const auto row2 = hn::Load(narrow_tag, &in[2 * instride]); |
|     const auto row3 = hn::Load(narrow_tag, &in[3 * instride]); |
|     const auto row01_sum = hn::Add(row0, row1); |
| |
|     const auto pair01 = hn::InterleaveLower(row0, row1); |
|     const auto pair23 = hn::InterleaveLower(row2, row3); |
|     const auto pair_sum = hn::InterleaveLower(row01_sum, zero); |
|     const auto pair2 = hn::InterleaveLower(row2, zero); |
|     const auto pair3 = hn::InterleaveLower(row3, zero); |
| |
|     // row0 * sinpi1 + row1 * sinpi2 |
|     const auto prod_a = hn::WidenMulPairwiseAdd(wide_tag, pair01, k_p01_p02); |
|     // row2 * sinpi3 + row3 * sinpi4 |
|     const auto prod_b = hn::WidenMulPairwiseAdd(wide_tag, pair23, k_p03_p04); |
|     // (row0 + row1) * sinpi3 |
|     const auto prod_c = hn::WidenMulPairwiseAdd(wide_tag, pair_sum, k_p03_p03); |
|     // row0 * sinpi4 - row1 * sinpi1 |
|     const auto prod_d = hn::WidenMulPairwiseAdd(wide_tag, pair01, k_p04_m01); |
|     // -row2 * sinpi3 + row3 * sinpi2 |
|     const auto prod_e = hn::WidenMulPairwiseAdd(wide_tag, pair23, k_m03_p02); |
|     // row2 * sinpi3 |
|     const auto prod_f = hn::WidenMulPairwiseAdd(wide_tag, pair2, k_p03_p03); |
|     // row3 * sinpi3 |
|     const auto prod_g = hn::WidenMulPairwiseAdd(wide_tag, pair3, k_p03_p03); |
| |
|     const auto pre0 = hn::Add(prod_a, prod_b); |
|     const auto pre1 = hn::Sub(prod_c, prod_g); |
|     const auto pre2 = hn::Add(prod_d, prod_e); |
|     // pre3 = (pre2 - pre0) + 3 * prod_f, with the factor 3 formed as 4x - x. |
|     const auto delta = hn::Sub(pre2, pre0); |
|     const auto triple_f = hn::Sub(hn::ShiftLeft<2>(prod_f), prod_f); |
|     const auto pre3 = hn::Add(delta, triple_f); |
| |
|     const auto res0 = hn::ShiftRightSame(hn::Add(pre0, rounding), cos_bit); |
|     const auto res1 = hn::ShiftRightSame(hn::Add(pre1, rounding), cos_bit); |
|     const auto res2 = hn::ShiftRightSame(hn::Add(pre2, rounding), cos_bit); |
|     const auto res3 = hn::ShiftRightSame(hn::Add(pre3, rounding), cos_bit); |
| |
|     // Rows 0/2 and rows 1/3 are narrowed together into one vector each. |
|     const auto packed02 = hn::ReorderDemote2To(narrow_tag, res0, res2); |
|     const auto packed13 = hn::ReorderDemote2To(narrow_tag, res1, res3); |
| |
|     // Every store writes all eight lanes, so stores overlap whenever |
|     // instride is below eight; the rows must be written in this order. |
|     hn::Store(packed02, narrow_tag, &in[0 * instride]); |
|     hn::Store(packed13, narrow_tag, &in[1 * instride]); |
|     hn::Store(hn::ShiftRightLanes<4>(narrow_tag, packed02), narrow_tag, |
|               &in[2 * instride]); |
|     hn::Store(hn::ShiftRightLanes<4>(narrow_tag, packed13), narrow_tag, |
|               &in[3 * instride]); |
|   } |
| }; |
| |
| // 4-point forward ADST for 2-byte lanes at any lane count other than the |
| // dedicated <2, 4> case. Rows are split into lower and upper interleaves, |
| // multiplied into wide lanes, then narrowed back to the caller's lane type. |
| template <size_t NumLanes> |
| struct Fadst4Traits<2, NumLanes> { |
|   // Transforms, in place, the rows at in[0 * instride] .. in[3 * instride]. |
|   template <size_t Width, typename D> |
|   HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, |
|                                          hn::TFromD<D> *HWY_RESTRICT in, |
|                                          const int8_t cos_bit, |
|                                          const size_t instride) { |
|     constexpr hn::RepartitionToWide<D> wide_tag; |
|     const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); |
| |
|     const auto zero = hn::Zero(int_tag); |
|     const auto rounding = hn::Set(wide_tag, 1 << (cos_bit - 1)); |
|     // Coefficient pairs matching the interleaved operands below. |
|     // NOTE(review): SetPair is defined elsewhere; presumably it alternates |
|     // its two arguments across lanes - confirm against its definition. |
|     const auto k_p01_p02 = SetPair(int_tag, sinpi[1], sinpi[2]); |
|     const auto k_p03_p04 = SetPair(int_tag, sinpi[3], sinpi[4]); |
|     const auto k_p04_m01 = SetPair(int_tag, sinpi[4], -sinpi[1]); |
|     const auto k_m03_p02 = SetPair(int_tag, -sinpi[3], sinpi[2]); |
|     const auto k_p03_p03 = hn::Set(int_tag, sinpi[3]); |
| |
|     // All four rows are read before anything is written back. |
|     const auto row0 = hn::Load(int_tag, &in[0 * instride]); |
|     const auto row1 = hn::Load(int_tag, &in[1 * instride]); |
|     const auto row2 = hn::Load(int_tag, &in[2 * instride]); |
|     const auto row3 = hn::Load(int_tag, &in[3 * instride]); |
|     const auto row01_sum = hn::Add(row0, row1); |
| |
|     // Interleaved operand pairs, lower and upper parts. |
|     const auto lo01 = hn::InterleaveLower(int_tag, row0, row1); |
|     const auto hi01 = hn::InterleaveUpper(int_tag, row0, row1); |
|     const auto lo23 = hn::InterleaveLower(int_tag, row2, row3); |
|     const auto hi23 = hn::InterleaveUpper(int_tag, row2, row3); |
|     const auto lo_sum = hn::InterleaveLower(int_tag, row01_sum, zero); |
|     const auto hi_sum = hn::InterleaveUpper(int_tag, row01_sum, zero); |
|     const auto lo2 = hn::InterleaveLower(int_tag, row2, zero); |
|     const auto hi2 = hn::InterleaveUpper(int_tag, row2, zero); |
|     const auto lo3 = hn::InterleaveLower(int_tag, row3, zero); |
|     const auto hi3 = hn::InterleaveUpper(int_tag, row3, zero); |
| |
|     // ---- lower part ---- |
|     // (row0 * sinpi1 + row1 * sinpi2) + (row2 * sinpi3 + row3 * sinpi4) |
|     const auto pre0_lo = |
|         hn::Add(hn::WidenMulPairwiseAdd(wide_tag, lo01, k_p01_p02), |
|                 hn::WidenMulPairwiseAdd(wide_tag, lo23, k_p03_p04)); |
|     // (row0 + row1) * sinpi3 - row3 * sinpi3 |
|     const auto pre1_lo = |
|         hn::Sub(hn::WidenMulPairwiseAdd(wide_tag, lo_sum, k_p03_p03), |
|                 hn::WidenMulPairwiseAdd(wide_tag, lo3, k_p03_p03)); |
|     // (row0 * sinpi4 - row1 * sinpi1) + (-row2 * sinpi3 + row3 * sinpi2) |
|     const auto pre2_lo = |
|         hn::Add(hn::WidenMulPairwiseAdd(wide_tag, lo01, k_p04_m01), |
|                 hn::WidenMulPairwiseAdd(wide_tag, lo23, k_m03_p02)); |
|     // (pre2 - pre0) + 3 * (row2 * sinpi3), the factor 3 formed as 4x - x. |
|     const auto s4_lo = hn::WidenMulPairwiseAdd(wide_tag, lo2, k_p03_p03); |
|     const auto pre3_lo = hn::Add(hn::Sub(pre2_lo, pre0_lo), |
|                                  hn::Sub(hn::ShiftLeft<2>(s4_lo), s4_lo)); |
| |
|     // ---- upper part (same arithmetic on the upper interleaves) ---- |
|     const auto pre0_hi = |
|         hn::Add(hn::WidenMulPairwiseAdd(wide_tag, hi01, k_p01_p02), |
|                 hn::WidenMulPairwiseAdd(wide_tag, hi23, k_p03_p04)); |
|     const auto pre1_hi = |
|         hn::Sub(hn::WidenMulPairwiseAdd(wide_tag, hi_sum, k_p03_p03), |
|                 hn::WidenMulPairwiseAdd(wide_tag, hi3, k_p03_p03)); |
|     const auto pre2_hi = |
|         hn::Add(hn::WidenMulPairwiseAdd(wide_tag, hi01, k_p04_m01), |
|                 hn::WidenMulPairwiseAdd(wide_tag, hi23, k_m03_p02)); |
|     const auto s4_hi = hn::WidenMulPairwiseAdd(wide_tag, hi2, k_p03_p03); |
|     const auto pre3_hi = hn::Add(hn::Sub(pre2_hi, pre0_hi), |
|                                  hn::Sub(hn::ShiftLeft<2>(s4_hi), s4_hi)); |
| |
|     // Round and shift every partial result. |
|     const auto res0_lo = hn::ShiftRightSame(hn::Add(pre0_lo, rounding), cos_bit); |
|     const auto res0_hi = hn::ShiftRightSame(hn::Add(pre0_hi, rounding), cos_bit); |
|     const auto res1_lo = hn::ShiftRightSame(hn::Add(pre1_lo, rounding), cos_bit); |
|     const auto res1_hi = hn::ShiftRightSame(hn::Add(pre1_hi, rounding), cos_bit); |
|     const auto res2_lo = hn::ShiftRightSame(hn::Add(pre2_lo, rounding), cos_bit); |
|     const auto res2_hi = hn::ShiftRightSame(hn::Add(pre2_hi, rounding), cos_bit); |
|     const auto res3_lo = hn::ShiftRightSame(hn::Add(pre3_lo, rounding), cos_bit); |
|     const auto res3_hi = hn::ShiftRightSame(hn::Add(pre3_hi, rounding), cos_bit); |
| |
|     // Narrow each lower/upper pair back to one row and overwrite the input. |
|     hn::Store(hn::ReorderDemote2To(int_tag, res0_lo, res0_hi), int_tag, |
|               &in[0 * instride]); |
|     hn::Store(hn::ReorderDemote2To(int_tag, res1_lo, res1_hi), int_tag, |
|               &in[1 * instride]); |
|     hn::Store(hn::ReorderDemote2To(int_tag, res2_lo, res2_hi), int_tag, |
|               &in[2 * instride]); |
|     hn::Store(hn::ReorderDemote2To(int_tag, res3_lo, res3_hi), int_tag, |
|               &in[3 * instride]); |
|   } |
| }; |
| |
| // Entry point for the 4-point forward ADST. Picks the Fadst4Traits |
| // implementation keyed on the tag's lane size in bytes and its maximum lane |
| // count, then forwards all arguments to it unchanged. |
| template <size_t Width, typename D> |
| HWY_ATTR HWY_INLINE void Fadst4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
|                                 const int8_t cos_bit, const size_t instride) { |
|   using Lane = hn::TFromD<D>; |
|   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
|   using Impl = Fadst4Traits<sizeof(Lane), kNumLanes>; |
|   Impl::template Fadst4<Width>(int_tag, in, cos_bit, instride); |
| } |
| |
| template <size_t Width, typename D> |
| HWY_ATTR HWY_INLINE void Fadst8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, const size_t instride) { |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); |
| HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes]; |
| HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes]; |
| const int32_t *HWY_RESTRICT cospi = cospi_arr(cos_bit); |
| const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); |
| |
| // stage 0 |
| // stage 1 |
| hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag, |
| &buf0[0 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag, |
| &buf0[1 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag, |
| &buf0[2 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag, |
| &buf0[3 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag, |
| &buf0[4 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag, |
| &buf0[5 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag, |
| &buf0[6 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag, |
| &buf0[7 * kNumLanes]); |
| |
| // stage 2 |
| hwy::CopyBytes<2 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes], |
| &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], |
| cos_bit, round); |
| hwy::CopyBytes<2 * kNumBytes>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 3 |
| for (size_t j = 0; j < 8; j += 4) { |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], |
| &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], |
| &buf0[(2 + i + j) * kNumLanes]); |
| } |
| } |
| |
| // stage 4 |
| hwy::CopyBytes<4 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); |
| HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); |
| |
| // stage 5 |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(4 + i) * kNumLanes], |
| &buf0[(0 + i) * kNumLanes], &buf0[(4 + i) * kNumLanes]); |
| } |
| |
| // stage 6 |
| HalfButterfly(int_tag, cospi[4], cospi[60], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[60], -cospi[4], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[20], cospi[44], &buf0[2 * kNumLanes], |
| &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[44], -cospi[20], &buf0[2 * kNumLanes], |
| &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[36], cospi[28], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[28], -cospi[36], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[52], cospi[12], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[12], -cospi[52], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); |
| |
| // stage 7 |
| hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[1 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[3 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[5 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[7 * instride]); |
| } |
| |
| template <size_t Width, typename D> |
| HWY_ATTR HWY_INLINE void Fadst16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, |
| const int8_t cos_bit, const size_t instride) { |
| constexpr size_t kNumLanes = hn::MaxLanes(int_tag); |
| constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); |
| HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes]; |
| HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes]; |
| const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); |
| const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); |
| |
| // stage 0 |
| // stage 1 |
| hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag, |
| &buf0[0 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[15 * instride])), int_tag, |
| &buf0[1 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag, |
| &buf0[2 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[8 * instride]), int_tag, |
| &buf0[3 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag, |
| &buf0[4 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[12 * instride]), int_tag, |
| &buf0[5 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag, |
| &buf0[6 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[11 * instride])), int_tag, |
| &buf0[7 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag, |
| &buf0[8 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[14 * instride]), int_tag, |
| &buf0[9 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag, |
| &buf0[10 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[9 * instride])), int_tag, |
| &buf0[11 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag, |
| &buf0[12 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[13 * instride])), int_tag, |
| &buf0[13 * kNumLanes]); |
| hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag, |
| &buf0[14 * kNumLanes]); |
| hn::Store(hn::Load(int_tag, &in[10 * instride]), int_tag, |
| &buf0[15 * kNumLanes]); |
| |
| // stage 2 |
| hwy::CopyBytes<kNumBytes * 2>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes], |
| &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], |
| cos_bit, round); |
| hwy::CopyBytes<kNumBytes * 2>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes], |
| cos_bit, round); |
| hwy::CopyBytes<kNumBytes * 2>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf1[11 * kNumLanes], |
| cos_bit, round); |
| hwy::CopyBytes<kNumBytes * 2>(&buf0[12 * kNumLanes], &buf1[12 * kNumLanes]); |
| Butterfly(int_tag, cospi[32], cospi[32], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf1[15 * kNumLanes], |
| cos_bit, round); |
| |
| // stage 3 |
| for (size_t j = 0; j < 16; j += 4) { |
| for (size_t i = 0; i < 2; ++i) { |
| AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], |
| &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], |
| &buf0[(2 + i + j) * kNumLanes]); |
| } |
| } |
| |
| // stage 4 |
| hwy::CopyBytes<kNumBytes * 4>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); |
| HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); |
| hwy::CopyBytes<kNumBytes * 4>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]); |
| HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[12 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[12 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); |
| |
| // stage 5 |
| for (size_t j = 0; j < 16; j += 8) { |
| for (size_t i = 0; i < 4; ++i) { |
| AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], |
| &buf1[(4 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], |
| &buf0[(4 + i + j) * kNumLanes]); |
| } |
| } |
| |
| // stage 6 |
| hwy::CopyBytes<kNumBytes * 8>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); |
| HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[8 * kNumLanes], |
| &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[56], -cospi[8], &buf0[8 * kNumLanes], |
| &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[24], -cospi[40], &buf0[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, -cospi[56], cospi[8], &buf0[12 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[12 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, -cospi[24], cospi[40], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); |
| |
| // stage 7 |
| for (size_t i = 0; i < 8; ++i) { |
| AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(8 + i) * kNumLanes], |
| &buf0[(0 + i) * kNumLanes], &buf0[(8 + i) * kNumLanes]); |
| } |
| |
| // stage 8 |
| HalfButterfly(int_tag, cospi[2], cospi[62], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[62], -cospi[2], &buf0[0 * kNumLanes], |
| &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[10], cospi[54], &buf0[2 * kNumLanes], |
| &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[54], -cospi[10], &buf0[2 * kNumLanes], |
| &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[18], cospi[46], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[46], -cospi[18], &buf0[4 * kNumLanes], |
| &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[26], cospi[38], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[38], -cospi[26], &buf0[6 * kNumLanes], |
| &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[34], cospi[30], &buf0[8 * kNumLanes], |
| &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[30], -cospi[34], &buf0[8 * kNumLanes], |
| &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[42], cospi[22], &buf0[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[22], -cospi[42], &buf0[10 * kNumLanes], |
| &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[50], cospi[14], &buf0[12 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[14], -cospi[50], &buf0[12 * kNumLanes], |
| &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[58], cospi[6], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); |
| HalfButterfly(int_tag, cospi[6], -cospi[58], &buf0[14 * kNumLanes], |
| &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); |
| |
| // stage 9 |
| hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[1 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[3 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[5 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[7 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[8 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[9 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[10 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[11 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[12 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[13 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[14 * instride]); |
| hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[15 * instride]); |
| } |
| |
| template <size_t Width, typename D> |
| HWY_ATTR HWY_INLINE void IdtxAdd2(D tag, hn::TFromD<D> *HWY_RESTRICT in) { |
| for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) { |
| auto v = hn::Load(tag, &in[x]); |
| hn::Store(hn::Add(v, v), tag, &in[x]); |
| } |
| } |
| |
| template <size_t Width, int Shift, typename D> |
| HWY_ATTR HWY_INLINE void IdtxShift(D tag, hn::TFromD<D> *HWY_RESTRICT in) { |
| for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) { |
| hn::Store(hn::ShiftLeft<Shift>(hn::Load(tag, &in[x])), tag, &in[x]); |
| } |
| } |
| |
| template <int Scale, typename D> |
| HWY_ATTR HWY_INLINE void PromoteScale2x16ByNewSqrt2( |
| D tag, hn::VFromD<D> v, hn::VFromD<hn::RepartitionToWide<D>> &out0, |
| hn::VFromD<hn::RepartitionToWide<D>> &out1) { |
| constexpr hn::RepartitionToWide<D> int32_tag; |
| auto one = hn::Set(tag, 1); |
| auto scale_rounding = SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1)); |
| auto a0 = hn::InterleaveLower(tag, v, one); |
| auto a1 = hn::InterleaveUpper(tag, v, one); |
| out0 = hn::ShiftRight<NewSqrt2Bits>( |
| hn::WidenMulPairwiseAdd(int32_tag, a0, scale_rounding)); |
| out1 = hn::ShiftRight<NewSqrt2Bits>( |
| hn::WidenMulPairwiseAdd(int32_tag, a1, scale_rounding)); |
| } |
| |
| template <size_t LaneSize, size_t NumLanes> |
| struct ScaleByNewSqrt2Traits { |
| template <int Scale, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag, |
| hn::VFromD<D> v) { |
| auto fact = hn::Set(tag, Scale * NewSqrt2); |
| auto offset = hn::Set(tag, 1 << (NewSqrt2Bits - 1)); |
| return hn::ShiftRight<NewSqrt2Bits>(hn::MulAdd(v, fact, offset)); |
| } |
| }; |
| |
| template <> |
| struct ScaleByNewSqrt2Traits<2, 4> { |
| template <int Scale, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag, |
| hn::VFromD<D> v) { |
| auto one = hn::Set(tag, 1); |
| auto scale_rounding = |
| SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1)); |
| constexpr hn::Rebind<int32_t, D> int32_tag; |
| auto a = hn::InterleaveLower(tag, v, one); |
| auto b = hn::ShiftRight<NewSqrt2Bits>( |
| hn::WidenMulPairwiseAdd(int32_tag, a, scale_rounding)); |
| return hn::DemoteTo(tag, b); |
| } |
| }; |
| |
| template <size_t NumLanes> |
| struct ScaleByNewSqrt2Traits<2, NumLanes> { |
| template <int Scale, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag, |
| hn::VFromD<D> v) { |
| hn::VFromD<hn::RepartitionToWide<D>> b0, b1; |
| PromoteScale2x16ByNewSqrt2<Scale>(tag, v, b0, b1); |
| return hn::ReorderDemote2To(tag, b0, b1); |
| } |
| }; |
| |
| template <int Scale, typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> ScaleByNewSqrt2(D tag, hn::VFromD<D> v) { |
| return ScaleByNewSqrt2Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(tag)>:: |
| template ScaleByNewSqrt2<Scale>(tag, v); |
| } |
| |
| template <size_t Width, int Scale, typename D> |
| HWY_ATTR HWY_INLINE void IdtxSqrt2(D tag, hn::TFromD<D> *HWY_RESTRICT in) { |
| for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) { |
| hn::Store(ScaleByNewSqrt2<Scale>(tag, hn::Load(tag, &in[x])), tag, &in[x]); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR void FdctNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Width>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fdct4(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR void FdctNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Stride>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fdct8(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR void FdctNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Stride>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fdct16(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR void FdctNx32Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Stride>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fdct32(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t InWidth, size_t InStride, size_t OutWidth, size_t OutStride, |
| typename T> |
| HWY_ATTR void FdctNx64Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, InWidth>(); |
| for (size_t i = 0; i < OutWidth; i += hn::MaxLanes(int_tag)) { |
| Fdct64<InStride, OutStride>(int_tag, &in[i], cos_bit); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR HWY_INLINE void FadstNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Width>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fadst4<Width>(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR void FadstNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Stride>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fadst8<Width>(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, typename T> |
| HWY_ATTR void FadstNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| constexpr auto int_tag = hn::CappedTag<T, Stride>(); |
| for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { |
| Fadst16<Width>(int_tag, &in[i], cos_bit, Stride); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, size_t BlockHeight, typename T> |
| HWY_ATTR void IdtxAdd2Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| (void)cos_bit; |
| constexpr auto int_tag = hn::CappedTag<T, Width>(); |
| for (size_t y = 0; y < BlockHeight; ++y) { |
| IdtxAdd2<Width>(int_tag, &in[y * Stride]); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, size_t BlockHeight, int Scale, |
| typename T> |
| HWY_ATTR void IdtxSqrt2Block(T *HWY_RESTRICT in, int8_t cos_bit) { |
| (void)cos_bit; |
| constexpr auto int_tag = hn::CappedTag<T, Width>(); |
| for (size_t y = 0; y < BlockHeight; ++y) { |
| IdtxSqrt2<Width, Scale>(int_tag, &in[y * Stride]); |
| } |
| } |
| |
| template <size_t Width, size_t Stride, size_t BlockHeight, int Shift, |
| typename T> |
| HWY_ATTR void IdtxShiftBlock(T *HWY_RESTRICT in, int8_t cos_bit) { |
| (void)cos_bit; |
| constexpr auto int_tag = hn::CappedTag<T, Width>(); |
| for (size_t y = 0; y < BlockHeight; ++y) { |
| IdtxShift<Width, Shift>(int_tag, &in[y * Stride]); |
| } |
| } |
| |
// Placeholder 1-D transform for combinations that must never be requested.
// Both arguments are ignored; the function only trips an assertion (which is
// compiled out when NDEBUG is defined).
template <typename T>
void TransformFail(T *in, int8_t cos_bit) {
  static_cast<void>(in);
  static_cast<void>(cos_bit);
  assert(false && "Incorrect transform requested.");
}
| |
// Pointer to a 1-D transform that works in place on a buffer of `T`:
//   in:      buffer to transform.
//   cos_bit: cosine precision argument forwarded to the transform.
template <typename T>
using Transform1D = void (*)(T *in, int8_t cos_bit);
| |
// Selector for the rounding-shift implementation, keyed on whether the shift
// amount is non-negative. The primary template is intentionally empty; the
// Shift() members are provided by the <true> and <false> specializations.
template <bool PositiveOrZero>
struct RoundShiftTraits {
};
| |
| template <> |
| struct RoundShiftTraits<true> { |
| template <int Bit, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag, |
| hn::VFromD<D> value) { |
| (void)int_tag; |
| if CONSTEXPR_IF (Bit == 0) { |
| return value; |
| } else { |
| return hn::ShiftLeft<Bit>(value); |
| } |
| } |
| }; |
| |
| template <> |
| struct RoundShiftTraits<false> { |
| template <int Bit, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag, |
| hn::VFromD<D> value) { |
| const auto round = hn::Set(int_tag, 1 << (-Bit - 1)); |
| return hn::ShiftRight<-Bit>(hn::Add(value, round)); |
| } |
| }; |
| |
| template <int Bit, typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> RoundShift(D int_tag, hn::VFromD<D> value) { |
| return RoundShiftTraits<(Bit >= 0)>::template Shift<Bit>(int_tag, value); |
| } |
| |
| template <bool ApplyRectScale, typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> RectScale(D int_tag, hn::VFromD<D> v) { |
| if CONSTEXPR_IF (ApplyRectScale) { |
| return ScaleByNewSqrt2<1>(int_tag, v); |
| } |
| return v; |
| } |
| |
// Selector for the optional lane promotion, keyed on whether the input and
// output lane types are the same. The primary template is intentionally
// empty; the members are provided by the <true> and <false> specializations.
template <bool IsSame>
struct MaybePromoteTraits {
};
| |
| template <> |
| struct MaybePromoteTraits<true> { |
| template <typename VIn, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) { |
| (void)out_tag; |
| return in; |
| } |
| |
| template <typename VIn, typename D> |
| HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, |
| hn::TFromD<D> *out) { |
| hn::StoreU(v, int_tag, out); |
| } |
| }; |
| |
| template <> |
| struct MaybePromoteTraits<false> { |
| template <typename VIn, typename D> |
| HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) { |
| return hn::PromoteTo(out_tag, in); |
| } |
| |
| template <typename VIn, typename TOut, typename D> |
| HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, TOut *out) { |
| (void)int_tag; |
| constexpr hn::Repartition<TOut, D> store_tag; |
| hn::StoreU(hn::PromoteLowerTo(store_tag, v), store_tag, out); |
| hn::StoreU(hn::PromoteUpperTo(store_tag, v), store_tag, |
| out + hn::MaxLanes(store_tag)); |
| } |
| }; |
| |
| template <typename VIn, typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> MaybePromoteTo(D out_tag, VIn in) { |
| return MaybePromoteTraits< |
| std::is_same<hn::TFromD<D>, hn::TFromV<VIn>>::value>::PromoteTo(out_tag, |
| in); |
| } |
| |
| template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE void Transpose4(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, size_t instride, |
| size_t outstride) { |
| constexpr hn::FixedTag<TIn, 4> int_tag; |
| auto i0 = RectScale<ApplyRectScale>( |
| int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[0 * instride]))); |
| auto i1 = RectScale<ApplyRectScale>( |
| int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[1 * instride]))); |
| auto i2 = RectScale<ApplyRectScale>( |
| int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[2 * instride]))); |
| auto i3 = RectScale<ApplyRectScale>( |
| int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[3 * instride]))); |
| HWY_ALIGN_MAX TOut interleaved[16]; |
| constexpr hn::FixedTag<TOut, 4> out_tag; |
| hn::StoreInterleaved4(MaybePromoteTo(out_tag, i0), |
| MaybePromoteTo(out_tag, i1), |
| MaybePromoteTo(out_tag, i2), |
| MaybePromoteTo(out_tag, i3), out_tag, interleaved); |
| for (size_t i = 0; i < 4; ++i) { |
| hwy::CopyBytes<hn::MaxLanes(int_tag) * sizeof(*out)>(&interleaved[i * 4], |
| &out[i * outstride]); |
| } |
| } |
| |
| template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE void Transpose8(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, size_t instride, |
| size_t outstride) { |
| constexpr hn::FixedTag<TIn, 8> int_tag; |
| constexpr hn::Rebind<TOut, decltype(int_tag)> out_tag; |
| // N.B. there isn't a StoreInterleaved8, so hand-code Transpose8. |
| constexpr hn::RepartitionToWide<decltype(out_tag)> wide_int_tag; |
| HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved0[16]; |
| HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved1[16]; |
| auto i0 = hn::Load(int_tag, &in[0 * instride]); |
| auto i1 = hn::Load(int_tag, &in[1 * instride]); |
| auto i2 = hn::Load(int_tag, &in[2 * instride]); |
| auto i3 = hn::Load(int_tag, &in[3 * instride]); |
| auto i4 = hn::Load(int_tag, &in[4 * instride]); |
| auto i5 = hn::Load(int_tag, &in[5 * instride]); |
| auto i6 = hn::Load(int_tag, &in[6 * instride]); |
| auto i7 = hn::Load(int_tag, &in[7 * instride]); |
| auto s0 = hn::Undefined(out_tag); |
| auto s1 = hn::Undefined(out_tag); |
| auto s2 = hn::Undefined(out_tag); |
| auto s3 = hn::Undefined(out_tag); |
| auto s4 = hn::Undefined(out_tag); |
| auto s5 = hn::Undefined(out_tag); |
| auto s6 = hn::Undefined(out_tag); |
| auto s7 = hn::Undefined(out_tag); |
| auto ip0 = MaybePromoteTo(out_tag, i0); |
| auto ip1 = MaybePromoteTo(out_tag, i1); |
| auto ip2 = MaybePromoteTo(out_tag, i2); |
| auto ip3 = MaybePromoteTo(out_tag, i3); |
| auto ip4 = MaybePromoteTo(out_tag, i4); |
| auto ip5 = MaybePromoteTo(out_tag, i5); |
| auto ip6 = MaybePromoteTo(out_tag, i6); |
| auto ip7 = MaybePromoteTo(out_tag, i7); |
| s0 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip0)); |
| s1 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip1)); |
| s2 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip2)); |
| s3 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip3)); |
| s4 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip4)); |
| s5 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip5)); |
| s6 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip6)); |
| s7 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip7)); |
| auto u0 = hn::ZipLower(wide_int_tag, s0, s1); |
| auto u1 = hn::ZipUpper(wide_int_tag, s0, s1); |
| auto u2 = hn::ZipLower(wide_int_tag, s2, s3); |
| auto u3 = hn::ZipUpper(wide_int_tag, s2, s3); |
| auto u4 = hn::ZipLower(wide_int_tag, s4, s5); |
| auto u5 = hn::ZipUpper(wide_int_tag, s4, s5); |
| auto u6 = hn::ZipLower(wide_int_tag, s6, s7); |
| auto u7 = hn::ZipUpper(wide_int_tag, s6, s7); |
| hn::StoreInterleaved4(u0, u2, u4, u6, wide_int_tag, interleaved0); |
| hn::StoreInterleaved4(u1, u3, u5, u7, wide_int_tag, interleaved1); |
| constexpr size_t kNumBytes = hn::MaxLanes(int_tag) * sizeof(*out); |
| if CONSTEXPR_IF (sizeof(TOut) == 2) { |
| hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[2 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[3 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[4 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[5 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]); |
| } else { |
| hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[2 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[3 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[4 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[5 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]); |
| hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]); |
| } |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveEvenBlocks(D tag, |
| hn::VFromD<D> a, |
| hn::VFromD<D> b) { |
| static_assert(sizeof(hn::TFromD<D>) == 8, |
| "LocalInterleaveEvenBlocks requires 64-bit lanes."); |
| HWY_ALIGN static constexpr int64_t kIndices[] = { 0, 1, 8 + 0, 8 + 1, |
| 4, 5, 8 + 4, 8 + 5 }; |
| auto indices = hn::SetTableIndices(tag, kIndices); |
| return hn::TwoTablesLookupLanes(tag, a, b, indices); |
| } |
| |
| template <typename D> |
| HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveOddBlocks(D tag, |
| hn::VFromD<D> a, |
| hn::VFromD<D> b) { |
| static_assert(sizeof(hn::TFromD<D>) == 8, |
| "LocalInterleaveOddBlocks requires 64-bit lanes."); |
| HWY_ALIGN static constexpr int64_t kIndices[] = { 2, 3, 8 + 2, 8 + 3, |
| 6, 7, 8 + 6, 8 + 7 }; |
| auto indices = hn::SetTableIndices(tag, kIndices); |
| return hn::TwoTablesLookupLanes(tag, a, b, indices); |
| } |
| |
// Selector for the 16x16 transpose implementation, keyed on the lane size in
// bytes. The primary template is intentionally empty; Transpose16() is
// provided by the specializations for 2-byte and 4-byte lanes.
template <size_t LaneSize>
struct Transpose16Traits {
};
| |
| template <> |
| struct Transpose16Traits<2> { |
| template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, |
| size_t instride, |
| size_t outstride) { |
| constexpr hn::FixedTag<TIn, 16> int_tag; |
| static_assert(hn::MaxLanes(int_tag) == 16, |
| "16-bit Transpose16 requires an 16-lane int_tag"); |
| constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag; |
| constexpr hn::RepartitionToWide<decltype(wide_int_tag)> widex2_int_tag; |
| HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> |
| y[16 * hn::MaxLanes(wide_int_tag)]; |
| HWY_ALIGN_MAX hn::TFromD<decltype(widex2_int_tag)> |
| z[16 * hn::MaxLanes(widex2_int_tag)]; |
| for (size_t i = 0; i < 16; i += 2) { |
| auto i0 = RectScale<ApplyRectScale>( |
| int_tag, |
| RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride]))); |
| auto i1 = RectScale<ApplyRectScale>( |
| int_tag, |
| RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride]))); |
| hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag, |
| &y[(i + 0) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag, |
| &y[(i + 1) * hn::MaxLanes(wide_int_tag)]); |
| } |
| for (size_t i = 0; i < 16; i += 4) { |
| for (size_t j = 0; j < 2; ++j) { |
| auto i0 = hn::Load(wide_int_tag, |
| &y[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); |
| auto i2 = hn::Load(wide_int_tag, |
| &y[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(hn::ZipLower(widex2_int_tag, i0, i2), widex2_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); |
| hn::Store(hn::ZipUpper(widex2_int_tag, i0, i2), widex2_int_tag, |
| &z[(i + j + 2) * hn::MaxLanes(widex2_int_tag)]); |
| } |
| } |
| for (size_t i = 0; i < 16; i += 8) { |
| for (size_t j = 0; j < 4; ++j) { |
| auto i0 = hn::Load(widex2_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); |
| auto i4 = hn::Load(widex2_int_tag, |
| &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]); |
| hn::Store(hn::InterleaveLower(widex2_int_tag, i0, i4), widex2_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); |
| hn::Store(hn::InterleaveUpper(widex2_int_tag, i0, i4), widex2_int_tag, |
| &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]); |
| } |
| } |
| static constexpr size_t kStoreIndex[] = { 0, 4, 2, 6, 1, 5, 3, 7, |
| 8, 12, 10, 14, 9, 13, 11, 15 }; |
| for (size_t j = 0; j < 8; ++j) { |
| auto i0 = |
| hn::Load(widex2_int_tag, &z[(j + 0) * hn::MaxLanes(widex2_int_tag)]); |
| auto i8 = |
| hn::Load(widex2_int_tag, &z[(j + 8) * hn::MaxLanes(widex2_int_tag)]); |
| hn::StoreU( |
| hn::BitCast(int_tag, hn::ConcatLowerLower(widex2_int_tag, i8, i0)), |
| int_tag, &out[kStoreIndex[j + 0] * outstride]); |
| hn::StoreU( |
| hn::BitCast(int_tag, hn::ConcatUpperUpper(widex2_int_tag, i8, i0)), |
| int_tag, &out[kStoreIndex[j + 8] * outstride]); |
| } |
| } |
| }; |
| |
| template <> |
| struct Transpose16Traits<4> { |
| template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, |
| size_t instride, |
| size_t outstride) { |
| constexpr hn::FixedTag<TIn, 16> int_tag; |
| static_assert(hn::MaxLanes(int_tag) == 16, |
| "32-bit Transpose16 requires an 16-lane int_tag"); |
| constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag; |
| HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> |
| z[16 * hn::MaxLanes(wide_int_tag)]; |
| for (size_t i = 0; i < 16; i += 2) { |
| auto i0 = RectScale<ApplyRectScale>( |
| int_tag, |
| RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride]))); |
| auto i1 = RectScale<ApplyRectScale>( |
| int_tag, |
| RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride]))); |
| hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag, |
| &z[(i + 0) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag, |
| &z[(i + 1) * hn::MaxLanes(wide_int_tag)]); |
| } |
| for (size_t i = 0; i < 16; i += 4) { |
| for (size_t j = 0; j < 2; ++j) { |
| auto i0 = hn::Load(wide_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); |
| auto i2 = hn::Load(wide_int_tag, |
| &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(hn::InterleaveLower(wide_int_tag, i0, i2), wide_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(hn::InterleaveUpper(wide_int_tag, i0, i2), wide_int_tag, |
| &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); |
| } |
| } |
| for (size_t i = 0; i < 16; i += 8) { |
| for (size_t j = 0; j < 4; ++j) { |
| auto i0 = hn::Load(wide_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); |
| auto i4 = hn::Load(wide_int_tag, |
| &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(LocalInterleaveEvenBlocks(wide_int_tag, i0, i4), wide_int_tag, |
| &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); |
| hn::Store(LocalInterleaveOddBlocks(wide_int_tag, i0, i4), wide_int_tag, |
| &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]); |
| } |
| } |
| static constexpr size_t kStoreIndex[] = { 0, 2, 1, 3, 4, 6, 5, 7, |
| 8, 10, 9, 11, 12, 14, 13, 15 }; |
| for (size_t j = 0; j < 8; ++j) { |
| auto i0 = |
| hn::Load(wide_int_tag, &z[(j + 0) * hn::MaxLanes(wide_int_tag)]); |
| auto i8 = |
| hn::Load(wide_int_tag, &z[(j + 8) * hn::MaxLanes(wide_int_tag)]); |
| hn::StoreU( |
| hn::BitCast(int_tag, hn::ConcatLowerLower(wide_int_tag, i8, i0)), |
| int_tag, &out[kStoreIndex[j + 0] * outstride]); |
| hn::StoreU( |
| hn::BitCast(int_tag, hn::ConcatUpperUpper(wide_int_tag, i8, i0)), |
| int_tag, &out[kStoreIndex[j + 8] * outstride]); |
| } |
| } |
| }; |
| |
| template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE void Transpose16(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, size_t instride, |
| size_t outstride) { |
| static_assert(sizeof(TOut) == sizeof(TIn), |
| "Transpose16 does not directly support integer promotion."); |
| Transpose16Traits<sizeof(TIn)>::template Transpose16<Bit, ApplyRectScale>( |
| in, out, instride, outstride); |
| } |
| |
// Primary template for the tiled transpose helpers. It is intentionally
// empty: only the specializations (keyed on the lane count of the working tag
// and on whether TIn differs from TOut) provide a Transpose() member.
template <size_t NumLanes, bool RequiresPromotion>
struct TransposeTraits {
};
| |
| template <> |
| struct TransposeTraits<16, true> { |
| template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, |
| typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, |
| size_t instride, size_t outstride) { |
| constexpr auto int_tag = |
| hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>(); |
| constexpr hn::Rebind<TIn, decltype(int_tag)> input_tag; |
| HWY_ALIGN_MAX hn::TFromD<decltype(int_tag)> p[16 * hn::MaxLanes(int_tag)]; |
| for (size_t r = 0; r < Height; r += 16) { |
| for (size_t c = 0; c < Width; c += 16) { |
| for (size_t i = 0; i < 16; ++i) { |
| hn::Store( |
| hn::PromoteTo(int_tag, |
| hn::Load(input_tag, &in[(r + i) * instride + c])), |
| int_tag, &p[i * hn::MaxLanes(int_tag)]); |
| } |
| Transpose16<Bit, ApplyRectScale>(p, &out[c * outstride + r], |
| hn::MaxLanes(int_tag), outstride); |
| } |
| } |
| } |
| }; |
| |
| template <> |
| struct TransposeTraits<16, false> { |
| template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, |
| typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, |
| size_t instride, size_t outstride) { |
| for (size_t r = 0; r < Height; r += 16) { |
| for (size_t c = 0; c < Width; c += 16) { |
| Transpose16<Bit, ApplyRectScale>(&in[r * instride + c], |
| &out[c * outstride + r], instride, |
| outstride); |
| } |
| } |
| } |
| }; |
| |
| template <bool RequiresPromotion> |
| struct TransposeTraits<8, RequiresPromotion> { |
| template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, |
| typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, |
| size_t instride, size_t outstride) { |
| for (size_t r = 0; r < Height; r += 8) { |
| for (size_t c = 0; c < Width; c += 8) { |
| Transpose8<Bit, ApplyRectScale>(&in[r * instride + c], |
| &out[c * outstride + r], instride, |
| outstride); |
| } |
| } |
| } |
| }; |
| |
| template <bool RequiresPromotion> |
| struct TransposeTraits<4, RequiresPromotion> { |
| template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, |
| typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, |
| size_t instride, size_t outstride) { |
| for (size_t r = 0; r < Height; r += 4) { |
| for (size_t c = 0; c < Width; c += 4) { |
| Transpose4<Bit, ApplyRectScale>(&in[r * instride + c], |
| &out[c * outstride + r], instride, |
| outstride); |
| } |
| } |
| } |
| }; |
| |
| template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, |
| typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE void Transpose(const TIn *HWY_RESTRICT in, |
| TOut *HWY_RESTRICT out, size_t instride, |
| size_t outstride) { |
| constexpr auto int_tag = |
| hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>(); |
| TransposeTraits<hn::MaxLanes(int_tag), !std::is_same<TIn, TOut>::value>:: |
| template Transpose<Width, Height, Bit, ApplyRectScale>(in, out, instride, |
| outstride); |
| } |
| |
| template <size_t Width, size_t Height, int Shift, bool ApplyRectScale, |
| typename TIn, typename TOut> |
| HWY_ATTR HWY_INLINE void StoreBlock(const TIn *HWY_RESTRICT in, size_t instride, |
| TOut *HWY_RESTRICT out, size_t outstride) { |
| constexpr hn::CappedTag<TIn, Width> load_tag; |
| for (size_t r = 0; r < Height; ++r) { |
| for (size_t c = 0; c < Width; c += hn::MaxLanes(load_tag)) { |
| auto v = RectScale<ApplyRectScale>( |
| load_tag, RoundShift<Shift>( |
| load_tag, hn::Load(load_tag, &in[r * instride + c]))); |
| MaybePromoteTraits<std::is_same<TIn, TOut>::value>::PromoteStore2( |
| load_tag, v, &out[r * outstride + c]); |
| } |
| } |
| } |
| |
| template <int8_t Shift, size_t Width, bool FlipLeftRight, typename TInput, |
| typename TIn> |
| HWY_ATTR HWY_INLINE void LoadLine(const TInput *HWY_RESTRICT input, |
| TIn *HWY_RESTRICT in) { |
| constexpr hn::CappedTag<TIn, Width> store_tag; |
| constexpr hn::Rebind<TInput, decltype(store_tag)> load_tag; |
| for (size_t x = 0; x < Width / hn::MaxLanes(load_tag); ++x) { |
| auto v = hn::LoadU(load_tag, &input[x * hn::MaxLanes(load_tag)]); |
| if CONSTEXPR_IF (FlipLeftRight) { |
| v = hn::Reverse(load_tag, v); |
| } |
| auto vp = MaybePromoteTo(store_tag, v); |
| hn::Store( |
| hn::ShiftLeft<Shift>(vp), store_tag, |
| &in[(FlipLeftRight ? (Width / hn::MaxLanes(store_tag)) - x - 1 : x) * |
| hn::MaxLanes(store_tag)]); |
| } |
| } |
| |
| template <int8_t Shift, size_t Width, size_t OutStride, size_t Height, |
| bool FlipUpDown, bool FlipLeftRight, typename TInput, typename TIn> |
| HWY_ATTR HWY_INLINE void LoadBuffer(const TInput *HWY_RESTRICT input, |
| TIn *HWY_RESTRICT in, size_t stride) { |
| for (size_t y = 0; y < Height; ++y) { |
| LoadLine<Shift, Width, FlipLeftRight>( |
| input + y * stride, &in[(FlipUpDown ? Height - y - 1 : y) * OutStride]); |
| } |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, |
| typename T> |
| HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform4(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| switch (tx_type) { |
| case DCT_1D: FdctNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break; |
| case IDTX_1D: |
| IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 1>(in, cos_bit); |
| break; |
| default: FadstNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break; |
| } |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, |
| typename T> |
| HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform8(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| switch (tx_type) { |
| case DCT_1D: FdctNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break; |
| case IDTX_1D: |
| IdtxAdd2Block<TransformWidth, BlockWidth, BlockHeight>(in, cos_bit); |
| break; |
| default: FadstNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break; |
| } |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, |
| typename T> |
| HWY_ATTR HWY_INLINE void Transform16(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| static const Transform1D<T> kTransform[] = { |
| FdctNx16Block<TransformWidth, BlockWidth, T>, // DCT_1D |
| FadstNx16Block<TransformWidth, BlockWidth, T>, // ADST_1D |
| FadstNx16Block<TransformWidth, BlockWidth, T>, // FLIPADST_1D |
| IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 2, T>, // IDTX_1D |
| }; |
| kTransform[tx_type](in, cos_bit); |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, |
| typename T> |
| HWY_ATTR HWY_INLINE void Transform32(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| static const Transform1D<T> kTransform[] = { |
| FdctNx32Block<TransformWidth, BlockWidth, T>, // DCT_1D |
| TransformFail<T>, // ADST_1D |
| TransformFail<T>, // FLIPADST_1D |
| IdtxShiftBlock<TransformWidth, BlockWidth, BlockHeight, 2, T>, // IDTX_1D |
| }; |
| kTransform[tx_type](in, cos_bit); |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, typename T> |
| HWY_ATTR HWY_INLINE void TransformFull64(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| (void)tx_type; |
| assert(tx_type == DCT_1D); |
| FdctNx64Block<TransformWidth, BlockWidth, TransformWidth, BlockWidth>( |
| in, cos_bit); |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight, |
| size_t BlockHeight, typename T> |
| HWY_ATTR HWY_INLINE void TransformBelow32(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| if CONSTEXPR_IF (TransformHeight == 4) { |
| Transform4<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); |
| } else if CONSTEXPR_IF (TransformHeight == 8) { |
| Transform8<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); |
| } else if CONSTEXPR_IF (TransformHeight == 16) { |
| Transform16<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); |
| } else if CONSTEXPR_IF (TransformHeight == 32) { |
| Transform32<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); |
| } else { |
| assert(false && "Unsupported transform size."); |
| } |
| } |
| |
| template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight, |
| size_t BlockHeight, typename T> |
| HWY_ATTR HWY_INLINE void RowTransform(TX_TYPE_1D tx_type, T *in, |
| int8_t cos_bit) { |
| if CONSTEXPR_IF (TransformWidth == 64 && TransformHeight == 64) { |
| assert(tx_type == DCT_1D); |
| // 64x64 only writes 32x32 of coefficients. |
| FdctNx64Block<TransformWidth, BlockWidth, 32, 32>(in, cos_bit); |
| } else if CONSTEXPR_IF (TransformHeight == 64) { |
| TransformFull64<TransformWidth, BlockWidth>(tx_type, in, cos_bit); |
| } else { |
| TransformBelow32<TransformWidth, BlockWidth, TransformHeight, BlockHeight>( |
| tx_type, in, cos_bit); |
| } |
| } |
| |
| template <TX_SIZE TxSize, typename T> |
| HWY_ATTR HWY_MAYBE_UNUSED void ForwardTransform2D(const int16_t *input, |
| int32_t *output, |
| size_t stride, |
| TX_TYPE tx_type) { |
| constexpr size_t kWidth = kTxSizeWide[TxSize]; |
| constexpr size_t kHeight = kTxSizeHigh[TxSize]; |
| // Ensure the storage is aligned to the architecture's block width. |
| constexpr size_t kMinVectorSize = |
| hn::BlockDFromD<hn::ScalableTag<T>>().MaxBytes() / sizeof(uint8_t); |
| constexpr size_t kBlockWidth = AOMMAX(kMinVectorSize / sizeof(T), kWidth); |
| constexpr size_t kBlockHeight = AOMMAX(kMinVectorSize / sizeof(T), kHeight); |
| HWY_ALIGN_MAX T buf0[kBlockWidth * kBlockHeight]; |
| constexpr bool kBigRectangle = (kBlockWidth == 64 && kBlockHeight >= 32) || |
| (kBlockWidth >= 32 && kBlockHeight == 64); |
| using T2 = typename std::conditional<kBigRectangle, int32_t, T>::type; |
| HWY_ALIGN_MAX T2 buf1[kBlockWidth * kBlockHeight]; |
| constexpr int8_t kShift[3] = { kForwardTransformShift[TxSize][0], |
| kForwardTransformShift[TxSize][1], |
| kForwardTransformShift[TxSize][2] }; |
| constexpr int kTransformWidthIndex = GetTxwIndex(TxSize); |
| constexpr int kTransformHeightIndex = GetTxhIndex(TxSize); |
| constexpr int8_t cos_bit_col = |
| kForwardCosBitCol[kTransformWidthIndex][kTransformHeightIndex]; |
| constexpr int8_t cos_bit_row = |
| kForwardCosBitRow[kTransformWidthIndex][kTransformHeightIndex]; |
| const TX_TYPE_1D vertical_transform = vtx_tab[tx_type]; |
| const TX_TYPE_1D horizontal_transform = htx_tab[tx_type]; |
| constexpr bool kApplyRectScale = kApplyRectScaleList[TxSize]; |
| switch ((vertical_transform == FLIPADST_1D ? 1 : 0) | |
| (horizontal_transform == FLIPADST_1D ? 2 : 0)) { |
| case 0: |
| LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, false>( |
| input, buf0, stride); |
| break; |
| case 1: |
| LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, false>( |
| input, buf0, stride); |
| break; |
| case 2: |
| LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, true>( |
| input, buf0, stride); |
| break; |
| case 3: |
| LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, true>( |
| input, buf0, stride); |
| break; |
| } |
| if CONSTEXPR_IF (kHeight == 64) { |
| TransformFull64<kWidth, kBlockWidth>(vertical_transform, buf0, cos_bit_col); |
| } else { |
| TransformBelow32<kWidth, kBlockWidth, kHeight, kBlockHeight>( |
| vertical_transform, buf0, cos_bit_col); |
| } |
| Transpose<kWidth, kHeight, kShift[1], false>(buf0, buf1, kBlockWidth, |
| kBlockHeight); |
| if CONSTEXPR_IF (kWidth == 64 && kHeight == 64) { |
| // 64x64 only writes 32x32 of coefficients. |
| assert(tx_type == DCT_1D); |
| FdctNx64Block<kHeight, kBlockHeight, 32, 32>(buf1, cos_bit_row); |
| StoreBlock<32, 32, kShift[2], kApplyRectScale>(buf1, 32, output, 32); |
| } else if CONSTEXPR_IF (kHeight == 64 && (kWidth == 16 || kWidth == 32)) { |
| // 32x64 and 16x64 coefficients are packed into Wx32, discarding the |
| // right-most results. |
| RowTransform<32, kBlockHeight, kWidth, kBlockWidth>(horizontal_transform, |
| buf1, cos_bit_row); |
| StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight, |
| output, 32); |
| } else { |
| RowTransform<kHeight, kBlockHeight, kWidth, kBlockWidth>( |
| horizontal_transform, buf1, cos_bit_row); |
| StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight, |
| output, kHeight); |
| } |
| if CONSTEXPR_IF (kHeight <= 16 && kWidth == 64) { |
| hwy::ZeroBytes<kHeight * 32 * sizeof(*output)>(output + kHeight * 32); |
| } |
| } |
| |
| HWY_MAYBE_UNUSED void LowBitdepthForwardTransform2D(const int16_t *src_diff, |
| tran_low_t *coeff, |
| int diff_stride, |
| TxfmParam *txfm_param) { |
| if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { |
| assert(txfm_param->tx_type == DCT_DCT); |
| av1_fwht4x4(src_diff, coeff, diff_stride); |
| return; |
| } |
| using TransformFunction = decltype(&ForwardTransform2D<TX_4X4, int16_t>); |
| constexpr TransformFunction kTable[] = { |
| #define POINTER(w, h, _) &ForwardTransform2D<TX_##w##X##h, int16_t>, |
| FOR_EACH_TXFM2D(POINTER, _) |
| #undef POINTER |
| }; |
| kTable[txfm_param->tx_size](src_diff, coeff, diff_stride, |
| txfm_param->tx_type); |
| } |
| |
| } // namespace HWY_NAMESPACE |
| } // namespace |
| |
| HWY_AFTER_NAMESPACE(); |
| |
// Declares and defines the C-linkage function av1_fwd_txfm2d_<w>x<h>_<suffix>,
// which forwards to ForwardTransform2D<TX_<w>X<h>, int32_t> in the current
// HWY_NAMESPACE. The `bd` argument is accepted but not used.
#define MAKE_HIGHBD_TXFM2D(w, h, suffix)                                  \
  extern "C" void av1_fwd_txfm2d_##w##x##h##_##suffix(                    \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd);                                                            \
  HWY_ATTR void av1_fwd_txfm2d_##w##x##h##_##suffix(                      \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd) {                                                           \
    HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int32_t>(             \
        input, output, static_cast<size_t>(stride), tx_type);             \
    (void)bd;                                                             \
  }
| |
// Declares and defines the C-linkage function
// av1_lowbd_fwd_txfm2d_<w>x<h>_<suffix>, which forwards to
// ForwardTransform2D<TX_<w>X<h>, int16_t> in the current HWY_NAMESPACE. The
// `bd` argument is accepted but not used.
#define MAKE_LOWBD_TXFM2D(w, h, suffix)                                   \
  extern "C" void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix(              \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd);                                                            \
  HWY_ATTR void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix(                \
      const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type, \
      int bd) {                                                           \
    HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int16_t>(             \
        input, output, static_cast<size_t>(stride), tx_type);             \
    (void)bd;                                                             \
  }
| |
// Declares and defines the C-linkage function av1_lowbd_fwd_txfm_<suffix>,
// which forwards all of its arguments to LowBitdepthForwardTransform2D in the
// current HWY_NAMESPACE.
#define MAKE_LOWBD_TXFM2D_DISPATCH(suffix)                            \
  extern "C" void av1_lowbd_fwd_txfm_##suffix(                        \
      const int16_t *src_diff, tran_low_t *coeff, int diff_stride,    \
      TxfmParam *txfm_param);                                         \
  HWY_ATTR void av1_lowbd_fwd_txfm_##suffix(                          \
      const int16_t *src_diff, tran_low_t *coeff, int diff_stride,    \
      TxfmParam *txfm_param) {                                        \
    HWY_NAMESPACE::LowBitdepthForwardTransform2D(                     \
        src_diff, coeff, diff_stride, txfm_param);                    \
  }
| |
| #endif // AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ |