| /* | 
 |  * Copyright (c) 2025, Alliance for Open Media. All rights reserved. | 
 |  * | 
 |  * This source code is subject to the terms of the BSD 2 Clause License and | 
 |  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License | 
 |  * was not distributed with this source code in the LICENSE file, you can | 
 |  * obtain it at www.aomedia.org/license/software. If the Alliance for Open | 
 |  * Media Patent License 1.0 was not distributed with this source code in the | 
 |  * PATENTS file, you can obtain it at www.aomedia.org/license/patent. | 
 |  */ | 
 |  | 
 | #ifndef AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ | 
 | #define AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ | 
 |  | 
 | #include <stdint.h> | 
 |  | 
 | #include "config/aom_config.h" | 
 | #include "config/av1_rtcd.h" | 
 | #include "third_party/highway/hwy/highway.h" | 
 | #include "aom_dsp/txfm_common.h" | 
 | #include "av1/common/av1_txfm.h" | 
 | #include "av1/common/enums.h" | 
 | #include "av1/encoder/av1_fwd_txfm1d.h" | 
 | #include "av1/encoder/av1_fwd_txfm1d_cfg.h" | 
 |  | 
 | #define FOR_EACH_TXFM2D(X, suffix) \ | 
 |   X(4, 4, suffix)                  \ | 
 |   X(8, 8, suffix)                  \ | 
 |   X(16, 16, suffix)                \ | 
 |   X(32, 32, suffix)                \ | 
 |   X(64, 64, suffix)                \ | 
 |   X(4, 8, suffix)                  \ | 
 |   X(8, 4, suffix)                  \ | 
 |   X(8, 16, suffix)                 \ | 
 |   X(16, 8, suffix)                 \ | 
 |   X(16, 32, suffix)                \ | 
 |   X(32, 16, suffix)                \ | 
 |   X(32, 64, suffix)                \ | 
 |   X(64, 32, suffix)                \ | 
 |   X(4, 16, suffix)                 \ | 
 |   X(16, 4, suffix)                 \ | 
 |   X(8, 32, suffix)                 \ | 
 |   X(32, 8, suffix)                 \ | 
 |   X(16, 64, suffix)                \ | 
 |   X(64, 16, suffix) | 
 |  | 
 | #if HWY_CXX_LANG >= 201703L | 
 | #define CONSTEXPR_IF constexpr | 
 | #else | 
 | #define CONSTEXPR_IF | 
 | #endif | 
 |  | 
 | HWY_BEFORE_NAMESPACE(); | 
 |  | 
 | namespace { | 
 | namespace HWY_NAMESPACE { | 
 |  | 
 | namespace hn = hwy::HWY_NAMESPACE; | 
 |  | 
 | constexpr int8_t kForwardTransformShift[TX_SIZES_ALL][3] = { | 
 |   { 2, 0, 0 },    // | 
 |   { 2, -1, 0 },   // | 
 |   { 2, -2, 0 },   // | 
 |   { 2, -4, 0 },   // | 
 |   { 0, -2, -2 },  // | 
 |   { 2, -1, 0 },   // | 
 |   { 2, -1, 0 },   // | 
 |   { 2, -2, 0 },   // | 
 |   { 2, -2, 0 },   // | 
 |   { 2, -4, 0 },   // | 
 |   { 2, -4, 0 },   // | 
 |   { 0, -2, -2 },  // | 
 |   { 2, -4, -2 },  // | 
 |   { 2, -1, 0 },   // | 
 |   { 2, -1, 0 },   // | 
 |   { 2, -2, 0 },   // | 
 |   { 2, -2, 0 },   // | 
 |   { 0, -2, 0 },   // | 
 |   { 2, -4, 0 },   // | 
 | }; | 
 |  | 
 | constexpr int kTxSizeWideLog2[TX_SIZES_ALL] = { | 
 |   2, 3, 4, 5, 6, 2, 3, 3, 4, 4, 5, 5, 6, 2, 4, 3, 5, 4, 6, | 
 | }; | 
 |  | 
 | // Transform block height in log2 | 
 | constexpr int kTxSizeHighLog2[TX_SIZES_ALL] = { | 
 |   2, 3, 4, 5, 6, 3, 2, 4, 3, 5, 4, 6, 5, 4, 2, 5, 3, 6, 4, | 
 | }; | 
 |  | 
 | constexpr bool kApplyRectScaleList[TX_SIZES_ALL] = { | 
 |   false, false, false, false, false, true,  true,  true,  true,  true, | 
 |   true,  true,  true,  false, false, false, false, false, false, | 
 | }; | 
 |  | 
 | constexpr int8_t kForwardCosBitCol[MAX_TXWH_IDX /*txw_idx*/] | 
 |                                   [MAX_TXWH_IDX /*txh_idx*/] = { | 
 |                                     { 13, 13, 13, 0, 0 }, | 
 |                                     { 13, 13, 13, 12, 0 }, | 
 |                                     { 13, 13, 13, 12, 13 }, | 
 |                                     { 0, 13, 13, 12, 13 }, | 
 |                                     { 0, 0, 13, 12, 13 } | 
 |                                   }; | 
 |  | 
 | constexpr int8_t kForwardCosBitRow[MAX_TXWH_IDX /*txw_idx*/] | 
 |                                   [MAX_TXWH_IDX /*txh_idx*/] = { | 
 |                                     { 13, 13, 12, 0, 0 }, | 
 |                                     { 13, 13, 13, 12, 0 }, | 
 |                                     { 13, 13, 12, 13, 12 }, | 
 |                                     { 0, 12, 13, 12, 11 }, | 
 |                                     { 0, 0, 12, 11, 10 } | 
 |                                   }; | 
 |  | 
 | // Transform block width in pixels | 
 | constexpr int8_t kTxSizeWide[TX_SIZES_ALL] = { | 
 |   4, 8, 16, 32, 64, 4, 8, 8, 16, 16, 32, 32, 64, 4, 16, 8, 32, 16, 64, | 
 | }; | 
 |  | 
 | // Transform block height in pixels | 
 | constexpr int8_t kTxSizeHigh[TX_SIZES_ALL] = { | 
 |   4, 8, 16, 32, 64, 8, 4, 16, 8, 32, 16, 64, 32, 16, 4, 32, 8, 64, 16, | 
 | }; | 
 |  | 
 | constexpr int GetTxwIndex(TX_SIZE tx_size) { | 
 |   return kTxSizeWideLog2[tx_size] - kTxSizeWideLog2[0]; | 
 | } | 
 |  | 
 | constexpr int GetTxhIndex(TX_SIZE tx_size) { | 
 |   return kTxSizeHighLog2[tx_size] - kTxSizeHighLog2[0]; | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> SetPair(D int_tag, int a, int b) { | 
 |   return hn::BitCast( | 
 |       int_tag, | 
 |       hn::Set(hn::RepartitionToWide<D>(), | 
 |               static_cast<int32_t>( | 
 |                   static_cast<uint16_t>(a) | | 
 |                   (static_cast<uint32_t>(static_cast<uint16_t>(b)) << 16)))); | 
 | } | 
 |  | 
 | template <size_t LaneSize> | 
 | struct ButterflyTraits {}; | 
 |  | 
 | template <> | 
 | struct ButterflyTraits<2> { | 
 |   template <typename D> | 
 |   HWY_ATTR HWY_INLINE static void Whole( | 
 |       D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, | 
 |       const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0, | 
 |       hn::TFromD<D> *HWY_RESTRICT out1, int bit, | 
 |       hn::VFromD<hn::Repartition<int32_t, D>> round) { | 
 |     constexpr hn::RepartitionToWide<D> int32_tag; | 
 |     const auto ww0 = SetPair(int_tag, w0, w1); | 
 |     const auto ww1 = SetPair(int_tag, w1, -w0); | 
 |     const auto i0 = hn::Load(int_tag, in0); | 
 |     const auto i1 = hn::Load(int_tag, in1); | 
 |     const auto t0 = hn::InterleaveLower(int_tag, i0, i1); | 
 |     const auto t1 = hn::InterleaveUpper(int_tag, i0, i1); | 
 |     const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0); | 
 |     const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0); | 
 |     const auto v0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww1); | 
 |     const auto v1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww1); | 
 |     const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit); | 
 |     const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit); | 
 |     const auto d0 = hn::ShiftRightSame(hn::Add(v0, round), bit); | 
 |     const auto d1 = hn::ShiftRightSame(hn::Add(v1, round), bit); | 
 |     hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out0); | 
 |     hn::Store(hn::ReorderDemote2To(int_tag, d0, d1), int_tag, out1); | 
 |   } | 
 |  | 
 |   template <typename D> | 
 |   HWY_ATTR HWY_INLINE static void Half( | 
 |       D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, | 
 |       const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out, | 
 |       int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) { | 
 |     constexpr hn::RepartitionToWide<D> int32_tag; | 
 |     const auto i0 = hn::Load(int_tag, in0); | 
 |     const auto i1 = hn::Load(int_tag, in1); | 
 |     const auto t0 = hn::InterleaveLower(int_tag, i0, i1); | 
 |     const auto t1 = hn::InterleaveUpper(int_tag, i0, i1); | 
 |     const auto ww0 = SetPair(int_tag, w0, w1); | 
 |     const auto u0 = hn::WidenMulPairwiseAdd(int32_tag, t0, ww0); | 
 |     const auto u1 = hn::WidenMulPairwiseAdd(int32_tag, t1, ww0); | 
 |     const auto c0 = hn::ShiftRightSame(hn::Add(u0, round), bit); | 
 |     const auto c1 = hn::ShiftRightSame(hn::Add(u1, round), bit); | 
 |     hn::Store(hn::ReorderDemote2To(int_tag, c0, c1), int_tag, out); | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct ButterflyTraits<4> { | 
 |   template <typename D> | 
 |   HWY_ATTR HWY_INLINE static void Whole( | 
 |       D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, | 
 |       const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0, | 
 |       hn::TFromD<D> *HWY_RESTRICT out1, int bit, | 
 |       hn::VFromD<hn::Repartition<int32_t, D>> round) { | 
 |     const auto i0 = hn::Load(int_tag, in0); | 
 |     const auto i1 = hn::Load(int_tag, in1); | 
 |     const auto ww0 = hn::Set(int_tag, w0); | 
 |     const auto ww1 = hn::Set(int_tag, w1); | 
 |     const auto in1_w1 = hn::Mul(i1, ww1); | 
 |     const auto o0 = hn::MulAdd(i0, ww0, in1_w1); | 
 |     hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out0); | 
 |     const auto in1_w0 = hn::Mul(i1, ww0); | 
 |     const auto o1 = hn::MulSub(i0, ww1, in1_w0); | 
 |     hn::Store(hn::ShiftRightSame(hn::Add(o1, round), bit), int_tag, out1); | 
 |   } | 
 |  | 
 |   template <typename D> | 
 |   HWY_ATTR HWY_INLINE static void Half( | 
 |       D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, | 
 |       const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out, | 
 |       int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) { | 
 |     const auto i0 = hn::Load(int_tag, in0); | 
 |     const auto i1 = hn::Load(int_tag, in1); | 
 |     const auto ww0 = hn::Set(int_tag, w0); | 
 |     const auto ww1 = hn::Set(int_tag, w1); | 
 |     const auto in1_w1 = hn::Mul(i1, ww1); | 
 |     const auto o0 = hn::MulAdd(i0, ww0, in1_w1); | 
 |     hn::Store(hn::ShiftRightSame(hn::Add(o0, round), bit), int_tag, out); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void Butterfly( | 
 |     D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, | 
 |     const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out0, | 
 |     hn::TFromD<D> *HWY_RESTRICT out1, int bit, | 
 |     hn::VFromD<hn::Repartition<int32_t, D>> round) { | 
 |   ButterflyTraits<sizeof(hn::TFromD<D>)>::Whole(int_tag, w0, w1, in0, in1, out0, | 
 |                                                 out1, bit, round); | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void HalfButterfly( | 
 |     D int_tag, int w0, int w1, const hn::TFromD<D> *HWY_RESTRICT in0, | 
 |     const hn::TFromD<D> *HWY_RESTRICT in1, hn::TFromD<D> *HWY_RESTRICT out, | 
 |     int bit, hn::VFromD<hn::Repartition<int32_t, D>> round) { | 
 |   ButterflyTraits<sizeof(hn::TFromD<D>)>::Half(int_tag, w0, w1, in0, in1, out, | 
 |                                                bit, round); | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void AddSub(D int_tag, const hn::TFromD<D> *in0, | 
 |                                 const hn::TFromD<D> *in1, | 
 |                                 hn::TFromD<D> *out_add, | 
 |                                 hn::TFromD<D> *out_sub) { | 
 |   const auto i0 = hn::Load(int_tag, in0); | 
 |   const auto i1 = hn::Load(int_tag, in1); | 
 |   if CONSTEXPR_IF (sizeof(hn::TFromD<D>) == 2) { | 
 |     hn::Store(hn::SaturatedAdd(i0, i1), int_tag, out_add); | 
 |     hn::Store(hn::SaturatedSub(i0, i1), int_tag, out_sub); | 
 |   } else { | 
 |     hn::Store(hn::Add(i0, i1), int_tag, out_add); | 
 |     hn::Store(hn::Sub(i0, i1), int_tag, out_sub); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t LaneSize, size_t NumLanes> | 
 | struct Fdct4Traits { | 
 |   template <typename D> | 
 |   HWY_ATTR HWY_INLINE static void Fdct4(D int_tag, | 
 |                                         hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                         const int8_t cos_bit, size_t instride) { | 
 |     using T = hn::TFromD<D>; | 
 |     constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |     HWY_ALIGN_MAX T buf0[4 * kNumLanes]; | 
 |     const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |     constexpr hn::Repartition<int32_t, D> int32_tag; | 
 |     const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); | 
 |     AddSub(int_tag, &in[0 * instride], &in[3 * instride], &buf0[0 * kNumLanes], | 
 |            &buf0[3 * kNumLanes]); | 
 |     AddSub(int_tag, &in[1 * instride], &in[2 * instride], &buf0[1 * kNumLanes], | 
 |            &buf0[2 * kNumLanes]); | 
 |     Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], | 
 |               &buf0[1 * kNumLanes], &in[0 * instride], &in[2 * instride], | 
 |               cos_bit, round); | 
 |     Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], | 
 |               &buf0[2 * kNumLanes], &in[1 * instride], &in[3 * instride], | 
 |               cos_bit, round); | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct Fdct4Traits<2, 4> { | 
 |   template <typename D> | 
 |   HWY_ATTR HWY_INLINE static void Fdct4(D int_tag, | 
 |                                         hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                         const int8_t cos_bit, size_t instride) { | 
 |     const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |     constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag; | 
 |     constexpr hn::Repartition<int32_t, decltype(demote_tag)> int32_tag; | 
 |     const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); | 
 |     const auto cospi_p32_p32 = SetPair(int_tag, cospi[32], cospi[32]); | 
 |     const auto cospi_p32_m32 = SetPair(int_tag, cospi[32], -cospi[32]); | 
 |     const auto cospi_p16_p48 = SetPair(int_tag, cospi[16], cospi[48]); | 
 |     const auto cospi_p48_m16 = SetPair(int_tag, cospi[48], -cospi[16]); | 
 |     const auto i0 = hn::Load(int_tag, &in[0 * instride]); | 
 |     const auto i1 = hn::Load(int_tag, &in[1 * instride]); | 
 |     const auto i2 = hn::Load(int_tag, &in[2 * instride]); | 
 |     const auto i3 = hn::Load(int_tag, &in[3 * instride]); | 
 |     const auto u0 = hn::InterleaveLower(int_tag, i0, i1); | 
 |     const auto u1 = hn::InterleaveLower(int_tag, i3, i2); | 
 |     const auto v0 = hn::Add(u0, u1); | 
 |     const auto v1 = hn::Sub(u0, u1); | 
 |     const auto x0 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_p32); | 
 |     const auto x1 = hn::WidenMulPairwiseAdd(int32_tag, v0, cospi_p32_m32); | 
 |     const auto x2 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p16_p48); | 
 |     const auto x3 = hn::WidenMulPairwiseAdd(int32_tag, v1, cospi_p48_m16); | 
 |     const auto v0w0 = hn::ShiftRightSame(hn::Add(x0, round), cos_bit); | 
 |     const auto v0w1 = hn::ShiftRightSame(hn::Add(x1, round), cos_bit); | 
 |     const auto v1w0 = hn::ShiftRightSame(hn::Add(x2, round), cos_bit); | 
 |     const auto v1w1 = hn::ShiftRightSame(hn::Add(x3, round), cos_bit); | 
 |     const auto o0 = hn::ReorderDemote2To(demote_tag, v0w0, v0w1); | 
 |     const auto o1 = hn::ReorderDemote2To(demote_tag, v1w0, v1w1); | 
 |     hn::Store(o0, demote_tag, &in[0 * instride]); | 
 |     hn::Store(o1, demote_tag, &in[1 * instride]); | 
 |     hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag, | 
 |               &in[2 * instride]); | 
 |     hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag, | 
 |               &in[3 * instride]); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void Fdct4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                const int8_t cos_bit, size_t instride) { | 
 |   Fdct4Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(int_tag)>::Fdct4( | 
 |       int_tag, in, cos_bit, instride); | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void Fdct8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                const int8_t cos_bit, size_t instride) { | 
 |   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes]; | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes]; | 
 |   const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |   const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); | 
 |  | 
 |   // Even 8 points 0, 2, ..., 14 | 
 |   // stage 0 | 
 |   // stage 1 | 
 |   // buf0/buf1 | 
 |   AddSub(int_tag, &in[0 * instride], &in[7 * instride], &buf0[0 * kNumLanes], | 
 |          &buf1[7 * kNumLanes]); | 
 |   // buf0/buf0 | 
 |   AddSub(int_tag, &in[1 * instride], &in[6 * instride], &buf0[1 * kNumLanes], | 
 |          &buf0[6 * kNumLanes]); | 
 |   // buf0/buf0 | 
 |   AddSub(int_tag, &in[2 * instride], &in[5 * instride], &buf0[2 * kNumLanes], | 
 |          &buf0[5 * kNumLanes]); | 
 |   // buf0/buf1 | 
 |   AddSub(int_tag, &in[3 * instride], &in[4 * instride], &buf0[3 * kNumLanes], | 
 |          &buf1[4 * kNumLanes]); | 
 |  | 
 |   // stage 2 | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes], | 
 |            &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf0[5 * kNumLanes], | 
 |             &buf0[6 * kNumLanes], &buf1[5 * kNumLanes], &buf1[6 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 3 | 
 |   // type 0 | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf1[0 * kNumLanes], | 
 |             &buf1[1 * kNumLanes], &in[0 * instride], &in[4 * instride], cos_bit, | 
 |             round); | 
 |  | 
 |   // type 1 | 
 |   Butterfly(int_tag, cospi[16], cospi[48], &buf1[3 * kNumLanes], | 
 |             &buf1[2 * kNumLanes], &in[2 * instride], &in[6 * instride], cos_bit, | 
 |             round); | 
 |  | 
 |   AddSub(int_tag, &buf1[4 * kNumLanes], &buf1[5 * kNumLanes], | 
 |          &buf0[4 * kNumLanes], &buf0[5 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[7 * kNumLanes], &buf1[6 * kNumLanes], | 
 |          &buf0[7 * kNumLanes], &buf0[6 * kNumLanes]); | 
 |  | 
 |   // stage 4 | 
 |   // stage 5 | 
 |   Butterfly(int_tag, cospi[8], cospi[56], &buf0[7 * kNumLanes], | 
 |             &buf0[4 * kNumLanes], &in[1 * instride], &in[7 * instride], cos_bit, | 
 |             round); | 
 |   Butterfly(int_tag, cospi[40], cospi[24], &buf0[6 * kNumLanes], | 
 |             &buf0[5 * kNumLanes], &in[5 * instride], &in[3 * instride], cos_bit, | 
 |             round); | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void Fdct16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                 const int8_t cos_bit, size_t instride) { | 
 |   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes]; | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes]; | 
 |   const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |   const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); | 
 |  | 
 |   // Calculate the column 0, 1, 2, 3 | 
 |   // stage 0 | 
 |   // stage 1 | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     AddSub(int_tag, &in[i * instride], &in[(15 - i) * instride], | 
 |            &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 2 | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes], | 
 |            &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes], | 
 |             &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes], | 
 |             &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 3 | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes], | 
 |            &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], | 
 |             &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], | 
 |            &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], | 
 |            &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 4 | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], | 
 |             &buf0[1 * kNumLanes], &in[0 * instride], &in[8 * instride], cos_bit, | 
 |             round); | 
 |  | 
 |   Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], | 
 |             &buf0[2 * kNumLanes], &in[4 * instride], &in[12 * instride], | 
 |             cos_bit, round); | 
 |  | 
 |   AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], | 
 |          &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], | 
 |          &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); | 
 |  | 
 |   Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], | 
 |             &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], | 
 |             &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 5 | 
 |   Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], | 
 |             &buf1[4 * kNumLanes], &in[2 * instride], &in[14 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], | 
 |             &buf1[5 * kNumLanes], &in[10 * instride], &in[6 * instride], | 
 |             cos_bit, round); | 
 |  | 
 |   AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], | 
 |          &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], | 
 |          &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], | 
 |          &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], | 
 |          &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); | 
 |  | 
 |   // stage 6 | 
 |   Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], | 
 |             &buf0[8 * kNumLanes], &in[1 * instride], &in[15 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], | 
 |             &buf0[9 * kNumLanes], &in[9 * instride], &in[7 * instride], cos_bit, | 
 |             round); | 
 |   Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], | 
 |             &buf0[10 * kNumLanes], &in[5 * instride], &in[11 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], | 
 |             &buf0[11 * kNumLanes], &in[13 * instride], &in[3 * instride], | 
 |             cos_bit, round); | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE void Fdct32(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                 const int8_t cos_bit, size_t instride) { | 
 |   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf0[32 * kNumLanes]; | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf1[32 * kNumLanes]; | 
 |   const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |   const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); | 
 |   // stage 0 | 
 |   // stage 1 | 
 |   for (size_t i = 0; i < 16; ++i) { | 
 |     AddSub(int_tag, &in[i * instride], &in[(31 - i) * instride], | 
 |            &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 2 | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], | 
 |            &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[20 * kNumLanes], | 
 |             &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[21 * kNumLanes], | 
 |             &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[22 * kNumLanes], | 
 |             &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[23 * kNumLanes], | 
 |             &buf1[24 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 3 | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(7 - i) * kNumLanes], | 
 |            &buf1[i * kNumLanes], &buf1[(7 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf0[10 * kNumLanes], | 
 |             &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf0[11 * kNumLanes], | 
 |             &buf0[12 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], | 
 |            &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], | 
 |            &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 4 | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(3 - i) * kNumLanes], | 
 |            &buf0[i * kNumLanes], &buf0[(3 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], | 
 |             &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], | 
 |            &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], | 
 |            &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   Butterfly(int_tag, -cospi[16], cospi[48], &buf1[18 * kNumLanes], | 
 |             &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[16], cospi[48], &buf1[19 * kNumLanes], | 
 |             &buf1[28 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[20 * kNumLanes], | 
 |             &buf1[27 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[21 * kNumLanes], | 
 |             &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 5 | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], | 
 |             &buf0[1 * kNumLanes], &in[0 * instride], &in[16 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], | 
 |             &buf0[2 * kNumLanes], &in[8 * instride], &in[24 * instride], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], | 
 |          &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], | 
 |          &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); | 
 |   Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], | 
 |             &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], | 
 |             &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[19 * kNumLanes], | 
 |          &buf1[16 * kNumLanes], &buf1[19 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[17 * kNumLanes], &buf0[18 * kNumLanes], | 
 |          &buf1[17 * kNumLanes], &buf1[18 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[20 * kNumLanes], | 
 |          &buf1[23 * kNumLanes], &buf1[20 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[22 * kNumLanes], &buf0[21 * kNumLanes], | 
 |          &buf1[22 * kNumLanes], &buf1[21 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[27 * kNumLanes], | 
 |          &buf1[24 * kNumLanes], &buf1[27 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[25 * kNumLanes], &buf0[26 * kNumLanes], | 
 |          &buf1[25 * kNumLanes], &buf1[26 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[28 * kNumLanes], | 
 |          &buf1[31 * kNumLanes], &buf1[28 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[30 * kNumLanes], &buf0[29 * kNumLanes], | 
 |          &buf1[30 * kNumLanes], &buf1[29 * kNumLanes]); | 
 |  | 
 |   // stage 6 | 
 |   Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], | 
 |             &buf1[4 * kNumLanes], &in[4 * instride], &in[28 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], | 
 |             &buf1[5 * kNumLanes], &in[20 * instride], &in[12 * instride], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], | 
 |          &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], | 
 |          &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], | 
 |          &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], | 
 |          &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); | 
 |   Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], | 
 |             &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes], | 
 |             &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], | 
 |             &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], | 
 |             &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 7 | 
 |   Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], | 
 |             &buf0[8 * kNumLanes], &in[2 * instride], &in[30 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], | 
 |             &buf0[9 * kNumLanes], &in[18 * instride], &in[14 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], | 
 |             &buf0[10 * kNumLanes], &in[10 * instride], &in[22 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], | 
 |             &buf0[11 * kNumLanes], &in[26 * instride], &in[6 * instride], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], | 
 |          &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], | 
 |          &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], | 
 |          &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], | 
 |          &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], | 
 |          &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], | 
 |          &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], | 
 |          &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], | 
 |          &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); | 
 |  | 
 |   // stage 8 & 9 | 
 |   Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], | 
 |             &buf1[16 * kNumLanes], &in[1 * instride], &in[31 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], | 
 |             &buf1[17 * kNumLanes], &in[17 * instride], &in[15 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], | 
 |             &buf1[18 * kNumLanes], &in[9 * instride], &in[23 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], | 
 |             &buf1[19 * kNumLanes], &in[25 * instride], &in[7 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], | 
 |             &buf1[20 * kNumLanes], &in[5 * instride], &in[27 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], | 
 |             &buf1[21 * kNumLanes], &in[21 * instride], &in[11 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], | 
 |             &buf1[22 * kNumLanes], &in[13 * instride], &in[19 * instride], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], | 
 |             &buf1[23 * kNumLanes], &in[29 * instride], &in[3 * instride], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 9 was fused with prior stages. | 
 | } | 
 |  | 
 | template <size_t InStride, size_t OutStride, typename D> | 
 | HWY_ATTR HWY_NOINLINE void Fdct64(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                   const int8_t cos_bit) { | 
 |   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |   constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf0[64 * kNumLanes]; | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf1[64 * kNumLanes]; | 
 |   const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |   const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); | 
 |  | 
 |   // stage 1 | 
 | #if HWY_TARGET == HWY_SSE4 | 
 |   // For whatever reason, some compilers don't unroll this when building for | 
 |   // SSE4; help them along. | 
 |   HWY_UNROLL(32) | 
 | #endif | 
 |   for (size_t i = 0; i < 32; ++i) { | 
 |     AddSub(int_tag, &in[i * InStride], &in[(63 - i) * InStride], | 
 |            &buf0[i * kNumLanes], &buf0[(63 - i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 2 | 
 |   for (size_t i = 0; i < 16; ++i) { | 
 |     AddSub(int_tag, &buf0[i * kNumLanes], &buf0[(31 - i) * kNumLanes], | 
 |            &buf1[i * kNumLanes], &buf1[(31 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(40 + i) * kNumLanes], | 
 |               &buf0[(55 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], | 
 |               &buf1[(55 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |  | 
 |   // stage 3 | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     AddSub(int_tag, &buf1[i * kNumLanes], &buf1[(15 - i) * kNumLanes], | 
 |            &buf0[i * kNumLanes], &buf0[(15 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     Butterfly(int_tag, -cospi[32], cospi[32], &buf1[(20 + i) * kNumLanes], | 
 |               &buf1[(27 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], | 
 |               &buf0[(27 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(47 - i) * kNumLanes], | 
 |            &buf0[(32 + i) * kNumLanes], &buf0[(47 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(48 + i) * kNumLanes], | 
 |            &buf0[(63 - i) * kNumLanes], &buf0[(48 + i) * kNumLanes]); | 
 |   } | 
 |   // stage 4 | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[(0 + i) * kNumLanes], &buf0[(7 - i) * kNumLanes], | 
 |            &buf1[(0 + i) * kNumLanes], &buf1[(7 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     Butterfly(int_tag, -cospi[32], cospi[32], &buf0[(10 + i) * kNumLanes], | 
 |               &buf0[(13 - i) * kNumLanes], &buf1[(10 + i) * kNumLanes], | 
 |               &buf1[(13 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(23 - i) * kNumLanes], | 
 |            &buf1[(16 + i) * kNumLanes], &buf1[(23 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(24 + i) * kNumLanes], | 
 |            &buf1[(31 - i) * kNumLanes], &buf1[(24 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     Butterfly(int_tag, -cospi[16], cospi[48], &buf0[(36 + i) * kNumLanes], | 
 |               &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], | 
 |               &buf1[(59 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 4; i < 8; ++i) { | 
 |     Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[(36 + i) * kNumLanes], | 
 |               &buf0[(59 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], | 
 |               &buf1[(59 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   // stage 5 | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(3 - i) * kNumLanes], | 
 |            &buf0[(0 + i) * kNumLanes], &buf0[(3 - i) * kNumLanes]); | 
 |   } | 
 |   Butterfly(int_tag, -cospi[32], cospi[32], &buf1[5 * kNumLanes], | 
 |             &buf1[6 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], | 
 |             cos_bit, round); | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(8 + i) * kNumLanes], &buf1[(11 - i) * kNumLanes], | 
 |            &buf0[(8 + i) * kNumLanes], &buf0[(11 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(15 - i) * kNumLanes], &buf1[(12 + i) * kNumLanes], | 
 |            &buf0[(15 - i) * kNumLanes], &buf0[(12 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     Butterfly(int_tag, -cospi[16], cospi[48], &buf1[(18 + i) * kNumLanes], | 
 |               &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], | 
 |               &buf0[(29 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 2; i < 4; ++i) { | 
 |     Butterfly(int_tag, -cospi[48], -cospi[16], &buf1[(18 + i) * kNumLanes], | 
 |               &buf1[(29 - i) * kNumLanes], &buf0[(18 + i) * kNumLanes], | 
 |               &buf0[(29 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(39 - i) * kNumLanes], | 
 |            &buf0[(32 + i) * kNumLanes], &buf0[(39 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(40 + i) * kNumLanes], | 
 |            &buf0[(47 - i) * kNumLanes], &buf0[(40 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(55 - i) * kNumLanes], | 
 |            &buf0[(48 + i) * kNumLanes], &buf0[(55 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(56 + i) * kNumLanes], | 
 |            &buf0[(63 - i) * kNumLanes], &buf0[(56 + i) * kNumLanes]); | 
 |   } | 
 |   // stage 6 | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[0 * kNumLanes], | 
 |             &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], &buf1[1 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[16], cospi[48], &buf0[3 * kNumLanes], | 
 |             &buf0[2 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf1[4 * kNumLanes], &buf0[5 * kNumLanes], | 
 |          &buf1[4 * kNumLanes], &buf1[5 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[7 * kNumLanes], &buf0[6 * kNumLanes], | 
 |          &buf1[7 * kNumLanes], &buf1[6 * kNumLanes]); | 
 |   Butterfly(int_tag, -cospi[16], cospi[48], &buf0[9 * kNumLanes], | 
 |             &buf0[14 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[48], -cospi[16], &buf0[10 * kNumLanes], | 
 |             &buf0[13 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], | 
 |             cos_bit, round); | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[(16 + i) * kNumLanes], &buf0[(19 - i) * kNumLanes], | 
 |            &buf1[(16 + i) * kNumLanes], &buf1[(19 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[(23 - i) * kNumLanes], &buf0[(20 + i) * kNumLanes], | 
 |            &buf1[(23 - i) * kNumLanes], &buf1[(20 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[(24 + i) * kNumLanes], &buf0[(27 - i) * kNumLanes], | 
 |            &buf1[(24 + i) * kNumLanes], &buf1[(27 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf1[(31 - i) * kNumLanes], &buf0[(28 + i) * kNumLanes], | 
 |            &buf1[(31 - i) * kNumLanes], &buf1[(28 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     Butterfly(int_tag, -cospi[8], cospi[56], &buf0[(34 + i) * kNumLanes], | 
 |               &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], | 
 |               &buf1[(61 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 2; i < 4; ++i) { | 
 |     Butterfly(int_tag, -cospi[56], -cospi[8], &buf0[(34 + i) * kNumLanes], | 
 |               &buf0[(61 - i) * kNumLanes], &buf1[(34 + i) * kNumLanes], | 
 |               &buf1[(61 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     Butterfly(int_tag, -cospi[40], cospi[24], &buf0[(42 + i) * kNumLanes], | 
 |               &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], | 
 |               &buf1[(53 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   for (size_t i = 2; i < 4; ++i) { | 
 |     Butterfly(int_tag, -cospi[24], -cospi[40], &buf0[(42 + i) * kNumLanes], | 
 |               &buf0[(53 - i) * kNumLanes], &buf1[(42 + i) * kNumLanes], | 
 |               &buf1[(53 - i) * kNumLanes], cos_bit, round); | 
 |   } | 
 |   // stage 7 | 
 |   Butterfly(int_tag, cospi[8], cospi[56], &buf1[7 * kNumLanes], | 
 |             &buf1[4 * kNumLanes], &buf0[4 * kNumLanes], &buf0[7 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[40], cospi[24], &buf1[6 * kNumLanes], | 
 |             &buf1[5 * kNumLanes], &buf0[5 * kNumLanes], &buf0[6 * kNumLanes], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf0[8 * kNumLanes], &buf1[9 * kNumLanes], | 
 |          &buf0[8 * kNumLanes], &buf0[9 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], | 
 |          &buf0[11 * kNumLanes], &buf0[10 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[12 * kNumLanes], &buf1[13 * kNumLanes], | 
 |          &buf0[12 * kNumLanes], &buf0[13 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], | 
 |          &buf0[15 * kNumLanes], &buf0[14 * kNumLanes]); | 
 |   Butterfly(int_tag, -cospi[8], cospi[56], &buf1[17 * kNumLanes], | 
 |             &buf1[30 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[56], -cospi[8], &buf1[18 * kNumLanes], | 
 |             &buf1[29 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[40], cospi[24], &buf1[21 * kNumLanes], | 
 |             &buf1[26 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[24], -cospi[40], &buf1[22 * kNumLanes], | 
 |             &buf1[25 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], | 
 |             cos_bit, round); | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(32 + i) * kNumLanes], &buf1[(35 - i) * kNumLanes], | 
 |            &buf0[(32 + i) * kNumLanes], &buf0[(35 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(39 - i) * kNumLanes], &buf1[(36 + i) * kNumLanes], | 
 |            &buf0[(39 - i) * kNumLanes], &buf0[(36 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(40 + i) * kNumLanes], &buf1[(43 - i) * kNumLanes], | 
 |            &buf0[(40 + i) * kNumLanes], &buf0[(43 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(47 - i) * kNumLanes], &buf1[(44 + i) * kNumLanes], | 
 |            &buf0[(47 - i) * kNumLanes], &buf0[(44 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(48 + i) * kNumLanes], &buf1[(51 - i) * kNumLanes], | 
 |            &buf0[(48 + i) * kNumLanes], &buf0[(51 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(55 - i) * kNumLanes], &buf1[(52 + i) * kNumLanes], | 
 |            &buf0[(55 - i) * kNumLanes], &buf0[(52 + i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(56 + i) * kNumLanes], &buf1[(59 - i) * kNumLanes], | 
 |            &buf0[(56 + i) * kNumLanes], &buf0[(59 - i) * kNumLanes]); | 
 |   } | 
 |   for (size_t i = 0; i < 2; ++i) { | 
 |     AddSub(int_tag, &buf0[(63 - i) * kNumLanes], &buf1[(60 + i) * kNumLanes], | 
 |            &buf0[(63 - i) * kNumLanes], &buf0[(60 + i) * kNumLanes]); | 
 |   } | 
 |   // stage 8 | 
 |   Butterfly(int_tag, cospi[4], cospi[60], &buf0[15 * kNumLanes], | 
 |             &buf0[8 * kNumLanes], &buf1[8 * kNumLanes], &buf1[15 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[36], cospi[28], &buf0[14 * kNumLanes], | 
 |             &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], &buf1[14 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[20], cospi[44], &buf0[13 * kNumLanes], | 
 |             &buf0[10 * kNumLanes], &buf1[10 * kNumLanes], &buf1[13 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[52], cospi[12], &buf0[12 * kNumLanes], | 
 |             &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], &buf1[12 * kNumLanes], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf1[16 * kNumLanes], &buf0[17 * kNumLanes], | 
 |          &buf1[16 * kNumLanes], &buf1[17 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[19 * kNumLanes], &buf0[18 * kNumLanes], | 
 |          &buf1[19 * kNumLanes], &buf1[18 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[20 * kNumLanes], &buf0[21 * kNumLanes], | 
 |          &buf1[20 * kNumLanes], &buf1[21 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[23 * kNumLanes], &buf0[22 * kNumLanes], | 
 |          &buf1[23 * kNumLanes], &buf1[22 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[24 * kNumLanes], &buf0[25 * kNumLanes], | 
 |          &buf1[24 * kNumLanes], &buf1[25 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[27 * kNumLanes], &buf0[26 * kNumLanes], | 
 |          &buf1[27 * kNumLanes], &buf1[26 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[28 * kNumLanes], &buf0[29 * kNumLanes], | 
 |          &buf1[28 * kNumLanes], &buf1[29 * kNumLanes]); | 
 |   AddSub(int_tag, &buf1[31 * kNumLanes], &buf0[30 * kNumLanes], | 
 |          &buf1[31 * kNumLanes], &buf1[30 * kNumLanes]); | 
 |   Butterfly(int_tag, -cospi[4], cospi[60], &buf0[33 * kNumLanes], | 
 |             &buf0[62 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[60], -cospi[4], &buf0[34 * kNumLanes], | 
 |             &buf0[61 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[36], cospi[28], &buf0[37 * kNumLanes], | 
 |             &buf0[58 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[28], -cospi[36], &buf0[38 * kNumLanes], | 
 |             &buf0[57 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[20], cospi[44], &buf0[41 * kNumLanes], | 
 |             &buf0[54 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[44], -cospi[20], &buf0[42 * kNumLanes], | 
 |             &buf0[53 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[52], cospi[12], &buf0[45 * kNumLanes], | 
 |             &buf0[50 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, -cospi[12], -cospi[52], &buf0[46 * kNumLanes], | 
 |             &buf0[49 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], | 
 |             cos_bit, round); | 
 |   // stage 9 | 
 |   Butterfly(int_tag, cospi[2], cospi[62], &buf1[31 * kNumLanes], | 
 |             &buf1[16 * kNumLanes], &buf0[16 * kNumLanes], &buf0[31 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[34], cospi[30], &buf1[30 * kNumLanes], | 
 |             &buf1[17 * kNumLanes], &buf0[17 * kNumLanes], &buf0[30 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[18], cospi[46], &buf1[29 * kNumLanes], | 
 |             &buf1[18 * kNumLanes], &buf0[18 * kNumLanes], &buf0[29 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[50], cospi[14], &buf1[28 * kNumLanes], | 
 |             &buf1[19 * kNumLanes], &buf0[19 * kNumLanes], &buf0[28 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[10], cospi[54], &buf1[27 * kNumLanes], | 
 |             &buf1[20 * kNumLanes], &buf0[20 * kNumLanes], &buf0[27 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[42], cospi[22], &buf1[26 * kNumLanes], | 
 |             &buf1[21 * kNumLanes], &buf0[21 * kNumLanes], &buf0[26 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[26], cospi[38], &buf1[25 * kNumLanes], | 
 |             &buf1[22 * kNumLanes], &buf0[22 * kNumLanes], &buf0[25 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[58], cospi[6], &buf1[24 * kNumLanes], | 
 |             &buf1[23 * kNumLanes], &buf0[23 * kNumLanes], &buf0[24 * kNumLanes], | 
 |             cos_bit, round); | 
 |   AddSub(int_tag, &buf0[32 * kNumLanes], &buf1[33 * kNumLanes], | 
 |          &buf0[32 * kNumLanes], &buf0[33 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[35 * kNumLanes], &buf1[34 * kNumLanes], | 
 |          &buf0[35 * kNumLanes], &buf0[34 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[36 * kNumLanes], &buf1[37 * kNumLanes], | 
 |          &buf0[36 * kNumLanes], &buf0[37 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[39 * kNumLanes], &buf1[38 * kNumLanes], | 
 |          &buf0[39 * kNumLanes], &buf0[38 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[40 * kNumLanes], &buf1[41 * kNumLanes], | 
 |          &buf0[40 * kNumLanes], &buf0[41 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[43 * kNumLanes], &buf1[42 * kNumLanes], | 
 |          &buf0[43 * kNumLanes], &buf0[42 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[44 * kNumLanes], &buf1[45 * kNumLanes], | 
 |          &buf0[44 * kNumLanes], &buf0[45 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[47 * kNumLanes], &buf1[46 * kNumLanes], | 
 |          &buf0[47 * kNumLanes], &buf0[46 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[48 * kNumLanes], &buf1[49 * kNumLanes], | 
 |          &buf0[48 * kNumLanes], &buf0[49 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[51 * kNumLanes], &buf1[50 * kNumLanes], | 
 |          &buf0[51 * kNumLanes], &buf0[50 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[52 * kNumLanes], &buf1[53 * kNumLanes], | 
 |          &buf0[52 * kNumLanes], &buf0[53 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[55 * kNumLanes], &buf1[54 * kNumLanes], | 
 |          &buf0[55 * kNumLanes], &buf0[54 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[56 * kNumLanes], &buf1[57 * kNumLanes], | 
 |          &buf0[56 * kNumLanes], &buf0[57 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[59 * kNumLanes], &buf1[58 * kNumLanes], | 
 |          &buf0[59 * kNumLanes], &buf0[58 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[60 * kNumLanes], &buf1[61 * kNumLanes], | 
 |          &buf0[60 * kNumLanes], &buf0[61 * kNumLanes]); | 
 |   AddSub(int_tag, &buf0[63 * kNumLanes], &buf1[62 * kNumLanes], | 
 |          &buf0[63 * kNumLanes], &buf0[62 * kNumLanes]); | 
 |   // stage 10 | 
 |   Butterfly(int_tag, cospi[1], cospi[63], &buf0[63 * kNumLanes], | 
 |             &buf0[32 * kNumLanes], &buf1[32 * kNumLanes], &buf1[63 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[33], cospi[31], &buf0[62 * kNumLanes], | 
 |             &buf0[33 * kNumLanes], &buf1[33 * kNumLanes], &buf1[62 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[17], cospi[47], &buf0[61 * kNumLanes], | 
 |             &buf0[34 * kNumLanes], &buf1[34 * kNumLanes], &buf1[61 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[49], cospi[15], &buf0[60 * kNumLanes], | 
 |             &buf0[35 * kNumLanes], &buf1[35 * kNumLanes], &buf1[60 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[9], cospi[55], &buf0[59 * kNumLanes], | 
 |             &buf0[36 * kNumLanes], &buf1[36 * kNumLanes], &buf1[59 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[41], cospi[23], &buf0[58 * kNumLanes], | 
 |             &buf0[37 * kNumLanes], &buf1[37 * kNumLanes], &buf1[58 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[25], cospi[39], &buf0[57 * kNumLanes], | 
 |             &buf0[38 * kNumLanes], &buf1[38 * kNumLanes], &buf1[57 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[57], cospi[7], &buf0[56 * kNumLanes], | 
 |             &buf0[39 * kNumLanes], &buf1[39 * kNumLanes], &buf1[56 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[05], cospi[59], &buf0[55 * kNumLanes], | 
 |             &buf0[40 * kNumLanes], &buf1[40 * kNumLanes], &buf1[55 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[37], cospi[27], &buf0[54 * kNumLanes], | 
 |             &buf0[41 * kNumLanes], &buf1[41 * kNumLanes], &buf1[54 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[21], cospi[43], &buf0[53 * kNumLanes], | 
 |             &buf0[42 * kNumLanes], &buf1[42 * kNumLanes], &buf1[53 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[53], cospi[11], &buf0[52 * kNumLanes], | 
 |             &buf0[43 * kNumLanes], &buf1[43 * kNumLanes], &buf1[52 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[13], cospi[51], &buf0[51 * kNumLanes], | 
 |             &buf0[44 * kNumLanes], &buf1[44 * kNumLanes], &buf1[51 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[45], cospi[19], &buf0[50 * kNumLanes], | 
 |             &buf0[45 * kNumLanes], &buf1[45 * kNumLanes], &buf1[50 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[29], cospi[35], &buf0[49 * kNumLanes], | 
 |             &buf0[46 * kNumLanes], &buf1[46 * kNumLanes], &buf1[49 * kNumLanes], | 
 |             cos_bit, round); | 
 |   Butterfly(int_tag, cospi[61], cospi[3], &buf0[48 * kNumLanes], | 
 |             &buf0[47 * kNumLanes], &buf1[47 * kNumLanes], &buf1[48 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 11 | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[0 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[63 * kNumLanes], &in[63 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[32 * kNumLanes], &in[1 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[31 * kNumLanes], &in[62 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[16 * kNumLanes], &in[2 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[47 * kNumLanes], &in[61 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[48 * kNumLanes], &in[3 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[60 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[4 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[55 * kNumLanes], &in[59 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[40 * kNumLanes], &in[5 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[23 * kNumLanes], &in[58 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[24 * kNumLanes], &in[6 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[39 * kNumLanes], &in[57 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[56 * kNumLanes], &in[7 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[56 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[4 * kNumLanes], &in[8 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[59 * kNumLanes], &in[55 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[36 * kNumLanes], &in[9 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[27 * kNumLanes], &in[54 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[20 * kNumLanes], &in[10 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[43 * kNumLanes], &in[53 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[52 * kNumLanes], &in[11 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[52 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[12 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[51 * kNumLanes], &in[51 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[44 * kNumLanes], &in[13 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[19 * kNumLanes], &in[50 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[28 * kNumLanes], &in[14 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[35 * kNumLanes], &in[49 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[60 * kNumLanes], &in[15 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[48 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[16 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[61 * kNumLanes], &in[47 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[34 * kNumLanes], &in[17 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[29 * kNumLanes], &in[46 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[18 * kNumLanes], &in[18 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[45 * kNumLanes], &in[45 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[50 * kNumLanes], &in[19 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[44 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[20 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[53 * kNumLanes], &in[43 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[42 * kNumLanes], &in[21 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[21 * kNumLanes], &in[42 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[26 * kNumLanes], &in[22 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[37 * kNumLanes], &in[41 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[58 * kNumLanes], &in[23 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[40 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[6 * kNumLanes], &in[24 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[57 * kNumLanes], &in[39 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[38 * kNumLanes], &in[25 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[25 * kNumLanes], &in[38 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[22 * kNumLanes], &in[26 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[41 * kNumLanes], &in[37 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[54 * kNumLanes], &in[27 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[36 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[28 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[49 * kNumLanes], &in[35 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[46 * kNumLanes], &in[29 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[17 * kNumLanes], &in[34 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf0[30 * kNumLanes], &in[30 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[33 * kNumLanes], &in[33 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[62 * kNumLanes], &in[31 * OutStride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[32 * OutStride]); | 
 | } | 
 |  | 
 | template <size_t LaneSize, size_t NumLanes> | 
 | struct Fadst4Traits { | 
 |   template <size_t Width, typename D> | 
 |   HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, | 
 |                                          hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                          const int8_t cos_bit, | 
 |                                          const size_t instride) { | 
 |     const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); | 
 |     const auto round = hn::Set(int_tag, 1 << (cos_bit - 1)); | 
 |     const auto sinpi1 = hn::Set(int_tag, sinpi[1]); | 
 |     const auto sinpi2 = hn::Set(int_tag, sinpi[2]); | 
 |     const auto sinpi3 = hn::Set(int_tag, sinpi[3]); | 
 |     const auto sinpi4 = hn::Set(int_tag, sinpi[4]); | 
 |     const auto in0 = hn::Load(int_tag, &in[0 * instride]); | 
 |     const auto in1 = hn::Load(int_tag, &in[1 * instride]); | 
 |     const auto in2 = hn::Load(int_tag, &in[2 * instride]); | 
 |     const auto in3 = hn::Load(int_tag, &in[3 * instride]); | 
 |     auto s0 = hn::Mul(in0, sinpi1); | 
 |     auto s1 = hn::Mul(in0, sinpi4); | 
 |     auto s2 = hn::Mul(in1, sinpi2); | 
 |     auto s3 = hn::Mul(in1, sinpi1); | 
 |     auto s4 = hn::Mul(in2, sinpi3); | 
 |     auto s5 = hn::Mul(in3, sinpi4); | 
 |     auto s6 = hn::Mul(in3, sinpi2); | 
 |     auto s7 = hn::Sub(hn::Add(in0, in1), in3); | 
 |     auto x0 = hn::Add(hn::Add(s0, s2), s5); | 
 |     auto x1 = hn::Mul(s7, sinpi3); | 
 |     auto x2 = hn::Add(hn::Sub(s1, s3), s6); | 
 |     auto x3 = s4; | 
 |     s0 = hn::Add(x0, x3); | 
 |     s1 = x1; | 
 |     s2 = hn::Sub(x2, x3); | 
 |     s3 = hn::Add(hn::Sub(x2, x0), x3); | 
 |     auto u0 = hn::Add(s0, round); | 
 |     u0 = hn::ShiftRightSame(u0, cos_bit); | 
 |     auto u1 = hn::Add(s1, round); | 
 |     u1 = hn::ShiftRightSame(u1, cos_bit); | 
 |     auto u2 = hn::Add(s2, round); | 
 |     u2 = hn::ShiftRightSame(u2, cos_bit); | 
 |     auto u3 = hn::Add(s3, round); | 
 |     u3 = hn::ShiftRightSame(u3, cos_bit); | 
 |     hn::Store(u0, int_tag, &in[0 * instride]); | 
 |     hn::Store(u1, int_tag, &in[1 * instride]); | 
 |     hn::Store(u2, int_tag, &in[2 * instride]); | 
 |     hn::Store(u3, int_tag, &in[3 * instride]); | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct Fadst4Traits<2, 4> { | 
 |   template <size_t Width, typename D> | 
 |   HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, | 
 |                                          hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                          const int8_t cos_bit, | 
 |                                          const size_t instride) { | 
 |     (void)int_tag; | 
 |     const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); | 
 |     constexpr hn::FixedTag<hn::TFromD<D>, 8> demote_tag; | 
 |     constexpr hn::RepartitionToWide<decltype(demote_tag)> int32_tag; | 
 |     const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); | 
 |     const auto sinpi_p01_p02 = SetPair(demote_tag, sinpi[1], sinpi[2]); | 
 |     const auto sinpi_p04_m01 = SetPair(demote_tag, sinpi[4], -sinpi[1]); | 
 |     const auto sinpi_p03_p04 = SetPair(demote_tag, sinpi[3], sinpi[4]); | 
 |     const auto sinpi_m03_p02 = SetPair(demote_tag, -sinpi[3], sinpi[2]); | 
 |     const auto sinpi_p03_p03 = hn::Set(demote_tag, sinpi[3]); | 
 |     const auto in0 = hn::Load(demote_tag, &in[0 * instride]); | 
 |     const auto in1 = hn::Load(demote_tag, &in[1 * instride]); | 
 |     const auto in2 = hn::Load(demote_tag, &in[2 * instride]); | 
 |     const auto in3 = hn::Load(demote_tag, &in[3 * instride]); | 
 |     const auto in7 = hn::Add(in0, in1); | 
 |     auto u0 = hn::InterleaveLower(in0, in1); | 
 |     auto u1 = hn::InterleaveLower(in2, in3); | 
 |     auto u2 = hn::InterleaveLower(in7, hn::Zero(demote_tag)); | 
 |     auto u3 = hn::InterleaveLower(in2, hn::Zero(demote_tag)); | 
 |     auto u4 = hn::InterleaveLower(in3, hn::Zero(demote_tag)); | 
 |     auto v0 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p01_p02);  // s0 + s2 | 
 |     auto v1 = hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_p03_p04);  // s4 + s5 | 
 |     auto v2 = hn::WidenMulPairwiseAdd(int32_tag, u2, sinpi_p03_p03);  // x1 | 
 |     auto v3 = hn::WidenMulPairwiseAdd(int32_tag, u0, sinpi_p04_m01);  // s1 - s3 | 
 |     auto v4 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, u1, sinpi_m03_p02);  // -s4 + s6 | 
 |     auto v5 = hn::WidenMulPairwiseAdd(int32_tag, u3, sinpi_p03_p03);  // s4 | 
 |     auto v6 = hn::WidenMulPairwiseAdd(int32_tag, u4, sinpi_p03_p03); | 
 |     auto w0 = hn::Add(v0, v1); | 
 |     auto w1 = hn::Sub(v2, v6); | 
 |     auto w2 = hn::Add(v3, v4); | 
 |     auto w3 = hn::Sub(w2, w0); | 
 |     auto w4 = hn::ShiftLeft<2>(v5); | 
 |     auto w5 = hn::Sub(w4, v5); | 
 |     auto w6 = hn::Add(w3, w5); | 
 |     v0 = hn::Add(w0, round); | 
 |     v1 = hn::Add(w1, round); | 
 |     v2 = hn::Add(w2, round); | 
 |     v3 = hn::Add(w6, round); | 
 |     w0 = hn::ShiftRightSame(v0, cos_bit); | 
 |     w1 = hn::ShiftRightSame(v1, cos_bit); | 
 |     w2 = hn::ShiftRightSame(v2, cos_bit); | 
 |     w3 = hn::ShiftRightSame(v3, cos_bit); | 
 |     auto o0 = hn::ReorderDemote2To(demote_tag, w0, w2); | 
 |     auto o1 = hn::ReorderDemote2To(demote_tag, w1, w3); | 
 |     hn::Store(o0, demote_tag, &in[0 * instride]); | 
 |     hn::Store(o1, demote_tag, &in[1 * instride]); | 
 |     hn::Store(hn::ShiftRightLanes<4>(demote_tag, o0), demote_tag, | 
 |               &in[2 * instride]); | 
 |     hn::Store(hn::ShiftRightLanes<4>(demote_tag, o1), demote_tag, | 
 |               &in[3 * instride]); | 
 |   } | 
 | }; | 
 |  | 
 | template <size_t NumLanes> | 
 | struct Fadst4Traits<2, NumLanes> { | 
 |   template <size_t Width, typename D> | 
 |   HWY_ATTR HWY_INLINE static void Fadst4(D int_tag, | 
 |                                          hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                          const int8_t cos_bit, | 
 |                                          const size_t instride) { | 
 |     const int32_t *HWY_RESTRICT const sinpi = sinpi_arr(cos_bit); | 
 |     constexpr hn::RepartitionToWide<D> int32_tag; | 
 |     const auto round = hn::Set(int32_tag, 1 << (cos_bit - 1)); | 
 |     const auto sinpi_p01_p02 = SetPair(int_tag, sinpi[1], sinpi[2]); | 
 |     const auto sinpi_p04_m01 = SetPair(int_tag, sinpi[4], -sinpi[1]); | 
 |     const auto sinpi_p03_p04 = SetPair(int_tag, sinpi[3], sinpi[4]); | 
 |     const auto sinpi_m03_p02 = SetPair(int_tag, -sinpi[3], sinpi[2]); | 
 |     const auto sinpi_p03_p03 = hn::Set(int_tag, sinpi[3]); | 
 |     const auto in0 = hn::Load(int_tag, &in[0 * instride]); | 
 |     const auto in1 = hn::Load(int_tag, &in[1 * instride]); | 
 |     const auto in2 = hn::Load(int_tag, &in[2 * instride]); | 
 |     const auto in3 = hn::Load(int_tag, &in[3 * instride]); | 
 |     const auto in7 = hn::Add(in0, in1); | 
 |     auto ul0 = hn::InterleaveLower(int_tag, in0, in1); | 
 |     auto uh0 = hn::InterleaveUpper(int_tag, in0, in1); | 
 |     auto ul1 = hn::InterleaveLower(int_tag, in2, in3); | 
 |     auto uh1 = hn::InterleaveUpper(int_tag, in2, in3); | 
 |     auto ul2 = hn::InterleaveLower(int_tag, in7, hn::Zero(int_tag)); | 
 |     auto uh2 = hn::InterleaveUpper(int_tag, in7, hn::Zero(int_tag)); | 
 |     auto ul3 = hn::InterleaveLower(int_tag, in2, hn::Zero(int_tag)); | 
 |     auto uh3 = hn::InterleaveUpper(int_tag, in2, hn::Zero(int_tag)); | 
 |     auto ul4 = hn::InterleaveLower(int_tag, in3, hn::Zero(int_tag)); | 
 |     auto uh4 = hn::InterleaveUpper(int_tag, in3, hn::Zero(int_tag)); | 
 |     auto vl0 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p01_p02);  // s0 + s2 | 
 |     auto vh0 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p01_p02);  // s0 + s2 | 
 |     auto vl1 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_p03_p04);  // s4 + s5 | 
 |     auto vh1 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_p03_p04);  // s4 + s5 | 
 |     auto vl2 = hn::WidenMulPairwiseAdd(int32_tag, ul2, sinpi_p03_p03);  // x1 | 
 |     auto vh2 = hn::WidenMulPairwiseAdd(int32_tag, uh2, sinpi_p03_p03);  // x1 | 
 |     auto vl3 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, ul0, sinpi_p04_m01);  // s1 - s3 | 
 |     auto vh3 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, uh0, sinpi_p04_m01);  // s1 - s3 | 
 |     auto vl4 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, ul1, sinpi_m03_p02);  // -s4 + s6 | 
 |     auto vh4 = | 
 |         hn::WidenMulPairwiseAdd(int32_tag, uh1, sinpi_m03_p02);  // -s4 + s6 | 
 |     auto vl5 = hn::WidenMulPairwiseAdd(int32_tag, ul3, sinpi_p03_p03);  // s4 | 
 |     auto vh5 = hn::WidenMulPairwiseAdd(int32_tag, uh3, sinpi_p03_p03);  // s4 | 
 |     auto vl6 = hn::WidenMulPairwiseAdd(int32_tag, ul4, sinpi_p03_p03); | 
 |     auto vh6 = hn::WidenMulPairwiseAdd(int32_tag, uh4, sinpi_p03_p03); | 
 |     auto wl0 = hn::Add(vl0, vl1); | 
 |     auto wh0 = hn::Add(vh0, vh1); | 
 |     auto wl1 = hn::Sub(vl2, vl6); | 
 |     auto wh1 = hn::Sub(vh2, vh6); | 
 |     auto wl2 = hn::Add(vl3, vl4); | 
 |     auto wh2 = hn::Add(vh3, vh4); | 
 |     auto wl3 = hn::Sub(wl2, wl0); | 
 |     auto wh3 = hn::Sub(wh2, wh0); | 
 |     auto wl4 = hn::ShiftLeft<2>(vl5); | 
 |     auto wh4 = hn::ShiftLeft<2>(vh5); | 
 |     auto wl5 = hn::Sub(wl4, vl5); | 
 |     auto wh5 = hn::Sub(wh4, vh5); | 
 |     auto wl6 = hn::Add(wl3, wl5); | 
 |     auto wh6 = hn::Add(wh3, wh5); | 
 |     vl0 = hn::Add(wl0, round); | 
 |     vh0 = hn::Add(wh0, round); | 
 |     vl1 = hn::Add(wl1, round); | 
 |     vh1 = hn::Add(wh1, round); | 
 |     vl2 = hn::Add(wl2, round); | 
 |     vh2 = hn::Add(wh2, round); | 
 |     vl3 = hn::Add(wl6, round); | 
 |     vh3 = hn::Add(wh6, round); | 
 |     wl0 = hn::ShiftRightSame(vl0, cos_bit); | 
 |     wh0 = hn::ShiftRightSame(vh0, cos_bit); | 
 |     wl1 = hn::ShiftRightSame(vl1, cos_bit); | 
 |     wh1 = hn::ShiftRightSame(vh1, cos_bit); | 
 |     wl2 = hn::ShiftRightSame(vl2, cos_bit); | 
 |     wh2 = hn::ShiftRightSame(vh2, cos_bit); | 
 |     wl3 = hn::ShiftRightSame(vl3, cos_bit); | 
 |     wh3 = hn::ShiftRightSame(vh3, cos_bit); | 
 |     auto o0 = hn::ReorderDemote2To(int_tag, wl0, wh0); | 
 |     auto o1 = hn::ReorderDemote2To(int_tag, wl1, wh1); | 
 |     auto o2 = hn::ReorderDemote2To(int_tag, wl2, wh2); | 
 |     auto o3 = hn::ReorderDemote2To(int_tag, wl3, wh3); | 
 |     hn::Store(o0, int_tag, &in[0 * instride]); | 
 |     hn::Store(o1, int_tag, &in[1 * instride]); | 
 |     hn::Store(o2, int_tag, &in[2 * instride]); | 
 |     hn::Store(o3, int_tag, &in[3 * instride]); | 
 |   } | 
 | }; | 
 |  | 
 | template <size_t Width, typename D> | 
 | HWY_ATTR HWY_INLINE void Fadst4(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                 const int8_t cos_bit, const size_t instride) { | 
 |   Fadst4Traits<sizeof(hn::TFromD<D>), | 
 |                hn::MaxLanes(int_tag)>::template Fadst4<Width>(int_tag, in, | 
 |                                                               cos_bit, | 
 |                                                               instride); | 
 | } | 
 |  | 
 | template <size_t Width, typename D> | 
 | HWY_ATTR HWY_INLINE void Fadst8(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                 const int8_t cos_bit, const size_t instride) { | 
 |   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |   constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf0[8 * kNumLanes]; | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf1[8 * kNumLanes]; | 
 |   const int32_t *HWY_RESTRICT cospi = cospi_arr(cos_bit); | 
 |   const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); | 
 |  | 
 |   // stage 0 | 
 |   // stage 1 | 
 |   hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag, | 
 |             &buf0[0 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag, | 
 |             &buf0[1 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag, | 
 |             &buf0[2 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag, | 
 |             &buf0[3 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag, | 
 |             &buf0[4 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag, | 
 |             &buf0[5 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag, | 
 |             &buf0[6 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag, | 
 |             &buf0[7 * kNumLanes]); | 
 |  | 
 |   // stage 2 | 
 |   hwy::CopyBytes<2 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes], | 
 |             &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], | 
 |             cos_bit, round); | 
 |   hwy::CopyBytes<2 * kNumBytes>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]); | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes], | 
 |             &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 3 | 
 |   for (size_t j = 0; j < 8; j += 4) { | 
 |     for (size_t i = 0; i < 2; ++i) { | 
 |       AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], | 
 |              &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], | 
 |              &buf0[(2 + i + j) * kNumLanes]); | 
 |     } | 
 |   } | 
 |  | 
 |   // stage 4 | 
 |   hwy::CopyBytes<4 * kNumBytes>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); | 
 |   HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); | 
 |  | 
 |   // stage 5 | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(4 + i) * kNumLanes], | 
 |            &buf0[(0 + i) * kNumLanes], &buf0[(4 + i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 6 | 
 |   HalfButterfly(int_tag, cospi[4], cospi[60], &buf0[0 * kNumLanes], | 
 |                 &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[60], -cospi[4], &buf0[0 * kNumLanes], | 
 |                 &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[20], cospi[44], &buf0[2 * kNumLanes], | 
 |                 &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[44], -cospi[20], &buf0[2 * kNumLanes], | 
 |                 &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[36], cospi[28], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[28], -cospi[36], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[52], cospi[12], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[12], -cospi[52], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); | 
 |  | 
 |   // stage 7 | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[1 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[3 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[5 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[7 * instride]); | 
 | } | 
 |  | 
 | template <size_t Width, typename D> | 
 | HWY_ATTR HWY_INLINE void Fadst16(D int_tag, hn::TFromD<D> *HWY_RESTRICT in, | 
 |                                  const int8_t cos_bit, const size_t instride) { | 
 |   constexpr size_t kNumLanes = hn::MaxLanes(int_tag); | 
 |   constexpr size_t kNumBytes = kNumLanes * sizeof(hn::TFromD<D>); | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf0[16 * kNumLanes]; | 
 |   HWY_ALIGN_MAX hn::TFromD<D> buf1[16 * kNumLanes]; | 
 |   const int32_t *HWY_RESTRICT const cospi = cospi_arr(cos_bit); | 
 |   const auto round = hn::Set(hn::Repartition<int32_t, D>(), 1 << (cos_bit - 1)); | 
 |  | 
 |   // stage 0 | 
 |   // stage 1 | 
 |   hn::Store(hn::Load(int_tag, &in[0 * instride]), int_tag, | 
 |             &buf0[0 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[15 * instride])), int_tag, | 
 |             &buf0[1 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[7 * instride])), int_tag, | 
 |             &buf0[2 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[8 * instride]), int_tag, | 
 |             &buf0[3 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[3 * instride])), int_tag, | 
 |             &buf0[4 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[12 * instride]), int_tag, | 
 |             &buf0[5 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[4 * instride]), int_tag, | 
 |             &buf0[6 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[11 * instride])), int_tag, | 
 |             &buf0[7 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[1 * instride])), int_tag, | 
 |             &buf0[8 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[14 * instride]), int_tag, | 
 |             &buf0[9 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[6 * instride]), int_tag, | 
 |             &buf0[10 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[9 * instride])), int_tag, | 
 |             &buf0[11 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[2 * instride]), int_tag, | 
 |             &buf0[12 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[13 * instride])), int_tag, | 
 |             &buf0[13 * kNumLanes]); | 
 |   hn::Store(hn::Neg(hn::Load(int_tag, &in[5 * instride])), int_tag, | 
 |             &buf0[14 * kNumLanes]); | 
 |   hn::Store(hn::Load(int_tag, &in[10 * instride]), int_tag, | 
 |             &buf0[15 * kNumLanes]); | 
 |  | 
 |   // stage 2 | 
 |   hwy::CopyBytes<kNumBytes * 2>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[2 * kNumLanes], | 
 |             &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], &buf1[3 * kNumLanes], | 
 |             cos_bit, round); | 
 |   hwy::CopyBytes<kNumBytes * 2>(&buf0[4 * kNumLanes], &buf1[4 * kNumLanes]); | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[6 * kNumLanes], | 
 |             &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], &buf1[7 * kNumLanes], | 
 |             cos_bit, round); | 
 |   hwy::CopyBytes<kNumBytes * 2>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]); | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[10 * kNumLanes], | 
 |             &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], &buf1[11 * kNumLanes], | 
 |             cos_bit, round); | 
 |   hwy::CopyBytes<kNumBytes * 2>(&buf0[12 * kNumLanes], &buf1[12 * kNumLanes]); | 
 |   Butterfly(int_tag, cospi[32], cospi[32], &buf0[14 * kNumLanes], | 
 |             &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], &buf1[15 * kNumLanes], | 
 |             cos_bit, round); | 
 |  | 
 |   // stage 3 | 
 |   for (size_t j = 0; j < 16; j += 4) { | 
 |     for (size_t i = 0; i < 2; ++i) { | 
 |       AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], | 
 |              &buf1[(2 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], | 
 |              &buf0[(2 + i + j) * kNumLanes]); | 
 |     } | 
 |   } | 
 |  | 
 |   // stage 4 | 
 |   hwy::CopyBytes<kNumBytes * 4>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); | 
 |   HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); | 
 |   hwy::CopyBytes<kNumBytes * 4>(&buf0[8 * kNumLanes], &buf1[8 * kNumLanes]); | 
 |   HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[12 * kNumLanes], | 
 |                 &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[48], -cospi[16], &buf0[12 * kNumLanes], | 
 |                 &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, -cospi[48], cospi[16], &buf0[14 * kNumLanes], | 
 |                 &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[16], cospi[48], &buf0[14 * kNumLanes], | 
 |                 &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); | 
 |  | 
 |   // stage 5 | 
 |   for (size_t j = 0; j < 16; j += 8) { | 
 |     for (size_t i = 0; i < 4; ++i) { | 
 |       AddSub(int_tag, &buf1[(0 + i + j) * kNumLanes], | 
 |              &buf1[(4 + i + j) * kNumLanes], &buf0[(0 + i + j) * kNumLanes], | 
 |              &buf0[(4 + i + j) * kNumLanes]); | 
 |     } | 
 |   } | 
 |  | 
 |   // stage 6 | 
 |   hwy::CopyBytes<kNumBytes * 8>(&buf0[0 * kNumLanes], &buf1[0 * kNumLanes]); | 
 |   HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[8 * kNumLanes], | 
 |                 &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[56], -cospi[8], &buf0[8 * kNumLanes], | 
 |                 &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[10 * kNumLanes], | 
 |                 &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[24], -cospi[40], &buf0[10 * kNumLanes], | 
 |                 &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, -cospi[56], cospi[8], &buf0[12 * kNumLanes], | 
 |                 &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[8], cospi[56], &buf0[12 * kNumLanes], | 
 |                 &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, -cospi[24], cospi[40], &buf0[14 * kNumLanes], | 
 |                 &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[40], cospi[24], &buf0[14 * kNumLanes], | 
 |                 &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); | 
 |  | 
 |   // stage 7 | 
 |   for (size_t i = 0; i < 8; ++i) { | 
 |     AddSub(int_tag, &buf1[(0 + i) * kNumLanes], &buf1[(8 + i) * kNumLanes], | 
 |            &buf0[(0 + i) * kNumLanes], &buf0[(8 + i) * kNumLanes]); | 
 |   } | 
 |  | 
 |   // stage 8 | 
 |   HalfButterfly(int_tag, cospi[2], cospi[62], &buf0[0 * kNumLanes], | 
 |                 &buf0[1 * kNumLanes], &buf1[0 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[62], -cospi[2], &buf0[0 * kNumLanes], | 
 |                 &buf0[1 * kNumLanes], &buf1[1 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[10], cospi[54], &buf0[2 * kNumLanes], | 
 |                 &buf0[3 * kNumLanes], &buf1[2 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[54], -cospi[10], &buf0[2 * kNumLanes], | 
 |                 &buf0[3 * kNumLanes], &buf1[3 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[18], cospi[46], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[4 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[46], -cospi[18], &buf0[4 * kNumLanes], | 
 |                 &buf0[5 * kNumLanes], &buf1[5 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[26], cospi[38], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[6 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[38], -cospi[26], &buf0[6 * kNumLanes], | 
 |                 &buf0[7 * kNumLanes], &buf1[7 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[34], cospi[30], &buf0[8 * kNumLanes], | 
 |                 &buf0[9 * kNumLanes], &buf1[8 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[30], -cospi[34], &buf0[8 * kNumLanes], | 
 |                 &buf0[9 * kNumLanes], &buf1[9 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[42], cospi[22], &buf0[10 * kNumLanes], | 
 |                 &buf0[11 * kNumLanes], &buf1[10 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[22], -cospi[42], &buf0[10 * kNumLanes], | 
 |                 &buf0[11 * kNumLanes], &buf1[11 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[50], cospi[14], &buf0[12 * kNumLanes], | 
 |                 &buf0[13 * kNumLanes], &buf1[12 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[14], -cospi[50], &buf0[12 * kNumLanes], | 
 |                 &buf0[13 * kNumLanes], &buf1[13 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[58], cospi[6], &buf0[14 * kNumLanes], | 
 |                 &buf0[15 * kNumLanes], &buf1[14 * kNumLanes], cos_bit, round); | 
 |   HalfButterfly(int_tag, cospi[6], -cospi[58], &buf0[14 * kNumLanes], | 
 |                 &buf0[15 * kNumLanes], &buf1[15 * kNumLanes], cos_bit, round); | 
 |  | 
 |   // stage 9 | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[1 * kNumLanes], &in[0 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[14 * kNumLanes], &in[1 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[3 * kNumLanes], &in[2 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[12 * kNumLanes], &in[3 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[5 * kNumLanes], &in[4 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[10 * kNumLanes], &in[5 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[7 * kNumLanes], &in[6 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[8 * kNumLanes], &in[7 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[9 * kNumLanes], &in[8 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[6 * kNumLanes], &in[9 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[11 * kNumLanes], &in[10 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[4 * kNumLanes], &in[11 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[13 * kNumLanes], &in[12 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[2 * kNumLanes], &in[13 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[15 * kNumLanes], &in[14 * instride]); | 
 |   hwy::CopyBytes<kNumBytes>(&buf1[0 * kNumLanes], &in[15 * instride]); | 
 | } | 
 |  | 
 | template <size_t Width, typename D> | 
 | HWY_ATTR HWY_INLINE void IdtxAdd2(D tag, hn::TFromD<D> *HWY_RESTRICT in) { | 
 |   for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) { | 
 |     auto v = hn::Load(tag, &in[x]); | 
 |     hn::Store(hn::Add(v, v), tag, &in[x]); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, int Shift, typename D> | 
 | HWY_ATTR HWY_INLINE void IdtxShift(D tag, hn::TFromD<D> *HWY_RESTRICT in) { | 
 |   for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) { | 
 |     hn::Store(hn::ShiftLeft<Shift>(hn::Load(tag, &in[x])), tag, &in[x]); | 
 |   } | 
 | } | 
 |  | 
 | template <int Scale, typename D> | 
 | HWY_ATTR HWY_INLINE void PromoteScale2x16ByNewSqrt2( | 
 |     D tag, hn::VFromD<D> v, hn::VFromD<hn::RepartitionToWide<D>> &out0, | 
 |     hn::VFromD<hn::RepartitionToWide<D>> &out1) { | 
 |   constexpr hn::RepartitionToWide<D> int32_tag; | 
 |   auto one = hn::Set(tag, 1); | 
 |   auto scale_rounding = SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1)); | 
 |   auto a0 = hn::InterleaveLower(tag, v, one); | 
 |   auto a1 = hn::InterleaveUpper(tag, v, one); | 
 |   out0 = hn::ShiftRight<NewSqrt2Bits>( | 
 |       hn::WidenMulPairwiseAdd(int32_tag, a0, scale_rounding)); | 
 |   out1 = hn::ShiftRight<NewSqrt2Bits>( | 
 |       hn::WidenMulPairwiseAdd(int32_tag, a1, scale_rounding)); | 
 | } | 
 |  | 
 | template <size_t LaneSize, size_t NumLanes> | 
 | struct ScaleByNewSqrt2Traits { | 
 |   template <int Scale, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag, | 
 |                                                            hn::VFromD<D> v) { | 
 |     auto fact = hn::Set(tag, Scale * NewSqrt2); | 
 |     auto offset = hn::Set(tag, 1 << (NewSqrt2Bits - 1)); | 
 |     return hn::ShiftRight<NewSqrt2Bits>(hn::MulAdd(v, fact, offset)); | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct ScaleByNewSqrt2Traits<2, 4> { | 
 |   template <int Scale, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag, | 
 |                                                            hn::VFromD<D> v) { | 
 |     auto one = hn::Set(tag, 1); | 
 |     auto scale_rounding = | 
 |         SetPair(tag, Scale * NewSqrt2, 1 << (NewSqrt2Bits - 1)); | 
 |     constexpr hn::Rebind<int32_t, D> int32_tag; | 
 |     auto a = hn::InterleaveLower(tag, v, one); | 
 |     auto b = hn::ShiftRight<NewSqrt2Bits>( | 
 |         hn::WidenMulPairwiseAdd(int32_tag, a, scale_rounding)); | 
 |     return hn::DemoteTo(tag, b); | 
 |   } | 
 | }; | 
 |  | 
 | template <size_t NumLanes> | 
 | struct ScaleByNewSqrt2Traits<2, NumLanes> { | 
 |   template <int Scale, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> ScaleByNewSqrt2(D tag, | 
 |                                                            hn::VFromD<D> v) { | 
 |     hn::VFromD<hn::RepartitionToWide<D>> b0, b1; | 
 |     PromoteScale2x16ByNewSqrt2<Scale>(tag, v, b0, b1); | 
 |     return hn::ReorderDemote2To(tag, b0, b1); | 
 |   } | 
 | }; | 
 |  | 
 | template <int Scale, typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> ScaleByNewSqrt2(D tag, hn::VFromD<D> v) { | 
 |   return ScaleByNewSqrt2Traits<sizeof(hn::TFromD<D>), hn::MaxLanes(tag)>:: | 
 |       template ScaleByNewSqrt2<Scale>(tag, v); | 
 | } | 
 |  | 
 | template <size_t Width, int Scale, typename D> | 
 | HWY_ATTR HWY_INLINE void IdtxSqrt2(D tag, hn::TFromD<D> *HWY_RESTRICT in) { | 
 |   for (size_t x = 0; x < Width; x += hn::MaxLanes(tag)) { | 
 |     hn::Store(ScaleByNewSqrt2<Scale>(tag, hn::Load(tag, &in[x])), tag, &in[x]); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR void FdctNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Width>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fdct4(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR void FdctNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Stride>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fdct8(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR void FdctNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Stride>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fdct16(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR void FdctNx32Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Stride>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fdct32(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t InWidth, size_t InStride, size_t OutWidth, size_t OutStride, | 
 |           typename T> | 
 | HWY_ATTR void FdctNx64Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, InWidth>(); | 
 |   for (size_t i = 0; i < OutWidth; i += hn::MaxLanes(int_tag)) { | 
 |     Fdct64<InStride, OutStride>(int_tag, &in[i], cos_bit); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR HWY_INLINE void FadstNx4Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Width>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fadst4<Width>(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR void FadstNx8Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Stride>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fadst8<Width>(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, typename T> | 
 | HWY_ATTR void FadstNx16Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   constexpr auto int_tag = hn::CappedTag<T, Stride>(); | 
 |   for (size_t i = 0; i < Width; i += hn::MaxLanes(int_tag)) { | 
 |     Fadst16<Width>(int_tag, &in[i], cos_bit, Stride); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, size_t BlockHeight, typename T> | 
 | HWY_ATTR void IdtxAdd2Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   (void)cos_bit; | 
 |   constexpr auto int_tag = hn::CappedTag<T, Width>(); | 
 |   for (size_t y = 0; y < BlockHeight; ++y) { | 
 |     IdtxAdd2<Width>(int_tag, &in[y * Stride]); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, size_t BlockHeight, int Scale, | 
 |           typename T> | 
 | HWY_ATTR void IdtxSqrt2Block(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   (void)cos_bit; | 
 |   constexpr auto int_tag = hn::CappedTag<T, Width>(); | 
 |   for (size_t y = 0; y < BlockHeight; ++y) { | 
 |     IdtxSqrt2<Width, Scale>(int_tag, &in[y * Stride]); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t Width, size_t Stride, size_t BlockHeight, int Shift, | 
 |           typename T> | 
 | HWY_ATTR void IdtxShiftBlock(T *HWY_RESTRICT in, int8_t cos_bit) { | 
 |   (void)cos_bit; | 
 |   constexpr auto int_tag = hn::CappedTag<T, Width>(); | 
 |   for (size_t y = 0; y < BlockHeight; ++y) { | 
 |     IdtxShift<Width, Shift>(int_tag, &in[y * Stride]); | 
 |   } | 
 | } | 
 |  | 
 | template <typename T> | 
 | void TransformFail(T *in, int8_t cos_bit) { | 
 |   (void)in; | 
 |   (void)cos_bit; | 
 |   assert(false && "Incorrect transform requested."); | 
 | } | 
 |  | 
 | template <typename T> | 
 | using Transform1D = void (*)(T *in, int8_t cos_bit); | 
 |  | 
 | template <bool PositiveOrZero> | 
 | struct RoundShiftTraits {}; | 
 |  | 
 | template <> | 
 | struct RoundShiftTraits<true> { | 
 |   template <int Bit, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag, | 
 |                                                  hn::VFromD<D> value) { | 
 |     (void)int_tag; | 
 |     if CONSTEXPR_IF (Bit == 0) { | 
 |       return value; | 
 |     } else { | 
 |       return hn::ShiftLeft<Bit>(value); | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct RoundShiftTraits<false> { | 
 |   template <int Bit, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> Shift(D int_tag, | 
 |                                                  hn::VFromD<D> value) { | 
 |     const auto round = hn::Set(int_tag, 1 << (-Bit - 1)); | 
 |     return hn::ShiftRight<-Bit>(hn::Add(value, round)); | 
 |   } | 
 | }; | 
 |  | 
 | template <int Bit, typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> RoundShift(D int_tag, hn::VFromD<D> value) { | 
 |   return RoundShiftTraits<(Bit >= 0)>::template Shift<Bit>(int_tag, value); | 
 | } | 
 |  | 
 | template <bool ApplyRectScale, typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> RectScale(D int_tag, hn::VFromD<D> v) { | 
 |   if CONSTEXPR_IF (ApplyRectScale) { | 
 |     return ScaleByNewSqrt2<1>(int_tag, v); | 
 |   } | 
 |   return v; | 
 | } | 
 |  | 
 | template <bool IsSame> | 
 | struct MaybePromoteTraits {}; | 
 |  | 
 | template <> | 
 | struct MaybePromoteTraits<true> { | 
 |   template <typename VIn, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) { | 
 |     (void)out_tag; | 
 |     return in; | 
 |   } | 
 |  | 
 |   template <typename VIn, typename D> | 
 |   HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, | 
 |                                                 hn::TFromD<D> *out) { | 
 |     hn::StoreU(v, int_tag, out); | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct MaybePromoteTraits<false> { | 
 |   template <typename VIn, typename D> | 
 |   HWY_ATTR HWY_INLINE static hn::VFromD<D> PromoteTo(D out_tag, VIn in) { | 
 |     return hn::PromoteTo(out_tag, in); | 
 |   } | 
 |  | 
 |   template <typename VIn, typename TOut, typename D> | 
 |   HWY_ATTR HWY_INLINE static void PromoteStore2(D int_tag, VIn v, TOut *out) { | 
 |     (void)int_tag; | 
 |     constexpr hn::Repartition<TOut, D> store_tag; | 
 |     hn::StoreU(hn::PromoteLowerTo(store_tag, v), store_tag, out); | 
 |     hn::StoreU(hn::PromoteUpperTo(store_tag, v), store_tag, | 
 |                out + hn::MaxLanes(store_tag)); | 
 |   } | 
 | }; | 
 |  | 
 | template <typename VIn, typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> MaybePromoteTo(D out_tag, VIn in) { | 
 |   return MaybePromoteTraits< | 
 |       std::is_same<hn::TFromD<D>, hn::TFromV<VIn>>::value>::PromoteTo(out_tag, | 
 |                                                                       in); | 
 | } | 
 |  | 
 | template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> | 
 | HWY_ATTR HWY_INLINE void Transpose4(const TIn *HWY_RESTRICT in, | 
 |                                     TOut *HWY_RESTRICT out, size_t instride, | 
 |                                     size_t outstride) { | 
 |   constexpr hn::FixedTag<TIn, 4> int_tag; | 
 |   auto i0 = RectScale<ApplyRectScale>( | 
 |       int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[0 * instride]))); | 
 |   auto i1 = RectScale<ApplyRectScale>( | 
 |       int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[1 * instride]))); | 
 |   auto i2 = RectScale<ApplyRectScale>( | 
 |       int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[2 * instride]))); | 
 |   auto i3 = RectScale<ApplyRectScale>( | 
 |       int_tag, RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[3 * instride]))); | 
 |   HWY_ALIGN_MAX TOut interleaved[16]; | 
 |   constexpr hn::FixedTag<TOut, 4> out_tag; | 
 |   hn::StoreInterleaved4(MaybePromoteTo(out_tag, i0), | 
 |                         MaybePromoteTo(out_tag, i1), | 
 |                         MaybePromoteTo(out_tag, i2), | 
 |                         MaybePromoteTo(out_tag, i3), out_tag, interleaved); | 
 |   for (size_t i = 0; i < 4; ++i) { | 
 |     hwy::CopyBytes<hn::MaxLanes(int_tag) * sizeof(*out)>(&interleaved[i * 4], | 
 |                                                          &out[i * outstride]); | 
 |   } | 
 | } | 
 |  | 
 | template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> | 
 | HWY_ATTR HWY_INLINE void Transpose8(const TIn *HWY_RESTRICT in, | 
 |                                     TOut *HWY_RESTRICT out, size_t instride, | 
 |                                     size_t outstride) { | 
 |   constexpr hn::FixedTag<TIn, 8> int_tag; | 
 |   constexpr hn::Rebind<TOut, decltype(int_tag)> out_tag; | 
 |   // N.B. there isn't a StoreInterleaved8, so hand-code Transpose8. | 
 |   constexpr hn::RepartitionToWide<decltype(out_tag)> wide_int_tag; | 
 |   HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved0[16]; | 
 |   HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> interleaved1[16]; | 
 |   auto i0 = hn::Load(int_tag, &in[0 * instride]); | 
 |   auto i1 = hn::Load(int_tag, &in[1 * instride]); | 
 |   auto i2 = hn::Load(int_tag, &in[2 * instride]); | 
 |   auto i3 = hn::Load(int_tag, &in[3 * instride]); | 
 |   auto i4 = hn::Load(int_tag, &in[4 * instride]); | 
 |   auto i5 = hn::Load(int_tag, &in[5 * instride]); | 
 |   auto i6 = hn::Load(int_tag, &in[6 * instride]); | 
 |   auto i7 = hn::Load(int_tag, &in[7 * instride]); | 
 |   auto s0 = hn::Undefined(out_tag); | 
 |   auto s1 = hn::Undefined(out_tag); | 
 |   auto s2 = hn::Undefined(out_tag); | 
 |   auto s3 = hn::Undefined(out_tag); | 
 |   auto s4 = hn::Undefined(out_tag); | 
 |   auto s5 = hn::Undefined(out_tag); | 
 |   auto s6 = hn::Undefined(out_tag); | 
 |   auto s7 = hn::Undefined(out_tag); | 
 |   auto ip0 = MaybePromoteTo(out_tag, i0); | 
 |   auto ip1 = MaybePromoteTo(out_tag, i1); | 
 |   auto ip2 = MaybePromoteTo(out_tag, i2); | 
 |   auto ip3 = MaybePromoteTo(out_tag, i3); | 
 |   auto ip4 = MaybePromoteTo(out_tag, i4); | 
 |   auto ip5 = MaybePromoteTo(out_tag, i5); | 
 |   auto ip6 = MaybePromoteTo(out_tag, i6); | 
 |   auto ip7 = MaybePromoteTo(out_tag, i7); | 
 |   s0 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip0)); | 
 |   s1 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip1)); | 
 |   s2 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip2)); | 
 |   s3 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip3)); | 
 |   s4 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip4)); | 
 |   s5 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip5)); | 
 |   s6 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip6)); | 
 |   s7 = RectScale<ApplyRectScale>(out_tag, RoundShift<Bit>(out_tag, ip7)); | 
 |   auto u0 = hn::ZipLower(wide_int_tag, s0, s1); | 
 |   auto u1 = hn::ZipUpper(wide_int_tag, s0, s1); | 
 |   auto u2 = hn::ZipLower(wide_int_tag, s2, s3); | 
 |   auto u3 = hn::ZipUpper(wide_int_tag, s2, s3); | 
 |   auto u4 = hn::ZipLower(wide_int_tag, s4, s5); | 
 |   auto u5 = hn::ZipUpper(wide_int_tag, s4, s5); | 
 |   auto u6 = hn::ZipLower(wide_int_tag, s6, s7); | 
 |   auto u7 = hn::ZipUpper(wide_int_tag, s6, s7); | 
 |   hn::StoreInterleaved4(u0, u2, u4, u6, wide_int_tag, interleaved0); | 
 |   hn::StoreInterleaved4(u1, u3, u5, u7, wide_int_tag, interleaved1); | 
 |   constexpr size_t kNumBytes = hn::MaxLanes(int_tag) * sizeof(*out); | 
 |   if CONSTEXPR_IF (sizeof(TOut) == 2) { | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[2 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[3 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[4 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[5 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]); | 
 |   } else { | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[0], &out[0 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[4], &out[1 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[0], &out[2 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[4], &out[3 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[8], &out[4 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved0[12], &out[5 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[8], &out[6 * outstride]); | 
 |     hwy::CopyBytes<kNumBytes>(&interleaved1[12], &out[7 * outstride]); | 
 |   } | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveEvenBlocks(D tag, | 
 |                                                             hn::VFromD<D> a, | 
 |                                                             hn::VFromD<D> b) { | 
 |   static_assert(sizeof(hn::TFromD<D>) == 8, | 
 |                 "LocalInterleaveEvenBlocks requires 64-bit lanes."); | 
 |   HWY_ALIGN static constexpr int64_t kIndices[] = { 0, 1, 8 + 0, 8 + 1, | 
 |                                                     4, 5, 8 + 4, 8 + 5 }; | 
 |   auto indices = hn::SetTableIndices(tag, kIndices); | 
 |   return hn::TwoTablesLookupLanes(tag, a, b, indices); | 
 | } | 
 |  | 
 | template <typename D> | 
 | HWY_ATTR HWY_INLINE hn::VFromD<D> LocalInterleaveOddBlocks(D tag, | 
 |                                                            hn::VFromD<D> a, | 
 |                                                            hn::VFromD<D> b) { | 
 |   static_assert(sizeof(hn::TFromD<D>) == 8, | 
 |                 "LocalInterleaveOddBlocks requires 64-bit lanes."); | 
 |   HWY_ALIGN static constexpr int64_t kIndices[] = { 2, 3, 8 + 2, 8 + 3, | 
 |                                                     6, 7, 8 + 6, 8 + 7 }; | 
 |   auto indices = hn::SetTableIndices(tag, kIndices); | 
 |   return hn::TwoTablesLookupLanes(tag, a, b, indices); | 
 | } | 
 |  | 
 | template <size_t LaneSize> | 
 | struct Transpose16Traits {}; | 
 |  | 
 | template <> | 
 | struct Transpose16Traits<2> { | 
 |   template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> | 
 |   HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in, | 
 |                                               TOut *HWY_RESTRICT out, | 
 |                                               size_t instride, | 
 |                                               size_t outstride) { | 
 |     constexpr hn::FixedTag<TIn, 16> int_tag; | 
 |     static_assert(hn::MaxLanes(int_tag) == 16, | 
 |                   "16-bit Transpose16 requires an 16-lane int_tag"); | 
 |     constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag; | 
 |     constexpr hn::RepartitionToWide<decltype(wide_int_tag)> widex2_int_tag; | 
 |     HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> | 
 |         y[16 * hn::MaxLanes(wide_int_tag)]; | 
 |     HWY_ALIGN_MAX hn::TFromD<decltype(widex2_int_tag)> | 
 |         z[16 * hn::MaxLanes(widex2_int_tag)]; | 
 |     for (size_t i = 0; i < 16; i += 2) { | 
 |       auto i0 = RectScale<ApplyRectScale>( | 
 |           int_tag, | 
 |           RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride]))); | 
 |       auto i1 = RectScale<ApplyRectScale>( | 
 |           int_tag, | 
 |           RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride]))); | 
 |       hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag, | 
 |                 &y[(i + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |       hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag, | 
 |                 &y[(i + 1) * hn::MaxLanes(wide_int_tag)]); | 
 |     } | 
 |     for (size_t i = 0; i < 16; i += 4) { | 
 |       for (size_t j = 0; j < 2; ++j) { | 
 |         auto i0 = hn::Load(wide_int_tag, | 
 |                            &y[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |         auto i2 = hn::Load(wide_int_tag, | 
 |                            &y[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); | 
 |         hn::Store(hn::ZipLower(widex2_int_tag, i0, i2), widex2_int_tag, | 
 |                   &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); | 
 |         hn::Store(hn::ZipUpper(widex2_int_tag, i0, i2), widex2_int_tag, | 
 |                   &z[(i + j + 2) * hn::MaxLanes(widex2_int_tag)]); | 
 |       } | 
 |     } | 
 |     for (size_t i = 0; i < 16; i += 8) { | 
 |       for (size_t j = 0; j < 4; ++j) { | 
 |         auto i0 = hn::Load(widex2_int_tag, | 
 |                            &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); | 
 |         auto i4 = hn::Load(widex2_int_tag, | 
 |                            &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]); | 
 |         hn::Store(hn::InterleaveLower(widex2_int_tag, i0, i4), widex2_int_tag, | 
 |                   &z[(i + j + 0) * hn::MaxLanes(widex2_int_tag)]); | 
 |         hn::Store(hn::InterleaveUpper(widex2_int_tag, i0, i4), widex2_int_tag, | 
 |                   &z[(i + j + 4) * hn::MaxLanes(widex2_int_tag)]); | 
 |       } | 
 |     } | 
 |     static constexpr size_t kStoreIndex[] = { 0, 4,  2,  6,  1, 5,  3,  7, | 
 |                                               8, 12, 10, 14, 9, 13, 11, 15 }; | 
 |     for (size_t j = 0; j < 8; ++j) { | 
 |       auto i0 = | 
 |           hn::Load(widex2_int_tag, &z[(j + 0) * hn::MaxLanes(widex2_int_tag)]); | 
 |       auto i8 = | 
 |           hn::Load(widex2_int_tag, &z[(j + 8) * hn::MaxLanes(widex2_int_tag)]); | 
 |       hn::StoreU( | 
 |           hn::BitCast(int_tag, hn::ConcatLowerLower(widex2_int_tag, i8, i0)), | 
 |           int_tag, &out[kStoreIndex[j + 0] * outstride]); | 
 |       hn::StoreU( | 
 |           hn::BitCast(int_tag, hn::ConcatUpperUpper(widex2_int_tag, i8, i0)), | 
 |           int_tag, &out[kStoreIndex[j + 8] * outstride]); | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct Transpose16Traits<4> { | 
 |   template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> | 
 |   HWY_ATTR HWY_INLINE static void Transpose16(const TIn *HWY_RESTRICT in, | 
 |                                               TOut *HWY_RESTRICT out, | 
 |                                               size_t instride, | 
 |                                               size_t outstride) { | 
 |     constexpr hn::FixedTag<TIn, 16> int_tag; | 
 |     static_assert(hn::MaxLanes(int_tag) == 16, | 
 |                   "32-bit Transpose16 requires an 16-lane int_tag"); | 
 |     constexpr hn::RepartitionToWide<decltype(int_tag)> wide_int_tag; | 
 |     HWY_ALIGN_MAX hn::TFromD<decltype(wide_int_tag)> | 
 |         z[16 * hn::MaxLanes(wide_int_tag)]; | 
 |     for (size_t i = 0; i < 16; i += 2) { | 
 |       auto i0 = RectScale<ApplyRectScale>( | 
 |           int_tag, | 
 |           RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 0) * instride]))); | 
 |       auto i1 = RectScale<ApplyRectScale>( | 
 |           int_tag, | 
 |           RoundShift<Bit>(int_tag, hn::Load(int_tag, &in[(i + 1) * instride]))); | 
 |       hn::Store(hn::ZipLower(wide_int_tag, i0, i1), wide_int_tag, | 
 |                 &z[(i + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |       hn::Store(hn::ZipUpper(wide_int_tag, i0, i1), wide_int_tag, | 
 |                 &z[(i + 1) * hn::MaxLanes(wide_int_tag)]); | 
 |     } | 
 |     for (size_t i = 0; i < 16; i += 4) { | 
 |       for (size_t j = 0; j < 2; ++j) { | 
 |         auto i0 = hn::Load(wide_int_tag, | 
 |                            &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |         auto i2 = hn::Load(wide_int_tag, | 
 |                            &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); | 
 |         hn::Store(hn::InterleaveLower(wide_int_tag, i0, i2), wide_int_tag, | 
 |                   &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |         hn::Store(hn::InterleaveUpper(wide_int_tag, i0, i2), wide_int_tag, | 
 |                   &z[(i + j + 2) * hn::MaxLanes(wide_int_tag)]); | 
 |       } | 
 |     } | 
 |     for (size_t i = 0; i < 16; i += 8) { | 
 |       for (size_t j = 0; j < 4; ++j) { | 
 |         auto i0 = hn::Load(wide_int_tag, | 
 |                            &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |         auto i4 = hn::Load(wide_int_tag, | 
 |                            &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]); | 
 |         hn::Store(LocalInterleaveEvenBlocks(wide_int_tag, i0, i4), wide_int_tag, | 
 |                   &z[(i + j + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |         hn::Store(LocalInterleaveOddBlocks(wide_int_tag, i0, i4), wide_int_tag, | 
 |                   &z[(i + j + 4) * hn::MaxLanes(wide_int_tag)]); | 
 |       } | 
 |     } | 
 |     static constexpr size_t kStoreIndex[] = { 0, 2,  1, 3,  4,  6,  5,  7, | 
 |                                               8, 10, 9, 11, 12, 14, 13, 15 }; | 
 |     for (size_t j = 0; j < 8; ++j) { | 
 |       auto i0 = | 
 |           hn::Load(wide_int_tag, &z[(j + 0) * hn::MaxLanes(wide_int_tag)]); | 
 |       auto i8 = | 
 |           hn::Load(wide_int_tag, &z[(j + 8) * hn::MaxLanes(wide_int_tag)]); | 
 |       hn::StoreU( | 
 |           hn::BitCast(int_tag, hn::ConcatLowerLower(wide_int_tag, i8, i0)), | 
 |           int_tag, &out[kStoreIndex[j + 0] * outstride]); | 
 |       hn::StoreU( | 
 |           hn::BitCast(int_tag, hn::ConcatUpperUpper(wide_int_tag, i8, i0)), | 
 |           int_tag, &out[kStoreIndex[j + 8] * outstride]); | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <int8_t Bit, bool ApplyRectScale, typename TIn, typename TOut> | 
 | HWY_ATTR HWY_INLINE void Transpose16(const TIn *HWY_RESTRICT in, | 
 |                                      TOut *HWY_RESTRICT out, size_t instride, | 
 |                                      size_t outstride) { | 
 |   static_assert(sizeof(TOut) == sizeof(TIn), | 
 |                 "Transpose16 does not directly support integer promotion."); | 
 |   Transpose16Traits<sizeof(TIn)>::template Transpose16<Bit, ApplyRectScale>( | 
 |       in, out, instride, outstride); | 
 | } | 
 |  | 
 | template <size_t NumLanes, bool RequiresPromotion> | 
 | struct TransposeTraits {}; | 
 |  | 
 | template <> | 
 | struct TransposeTraits<16, true> { | 
 |   template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, | 
 |             typename TIn, typename TOut> | 
 |   HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, | 
 |                                             TOut *HWY_RESTRICT out, | 
 |                                             size_t instride, size_t outstride) { | 
 |     constexpr auto int_tag = | 
 |         hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>(); | 
 |     constexpr hn::Rebind<TIn, decltype(int_tag)> input_tag; | 
 |     HWY_ALIGN_MAX hn::TFromD<decltype(int_tag)> p[16 * hn::MaxLanes(int_tag)]; | 
 |     for (size_t r = 0; r < Height; r += 16) { | 
 |       for (size_t c = 0; c < Width; c += 16) { | 
 |         for (size_t i = 0; i < 16; ++i) { | 
 |           hn::Store( | 
 |               hn::PromoteTo(int_tag, | 
 |                             hn::Load(input_tag, &in[(r + i) * instride + c])), | 
 |               int_tag, &p[i * hn::MaxLanes(int_tag)]); | 
 |         } | 
 |         Transpose16<Bit, ApplyRectScale>(p, &out[c * outstride + r], | 
 |                                          hn::MaxLanes(int_tag), outstride); | 
 |       } | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <> | 
 | struct TransposeTraits<16, false> { | 
 |   template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, | 
 |             typename TIn, typename TOut> | 
 |   HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, | 
 |                                             TOut *HWY_RESTRICT out, | 
 |                                             size_t instride, size_t outstride) { | 
 |     for (size_t r = 0; r < Height; r += 16) { | 
 |       for (size_t c = 0; c < Width; c += 16) { | 
 |         Transpose16<Bit, ApplyRectScale>(&in[r * instride + c], | 
 |                                          &out[c * outstride + r], instride, | 
 |                                          outstride); | 
 |       } | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <bool RequiresPromotion> | 
 | struct TransposeTraits<8, RequiresPromotion> { | 
 |   template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, | 
 |             typename TIn, typename TOut> | 
 |   HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, | 
 |                                             TOut *HWY_RESTRICT out, | 
 |                                             size_t instride, size_t outstride) { | 
 |     for (size_t r = 0; r < Height; r += 8) { | 
 |       for (size_t c = 0; c < Width; c += 8) { | 
 |         Transpose8<Bit, ApplyRectScale>(&in[r * instride + c], | 
 |                                         &out[c * outstride + r], instride, | 
 |                                         outstride); | 
 |       } | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <bool RequiresPromotion> | 
 | struct TransposeTraits<4, RequiresPromotion> { | 
 |   template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, | 
 |             typename TIn, typename TOut> | 
 |   HWY_ATTR HWY_INLINE static void Transpose(const TIn *HWY_RESTRICT in, | 
 |                                             TOut *HWY_RESTRICT out, | 
 |                                             size_t instride, size_t outstride) { | 
 |     for (size_t r = 0; r < Height; r += 4) { | 
 |       for (size_t c = 0; c < Width; c += 4) { | 
 |         Transpose4<Bit, ApplyRectScale>(&in[r * instride + c], | 
 |                                         &out[c * outstride + r], instride, | 
 |                                         outstride); | 
 |       } | 
 |     } | 
 |   } | 
 | }; | 
 |  | 
 | template <size_t Width, size_t Height, int8_t Bit, bool ApplyRectScale, | 
 |           typename TIn, typename TOut> | 
 | HWY_ATTR HWY_INLINE void Transpose(const TIn *HWY_RESTRICT in, | 
 |                                    TOut *HWY_RESTRICT out, size_t instride, | 
 |                                    size_t outstride) { | 
 |   constexpr auto int_tag = | 
 |       hn::CappedTag<TOut, AOMMIN(16, AOMMIN(Width, Height))>(); | 
 |   TransposeTraits<hn::MaxLanes(int_tag), !std::is_same<TIn, TOut>::value>:: | 
 |       template Transpose<Width, Height, Bit, ApplyRectScale>(in, out, instride, | 
 |                                                              outstride); | 
 | } | 
 |  | 
 | template <size_t Width, size_t Height, int Shift, bool ApplyRectScale, | 
 |           typename TIn, typename TOut> | 
 | HWY_ATTR HWY_INLINE void StoreBlock(const TIn *HWY_RESTRICT in, size_t instride, | 
 |                                     TOut *HWY_RESTRICT out, size_t outstride) { | 
 |   constexpr hn::CappedTag<TIn, Width> load_tag; | 
 |   for (size_t r = 0; r < Height; ++r) { | 
 |     for (size_t c = 0; c < Width; c += hn::MaxLanes(load_tag)) { | 
 |       auto v = RectScale<ApplyRectScale>( | 
 |           load_tag, RoundShift<Shift>( | 
 |                         load_tag, hn::Load(load_tag, &in[r * instride + c]))); | 
 |       MaybePromoteTraits<std::is_same<TIn, TOut>::value>::PromoteStore2( | 
 |           load_tag, v, &out[r * outstride + c]); | 
 |     } | 
 |   } | 
 | } | 
 |  | 
 | template <int8_t Shift, size_t Width, bool FlipLeftRight, typename TInput, | 
 |           typename TIn> | 
 | HWY_ATTR HWY_INLINE void LoadLine(const TInput *HWY_RESTRICT input, | 
 |                                   TIn *HWY_RESTRICT in) { | 
 |   constexpr hn::CappedTag<TIn, Width> store_tag; | 
 |   constexpr hn::Rebind<TInput, decltype(store_tag)> load_tag; | 
 |   for (size_t x = 0; x < Width / hn::MaxLanes(load_tag); ++x) { | 
 |     auto v = hn::LoadU(load_tag, &input[x * hn::MaxLanes(load_tag)]); | 
 |     if CONSTEXPR_IF (FlipLeftRight) { | 
 |       v = hn::Reverse(load_tag, v); | 
 |     } | 
 |     auto vp = MaybePromoteTo(store_tag, v); | 
 |     hn::Store( | 
 |         hn::ShiftLeft<Shift>(vp), store_tag, | 
 |         &in[(FlipLeftRight ? (Width / hn::MaxLanes(store_tag)) - x - 1 : x) * | 
 |             hn::MaxLanes(store_tag)]); | 
 |   } | 
 | } | 
 |  | 
 | template <int8_t Shift, size_t Width, size_t OutStride, size_t Height, | 
 |           bool FlipUpDown, bool FlipLeftRight, typename TInput, typename TIn> | 
 | HWY_ATTR HWY_INLINE void LoadBuffer(const TInput *HWY_RESTRICT input, | 
 |                                     TIn *HWY_RESTRICT in, size_t stride) { | 
 |   for (size_t y = 0; y < Height; ++y) { | 
 |     LoadLine<Shift, Width, FlipLeftRight>( | 
 |         input + y * stride, &in[(FlipUpDown ? Height - y - 1 : y) * OutStride]); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, | 
 |           typename T> | 
 | HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform4(TX_TYPE_1D tx_type, T *in, | 
 |                                                 int8_t cos_bit) { | 
 |   switch (tx_type) { | 
 |     case DCT_1D: FdctNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break; | 
 |     case IDTX_1D: | 
 |       IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 1>(in, cos_bit); | 
 |       break; | 
 |     default: FadstNx4Block<TransformWidth, BlockWidth>(in, cos_bit); break; | 
 |   } | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, | 
 |           typename T> | 
 | HWY_ATTR HWY_FLATTEN HWY_INLINE void Transform8(TX_TYPE_1D tx_type, T *in, | 
 |                                                 int8_t cos_bit) { | 
 |   switch (tx_type) { | 
 |     case DCT_1D: FdctNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break; | 
 |     case IDTX_1D: | 
 |       IdtxAdd2Block<TransformWidth, BlockWidth, BlockHeight>(in, cos_bit); | 
 |       break; | 
 |     default: FadstNx8Block<TransformWidth, BlockWidth>(in, cos_bit); break; | 
 |   } | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, | 
 |           typename T> | 
 | HWY_ATTR HWY_INLINE void Transform16(TX_TYPE_1D tx_type, T *in, | 
 |                                      int8_t cos_bit) { | 
 |   static const Transform1D<T> kTransform[] = { | 
 |     FdctNx16Block<TransformWidth, BlockWidth, T>,   // DCT_1D | 
 |     FadstNx16Block<TransformWidth, BlockWidth, T>,  // ADST_1D | 
 |     FadstNx16Block<TransformWidth, BlockWidth, T>,  // FLIPADST_1D | 
 |     IdtxSqrt2Block<TransformWidth, BlockWidth, BlockHeight, 2, T>,  // IDTX_1D | 
 |   }; | 
 |   kTransform[tx_type](in, cos_bit); | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, size_t BlockHeight, | 
 |           typename T> | 
 | HWY_ATTR HWY_INLINE void Transform32(TX_TYPE_1D tx_type, T *in, | 
 |                                      int8_t cos_bit) { | 
 |   static const Transform1D<T> kTransform[] = { | 
 |     FdctNx32Block<TransformWidth, BlockWidth, T>,  // DCT_1D | 
 |     TransformFail<T>,                              // ADST_1D | 
 |     TransformFail<T>,                              // FLIPADST_1D | 
 |     IdtxShiftBlock<TransformWidth, BlockWidth, BlockHeight, 2, T>,  // IDTX_1D | 
 |   }; | 
 |   kTransform[tx_type](in, cos_bit); | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, typename T> | 
 | HWY_ATTR HWY_INLINE void TransformFull64(TX_TYPE_1D tx_type, T *in, | 
 |                                          int8_t cos_bit) { | 
 |   (void)tx_type; | 
 |   assert(tx_type == DCT_1D); | 
 |   FdctNx64Block<TransformWidth, BlockWidth, TransformWidth, BlockWidth>( | 
 |       in, cos_bit); | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight, | 
 |           size_t BlockHeight, typename T> | 
 | HWY_ATTR HWY_INLINE void TransformBelow32(TX_TYPE_1D tx_type, T *in, | 
 |                                           int8_t cos_bit) { | 
 |   if CONSTEXPR_IF (TransformHeight == 4) { | 
 |     Transform4<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); | 
 |   } else if CONSTEXPR_IF (TransformHeight == 8) { | 
 |     Transform8<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); | 
 |   } else if CONSTEXPR_IF (TransformHeight == 16) { | 
 |     Transform16<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); | 
 |   } else if CONSTEXPR_IF (TransformHeight == 32) { | 
 |     Transform32<TransformWidth, BlockWidth, BlockHeight>(tx_type, in, cos_bit); | 
 |   } else { | 
 |     assert(false && "Unsupported transform size."); | 
 |   } | 
 | } | 
 |  | 
 | template <size_t TransformWidth, size_t BlockWidth, size_t TransformHeight, | 
 |           size_t BlockHeight, typename T> | 
 | HWY_ATTR HWY_INLINE void RowTransform(TX_TYPE_1D tx_type, T *in, | 
 |                                       int8_t cos_bit) { | 
 |   if CONSTEXPR_IF (TransformWidth == 64 && TransformHeight == 64) { | 
 |     assert(tx_type == DCT_1D); | 
 |     // 64x64 only writes 32x32 of coefficients. | 
 |     FdctNx64Block<TransformWidth, BlockWidth, 32, 32>(in, cos_bit); | 
 |   } else if CONSTEXPR_IF (TransformHeight == 64) { | 
 |     TransformFull64<TransformWidth, BlockWidth>(tx_type, in, cos_bit); | 
 |   } else { | 
 |     TransformBelow32<TransformWidth, BlockWidth, TransformHeight, BlockHeight>( | 
 |         tx_type, in, cos_bit); | 
 |   } | 
 | } | 
 |  | 
 | template <TX_SIZE TxSize, typename T> | 
 | HWY_ATTR HWY_MAYBE_UNUSED void ForwardTransform2D(const int16_t *input, | 
 |                                                   int32_t *output, | 
 |                                                   size_t stride, | 
 |                                                   TX_TYPE tx_type) { | 
 |   constexpr size_t kWidth = kTxSizeWide[TxSize]; | 
 |   constexpr size_t kHeight = kTxSizeHigh[TxSize]; | 
 |   // Ensure the storage is aligned to the architecture's block width. | 
 |   constexpr size_t kMinVectorSize = | 
 |       hn::BlockDFromD<hn::ScalableTag<T>>().MaxBytes() / sizeof(uint8_t); | 
 |   constexpr size_t kBlockWidth = AOMMAX(kMinVectorSize / sizeof(T), kWidth); | 
 |   constexpr size_t kBlockHeight = AOMMAX(kMinVectorSize / sizeof(T), kHeight); | 
 |   HWY_ALIGN_MAX T buf0[kBlockWidth * kBlockHeight]; | 
 |   constexpr bool kBigRectangle = (kBlockWidth == 64 && kBlockHeight >= 32) || | 
 |                                  (kBlockWidth >= 32 && kBlockHeight == 64); | 
 |   using T2 = typename std::conditional<kBigRectangle, int32_t, T>::type; | 
 |   HWY_ALIGN_MAX T2 buf1[kBlockWidth * kBlockHeight]; | 
 |   constexpr int8_t kShift[3] = { kForwardTransformShift[TxSize][0], | 
 |                                  kForwardTransformShift[TxSize][1], | 
 |                                  kForwardTransformShift[TxSize][2] }; | 
 |   constexpr int kTransformWidthIndex = GetTxwIndex(TxSize); | 
 |   constexpr int kTransformHeightIndex = GetTxhIndex(TxSize); | 
 |   constexpr int8_t cos_bit_col = | 
 |       kForwardCosBitCol[kTransformWidthIndex][kTransformHeightIndex]; | 
 |   constexpr int8_t cos_bit_row = | 
 |       kForwardCosBitRow[kTransformWidthIndex][kTransformHeightIndex]; | 
 |   const TX_TYPE_1D vertical_transform = vtx_tab[tx_type]; | 
 |   const TX_TYPE_1D horizontal_transform = htx_tab[tx_type]; | 
 |   constexpr bool kApplyRectScale = kApplyRectScaleList[TxSize]; | 
 |   switch ((vertical_transform == FLIPADST_1D ? 1 : 0) | | 
 |           (horizontal_transform == FLIPADST_1D ? 2 : 0)) { | 
 |     case 0: | 
 |       LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, false>( | 
 |           input, buf0, stride); | 
 |       break; | 
 |     case 1: | 
 |       LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, false>( | 
 |           input, buf0, stride); | 
 |       break; | 
 |     case 2: | 
 |       LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, false, true>( | 
 |           input, buf0, stride); | 
 |       break; | 
 |     case 3: | 
 |       LoadBuffer<kShift[0], kWidth, kBlockWidth, kHeight, true, true>( | 
 |           input, buf0, stride); | 
 |       break; | 
 |   } | 
 |   if CONSTEXPR_IF (kHeight == 64) { | 
 |     TransformFull64<kWidth, kBlockWidth>(vertical_transform, buf0, cos_bit_col); | 
 |   } else { | 
 |     TransformBelow32<kWidth, kBlockWidth, kHeight, kBlockHeight>( | 
 |         vertical_transform, buf0, cos_bit_col); | 
 |   } | 
 |   Transpose<kWidth, kHeight, kShift[1], false>(buf0, buf1, kBlockWidth, | 
 |                                                kBlockHeight); | 
 |   if CONSTEXPR_IF (kWidth == 64 && kHeight == 64) { | 
 |     // 64x64 only writes 32x32 of coefficients. | 
 |     assert(tx_type == DCT_1D); | 
 |     FdctNx64Block<kHeight, kBlockHeight, 32, 32>(buf1, cos_bit_row); | 
 |     StoreBlock<32, 32, kShift[2], kApplyRectScale>(buf1, 32, output, 32); | 
 |   } else if CONSTEXPR_IF (kHeight == 64 && (kWidth == 16 || kWidth == 32)) { | 
 |     // 32x64 and 16x64 coefficients are packed into Wx32, discarding the | 
 |     // right-most results. | 
 |     RowTransform<32, kBlockHeight, kWidth, kBlockWidth>(horizontal_transform, | 
 |                                                         buf1, cos_bit_row); | 
 |     StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight, | 
 |                                                             output, 32); | 
 |   } else { | 
 |     RowTransform<kHeight, kBlockHeight, kWidth, kBlockWidth>( | 
 |         horizontal_transform, buf1, cos_bit_row); | 
 |     StoreBlock<kHeight, kWidth, kShift[2], kApplyRectScale>(buf1, kBlockHeight, | 
 |                                                             output, kHeight); | 
 |   } | 
 |   if CONSTEXPR_IF (kHeight <= 16 && kWidth == 64) { | 
 |     hwy::ZeroBytes<kHeight * 32 * sizeof(*output)>(output + kHeight * 32); | 
 |   } | 
 | } | 
 |  | 
 | HWY_MAYBE_UNUSED void LowBitdepthForwardTransform2D(const int16_t *src_diff, | 
 |                                                     tran_low_t *coeff, | 
 |                                                     int diff_stride, | 
 |                                                     TxfmParam *txfm_param) { | 
 |   if (txfm_param->lossless && txfm_param->tx_size == TX_4X4) { | 
 |     assert(txfm_param->tx_type == DCT_DCT); | 
 |     av1_fwht4x4(src_diff, coeff, diff_stride); | 
 |     return; | 
 |   } | 
 |   using TransformFunction = decltype(&ForwardTransform2D<TX_4X4, int16_t>); | 
 |   constexpr TransformFunction kTable[] = { | 
 | #define POINTER(w, h, _) &ForwardTransform2D<TX_##w##X##h, int16_t>, | 
 |     FOR_EACH_TXFM2D(POINTER, _) | 
 | #undef POINTER | 
 |   }; | 
 |   kTable[txfm_param->tx_size](src_diff, coeff, diff_stride, | 
 |                               txfm_param->tx_type); | 
 | } | 
 |  | 
 | }  // namespace HWY_NAMESPACE | 
 | }  // namespace | 
 |  | 
 | HWY_AFTER_NAMESPACE(); | 
 |  | 
 | #define MAKE_HIGHBD_TXFM2D(w, h, suffix)                                       \ | 
 |   extern "C" void av1_fwd_txfm2d_##w##x##h##_##suffix(                         \ | 
 |       const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type,      \ | 
 |       int bd);                                                                 \ | 
 |   HWY_ATTR void av1_fwd_txfm2d_##w##x##h##_##suffix(                           \ | 
 |       const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type,      \ | 
 |       int bd) {                                                                \ | 
 |     (void)bd;                                                                  \ | 
 |     HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int32_t>(input, output,    \ | 
 |                                                              stride, tx_type); \ | 
 |   } | 
 |  | 
 | #define MAKE_LOWBD_TXFM2D(w, h, suffix)                                        \ | 
 |   extern "C" void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix(                   \ | 
 |       const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type,      \ | 
 |       int bd);                                                                 \ | 
 |   HWY_ATTR void av1_lowbd_fwd_txfm2d_##w##x##h##_##suffix(                     \ | 
 |       const int16_t *input, int32_t *output, int stride, TX_TYPE tx_type,      \ | 
 |       int bd) {                                                                \ | 
 |     (void)bd;                                                                  \ | 
 |     HWY_NAMESPACE::ForwardTransform2D<TX_##w##X##h, int16_t>(input, output,    \ | 
 |                                                              stride, tx_type); \ | 
 |   } | 
 |  | 
 | #define MAKE_LOWBD_TXFM2D_DISPATCH(suffix)                                     \ | 
 |   extern "C" void av1_lowbd_fwd_txfm_##suffix(                                 \ | 
 |       const int16_t *src_diff, tran_low_t *coeff, int diff_stride,             \ | 
 |       TxfmParam *txfm_param);                                                  \ | 
 |   HWY_ATTR void av1_lowbd_fwd_txfm_##suffix(                                   \ | 
 |       const int16_t *src_diff, tran_low_t *coeff, int diff_stride,             \ | 
 |       TxfmParam *txfm_param) {                                                 \ | 
 |     HWY_NAMESPACE::LowBitdepthForwardTransform2D(src_diff, coeff, diff_stride, \ | 
 |                                                  txfm_param);                  \ | 
 |   } | 
 |  | 
 | #endif  // AOM_AV1_ENCODER_AV1_FWD_TXFM2D_HWY_H_ |