| // Copyright 2023 Google LLC | 
 | // SPDX-License-Identifier: Apache-2.0 | 
 | // | 
 | // Licensed under the Apache License, Version 2.0 (the "License"); | 
 | // you may not use this file except in compliance with the License. | 
 | // You may obtain a copy of the License at | 
 | // | 
 | //      http://www.apache.org/licenses/LICENSE-2.0 | 
 | // | 
 | // Unless required by applicable law or agreed to in writing, software | 
 | // distributed under the License is distributed on an "AS IS" BASIS, | 
 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
 | // See the License for the specific language governing permissions and | 
 | // limitations under the License. | 
 |  | 
 | // Include guard (still compiled once per target) | 
 | #if defined(HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_) == \ | 
 |     defined(HWY_TARGET_TOGGLE) | 
 | #ifdef HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_ | 
 | #undef HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_ | 
 | #else | 
 | #define HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_ | 
 | #endif | 
 |  | 
 | #include <stddef.h> | 
 |  | 
 | #include "third_party/highway/hwy/cache_control.h" | 
 | #include "third_party/highway/hwy/contrib/thread_pool/thread_pool.h" | 
 | #include "third_party/highway/hwy/highway.h" | 
 |  | 
 | HWY_BEFORE_NAMESPACE(); | 
 | namespace hwy { | 
 | namespace HWY_NAMESPACE { | 
 |  | 
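// Adds two scalars, promoting both to float so that this also works for
// types such as bfloat16_t, which may lack native arithmetic operators.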
 | template <typename TA, typename TB> | 
 | TA AddScalar(TA a, TB b) { | 
 |   return ConvertScalarTo<TA>(ConvertScalarTo<float>(a) + | 
 |                              ConvertScalarTo<float>(b)); | 
 | } | 
 |  | 
 | template <size_t kOuter, size_t kInner, typename T, bool kAdd> | 
 | HWY_NOINLINE void MatVecAddImpl(const T* HWY_RESTRICT mat, | 
 |                                 const T* HWY_RESTRICT vec, | 
 |                                 const T* HWY_RESTRICT add, T* HWY_RESTRICT out, | 
 |                                 hwy::ThreadPool& pool) { | 
  (void)add;  // Unused when kAdd is false.
 |  | 
  // Process multiple rows at a time so that we write `out` in multiples of a
  // cache line, which avoids false sharing (>= 64 bytes). 128 bytes is better
  // than 256; 512 has too little parallelization potential. For float,
  // kChunkSize is 16 rows, i.e. one 64-byte cache line of `out` per chunk.
 |   constexpr size_t kChunkSize = 64 / sizeof(T); | 
 |   const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize); | 
 |  | 
 |   const ScalableTag<T> d; | 
 |   const size_t N = Lanes(d); | 
  // Required for the Stream loop; otherwise we might write partial vectors.
 |   HWY_DASSERT(kChunkSize >= N); | 
 |   pool.Run(0, num_chunks, | 
 |            [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR { | 
 |              // MSVC workaround: duplicate to ensure constexpr. | 
 |              constexpr size_t kChunkSize = 64 / sizeof(T); | 
 |              // Software write-combining to avoid cache pollution from out. | 
 |              // Although `out` may be used later, keeping it out of the cache | 
 |              // now and avoiding RFOs is a consistent 5% overall win. | 
 |              HWY_ALIGN T buf[kChunkSize]; | 
 |  | 
 |              // Only handle entire chunks here because the Stream is not masked. | 
 |              // Remaining rows are handled after the pool.Run. | 
 |              const size_t begin = static_cast<size_t>(chunk * kChunkSize); | 
 |              for (size_t idx_row = 0; idx_row < kChunkSize; ++idx_row) { | 
 |                auto sum0 = Zero(d); | 
 |                auto sum1 = Zero(d); | 
 |                // 4x unrolling barely helps SKX but likely helps Arm V2. | 
 |                auto sum2 = Zero(d); | 
 |                auto sum3 = Zero(d); | 
 |  | 
 |                const T* HWY_RESTRICT row = &mat[(begin + idx_row) * kInner]; | 
 |                size_t i = 0; | 
               // No clear win from prefetching the next 1..3 rows. clflush of
               // &row[i] is slow; clflushopt is less so, but does not help
               // either.
 |                HWY_UNROLL(1) | 
 |                for (; i + 4 * N <= kInner; i += 4 * N) { | 
 |                  const auto a0 = LoadU(d, row + i + 0 * N); | 
 |                  const auto v0 = LoadU(d, vec + i + 0 * N); | 
 |                  sum0 = MulAdd(a0, v0, sum0); | 
 |  | 
 |                  const auto a1 = LoadU(d, row + i + 1 * N); | 
 |                  const auto v1 = LoadU(d, vec + i + 1 * N); | 
 |                  sum1 = MulAdd(a1, v1, sum1); | 
 |  | 
 |                  const auto a2 = LoadU(d, row + i + 2 * N); | 
 |                  const auto v2 = LoadU(d, vec + i + 2 * N); | 
 |                  sum2 = MulAdd(a2, v2, sum2); | 
 |  | 
 |                  const auto a3 = LoadU(d, row + i + 3 * N); | 
 |                  const auto v3 = LoadU(d, vec + i + 3 * N); | 
 |                  sum3 = MulAdd(a3, v3, sum3); | 
 |                } | 
 |                // Last entire vectors | 
 |                for (; i + N <= kInner; i += N) { | 
 |                  const auto a0 = LoadU(d, row + i); | 
 |                  const auto v0 = LoadU(d, vec + i); | 
 |                  sum0 = MulAdd(a0, v0, sum0); | 
 |                } | 
 |                const size_t remainder = kInner - i; | 
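               // LoadN zero-fills lanes >= remainder, so they contribute 0.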
 |                if (remainder != 0) { | 
 |                  const auto a0 = LoadN(d, row + i, remainder); | 
 |                  const auto v0 = LoadN(d, vec + i, remainder); | 
 |                  sum1 = MulAdd(a0, v0, sum1); | 
 |                } | 
 |                // Reduction tree: sum of all accumulators, then their lanes | 
 |                sum2 = Add(sum2, sum3); | 
 |                sum0 = Add(sum0, sum1); | 
 |                sum0 = Add(sum0, sum2); | 
 |                buf[idx_row] = ReduceSum(d, sum0); | 
 |                HWY_IF_CONSTEXPR(kAdd) { | 
 |                  buf[idx_row] = AddScalar(buf[idx_row], add[begin + idx_row]); | 
 |                } | 
 |              }  // idx_row | 
 |              HWY_UNROLL(4)  // 1..4 iterations | 
 |              for (size_t i = 0; i != kChunkSize; i += N) { | 
 |                Stream(Load(d, buf + i), d, out + begin + i); | 
 |              } | 
 |            }); | 
 |   hwy::FlushStream(); | 
 |  | 
 |   // Handle remainder rows which are not a multiple of the chunk size. | 
 |   for (size_t r = num_chunks * kChunkSize; r < kOuter; ++r) { | 
 |     auto sum0 = Zero(d); | 
 |  | 
 |     const T* HWY_RESTRICT row = &mat[r * kInner]; | 
 |     size_t i = 0; | 
 |     HWY_UNROLL(1) | 
 |     for (; i + N <= kInner; i += N) { | 
 |       const auto a0 = LoadU(d, row + i); | 
 |       const auto v0 = LoadU(d, vec + i); | 
 |       sum0 = MulAdd(a0, v0, sum0); | 
 |     } | 
 |     const size_t remainder = kInner - i; | 
 |     if (remainder != 0) { | 
 |       const auto a0 = LoadN(d, row + i, remainder); | 
 |       const auto v0 = LoadN(d, vec + i, remainder); | 
 |       sum0 = MulAdd(a0, v0, sum0); | 
 |     } | 
 |     out[r] = ReduceSum(d, sum0); | 
 |     HWY_IF_CONSTEXPR(kAdd) { out[r] = AddScalar(out[r], add[r]); } | 
 |   }  // r | 
 | } | 
 |  | 
// Multiplies mat with vec, adds add, and puts the result in out.
 | // | 
 | // mat is a (kOuter, kInner)-shaped array, where element [i,j] is located at | 
 | // index i * kInner + j. | 
 | // | 
 | // vec is a (kInner,)-shaped array. | 
 | // | 
 | // add is a (kOuter,)-shaped array. | 
 | // | 
// out is a (kOuter,)-shaped array that will be set to mat @ vec + add.
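//
// A minimal usage sketch (illustrative only: the sizes and pool are
// hypothetical, and callers must be inside this per-target namespace or use
// HWY_DYNAMIC_DISPATCH):
//
//   constexpr size_t kOuter = 64;
//   constexpr size_t kInner = 128;
//   static float mat[kOuter * kInner];  // row-major: mat[i * kInner + j]
//   static float vec[kInner], add[kOuter], out[kOuter];
//   hwy::ThreadPool pool(0);  // no extra worker threads (runs on caller)
//   MatVecAdd<kOuter, kInner>(mat, vec, add, out, pool);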
 | template <size_t kOuter, size_t kInner, typename T> | 
 | HWY_NOINLINE void MatVecAdd(const T* HWY_RESTRICT mat, | 
 |                             const T* HWY_RESTRICT vec, | 
 |                             const T* HWY_RESTRICT add, T* HWY_RESTRICT out, | 
 |                             hwy::ThreadPool& pool) { | 
 |   MatVecAddImpl<kOuter, kInner, T, true>(mat, vec, add, out, pool); | 
 | } | 
 |  | 
 | // Multiplies mat with vec and puts the result in out. | 
 | // | 
 | // mat is a (kOuter, kInner)-shaped array, where element [i,j] is located at | 
 | // index i * kInner + j. | 
 | // | 
 | // vec is a (kInner,)-shaped array. | 
 | // | 
// out is a (kOuter,)-shaped array that will be set to mat @ vec.
 | template <size_t kOuter, size_t kInner, typename T> | 
 | HWY_NOINLINE void MatVec(const T* HWY_RESTRICT mat, const T* HWY_RESTRICT vec, | 
 |                          T* HWY_RESTRICT out, hwy::ThreadPool& pool) { | 
 |   MatVecAddImpl<kOuter, kInner, T, false>(mat, vec, /*add=*/nullptr, out, pool); | 
 | } | 
 |  | 
// This target lacks too many of the ops required by our implementation; use
// HWY_EMU128 instead.
 | #if HWY_TARGET != HWY_SCALAR | 
 |  | 
// Specialization for a bf16 matrix, which halves its memory bandwidth
// requirements; vec, add and out remain f32.
 | template <size_t kOuter, size_t kInner, bool kAdd> | 
 | HWY_NOINLINE void MatVecAddImpl(const hwy::bfloat16_t* HWY_RESTRICT mat, | 
 |                                 const float* HWY_RESTRICT vec, | 
 |                                 const float* HWY_RESTRICT add, | 
 |                                 float* HWY_RESTRICT out, | 
 |                                 hwy::ThreadPool& pool) { | 
  // Process multiple rows at a time so that we write `out` in multiples of a
  // cache line, which avoids false sharing (>= 64 bytes). 128 bytes is better
  // than 256; 512 has too little parallelization potential.
 |   constexpr size_t kChunkSize = 64 / sizeof(float); | 
 |   const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize); | 
 |  | 
 |   const ScalableTag<float> d; | 
 |   const Repartition<hwy::bfloat16_t, decltype(d)> d16; | 
 |   // In the remainder loop, we only process a single f32 vector, so load half | 
 |   // vectors of bf16 to avoid overrun. | 
 |   const Half<decltype(d16)> d16h; | 
 |   using V = Vec<decltype(d)>; | 
 |   using V16 = Vec<decltype(d16)>; | 
 |   using V16H = Vec<decltype(d16h)>; | 
 |   const size_t N = Lanes(d); | 
  // Required for the Stream loop; otherwise we might write partial vectors.
 |   HWY_DASSERT(kChunkSize >= N); | 
 |   pool.Run(0, num_chunks, | 
 |            [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR { | 
 |              // MSVC workaround: duplicate to ensure constexpr. | 
 |              constexpr size_t kChunkSize = 64 / sizeof(float); | 
 |              // Software write-combining to avoid cache pollution from out. | 
 |              // Although `out` may be used later, keeping it out of the cache | 
 |              // now and avoiding RFOs is a consistent 5% overall win. | 
 |              HWY_ALIGN float buf[kChunkSize]; | 
 |  | 
 |              // Only handle entire chunks here because the Stream is not masked. | 
 |              // Remaining rows are handled after the pool.Run. | 
 |              const size_t begin = static_cast<size_t>(chunk * kChunkSize); | 
 |              for (size_t idx_row = 0; idx_row < kChunkSize; ++idx_row) { | 
 |                auto sum0 = Zero(d); | 
 |                auto sum1 = Zero(d); | 
 |                // 4x unrolling barely helps SKX but likely helps Arm V2. | 
 |                auto sum2 = Zero(d); | 
 |                auto sum3 = Zero(d); | 
 |  | 
 |                const hwy::bfloat16_t* HWY_RESTRICT row = | 
 |                    &mat[(begin + idx_row) * kInner]; | 
 |                size_t i = 0; | 
               // No clear win from prefetching the next 1..3 rows. clflush of
               // &row[i] is slow; clflushopt is less so, but does not help
               // either.
 |                HWY_UNROLL(1) | 
 |                for (; i + 4 * N <= kInner; i += 4 * N) { | 
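                 // Each d16 vector holds 2 * N bf16 lanes; promote its lower
                 // and upper halves into two f32 vectors of N lanes each.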
 |                  const V16 b0 = LoadU(d16, row + i + 0 * N); | 
 |                  const V a0 = PromoteLowerTo(d, b0); | 
 |                  const V a1 = PromoteUpperTo(d, b0); | 
 |  | 
 |                  const V16 b1 = LoadU(d16, row + i + 2 * N); | 
 |                  const V a2 = PromoteLowerTo(d, b1); | 
 |                  const V a3 = PromoteUpperTo(d, b1); | 
 |  | 
 |                  const V v0 = LoadU(d, vec + i + 0 * N); | 
 |                  sum0 = MulAdd(a0, v0, sum0); | 
 |  | 
 |                  const V v1 = LoadU(d, vec + i + 1 * N); | 
 |                  sum1 = MulAdd(a1, v1, sum1); | 
 |  | 
 |                  const V v2 = LoadU(d, vec + i + 2 * N); | 
 |                  sum2 = MulAdd(a2, v2, sum2); | 
 |  | 
 |                  const V v3 = LoadU(d, vec + i + 3 * N); | 
 |                  sum3 = MulAdd(a3, v3, sum3); | 
 |                } | 
 |                // Last entire vectors | 
 |                for (; i + N <= kInner; i += N) { | 
 |                  const V16H b0 = LoadU(d16h, row + i); | 
 |                  const V a0 = PromoteTo(d, b0); | 
 |                  const V v0 = LoadU(d, vec + i); | 
 |                  sum0 = MulAdd(a0, v0, sum0); | 
 |                } | 
 |                const size_t remainder = kInner - i; | 
 |                if (remainder != 0) { | 
 |                  const V16H b0 = LoadN(d16h, row + i, remainder); | 
 |                  const V a0 = PromoteTo(d, b0); | 
 |                  const V v0 = LoadN(d, vec + i, remainder); | 
 |                  sum1 = MulAdd(a0, v0, sum1); | 
 |                } | 
 |                // Reduction tree: sum of all accumulators, then their lanes | 
 |                sum2 = Add(sum2, sum3); | 
 |                sum0 = Add(sum0, sum1); | 
 |                sum0 = Add(sum0, sum2); | 
 |                buf[idx_row] = ReduceSum(d, sum0); | 
 |                HWY_IF_CONSTEXPR(kAdd) { | 
 |                  buf[idx_row] = AddScalar(buf[idx_row], add[begin + idx_row]); | 
 |                } | 
 |              }  // idx_row | 
 |              HWY_UNROLL(4)  // 1..4 iterations | 
 |              for (size_t i = 0; i != kChunkSize; i += N) { | 
 |                Stream(Load(d, buf + i), d, out + begin + i); | 
 |              } | 
 |            }); | 
 |   hwy::FlushStream(); | 
 |  | 
 |   // Handle remainder rows which are not a multiple of the chunk size. | 
 |   for (size_t r = num_chunks * kChunkSize; r < kOuter; ++r) { | 
 |     auto sum0 = Zero(d); | 
 |  | 
 |     const hwy::bfloat16_t* HWY_RESTRICT row = &mat[r * kInner]; | 
 |     size_t i = 0; | 
 |     HWY_UNROLL(1) | 
 |     for (; i + N <= kInner; i += N) { | 
 |       const V16H b0 = LoadU(d16h, row + i); | 
 |       const V a0 = PromoteTo(d, b0); | 
 |       const V v0 = LoadU(d, vec + i); | 
 |       sum0 = MulAdd(a0, v0, sum0); | 
 |     } | 
 |     const size_t remainder = kInner - i; | 
 |     if (remainder != 0) { | 
 |       const V16H b0 = LoadN(d16h, row + i, remainder); | 
 |       const V a0 = PromoteTo(d, b0); | 
 |       const V v0 = LoadN(d, vec + i, remainder); | 
 |       sum0 = MulAdd(a0, v0, sum0); | 
 |     } | 
 |     out[r] = ReduceSum(d, sum0); | 
 |     HWY_IF_CONSTEXPR(kAdd) { out[r] = AddScalar(out[r], add[r]); } | 
 |   }  // r | 
 | } | 
 |  | 
 | template <size_t kOuter, size_t kInner> | 
 | HWY_NOINLINE void MatVecAdd(const hwy::bfloat16_t* HWY_RESTRICT mat, | 
 |                             const float* HWY_RESTRICT vec, | 
 |                             const float* HWY_RESTRICT add, | 
 |                             float* HWY_RESTRICT out, hwy::ThreadPool& pool) { | 
 |   MatVecAddImpl<kOuter, kInner, true>(mat, vec, add, out, pool); | 
 | } | 
 |  | 
 | template <size_t kOuter, size_t kInner> | 
 | HWY_NOINLINE void MatVec(const hwy::bfloat16_t* HWY_RESTRICT mat, | 
 |                          const float* HWY_RESTRICT vec, float* HWY_RESTRICT out, | 
 |                          hwy::ThreadPool& pool) { | 
 |   MatVecAddImpl<kOuter, kInner, false>(mat, vec, /*add=*/nullptr, out, pool); | 
 | } | 
 |  | 
// Both mat and vec are bf16; products are widened and accumulated in f32.
 | template <size_t kOuter, size_t kInner, bool kAdd> | 
 | HWY_NOINLINE void MatVecAddImpl(const hwy::bfloat16_t* HWY_RESTRICT mat, | 
 |                                 const hwy::bfloat16_t* HWY_RESTRICT vec, | 
 |                                 const hwy::bfloat16_t* HWY_RESTRICT add, | 
 |                                 float* HWY_RESTRICT out, | 
 |                                 hwy::ThreadPool& pool) { | 
  // Process multiple rows at a time so that we write `out` in multiples of a
  // cache line, which avoids false sharing (>= 64 bytes). 128 bytes is better
  // than 256; 512 has too little parallelization potential.
 |   constexpr size_t kChunkSize = 64 / sizeof(bfloat16_t); | 
 |   const uint64_t num_chunks = static_cast<uint64_t>(kOuter / kChunkSize); | 
 |  | 
 |   const ScalableTag<float> df; | 
 |   const Repartition<hwy::bfloat16_t, decltype(df)> d16; | 
 |   using V16 = Vec<decltype(d16)>; | 
 |   const size_t N = Lanes(d16); | 
  // Required for the Stream loop; otherwise we might write partial vectors.
 |   HWY_DASSERT(kChunkSize >= N); | 
 |   pool.Run(0, num_chunks, | 
 |            [&](const uint64_t chunk, size_t /*thread*/) HWY_ATTR { | 
 |              // MSVC workaround: duplicate to ensure constexpr. | 
 |              constexpr size_t kChunkSize = 64 / sizeof(bfloat16_t); | 
 |              // Software write-combining to avoid cache pollution from out. | 
 |              // Although `out` may be used later, keeping it out of the cache | 
 |              // now and avoiding RFOs is a consistent 5% overall win. | 
 |              HWY_ALIGN float buf[kChunkSize]; | 
 |  | 
 |              // Only handle entire chunks here because the Stream is not masked. | 
 |              // Remaining rows are handled after the pool.Run. | 
 |              const size_t begin = static_cast<size_t>(chunk * kChunkSize); | 
 |              for (size_t idx_row = 0; idx_row < kChunkSize; ++idx_row) { | 
 |                auto sum0 = Zero(df); | 
 |                auto sum1 = Zero(df); | 
 |                auto sum2 = Zero(df); | 
 |                auto sum3 = Zero(df); | 
 |  | 
 |                const hwy::bfloat16_t* HWY_RESTRICT row = | 
 |                    &mat[(begin + idx_row) * kInner]; | 
 |                size_t i = 0; | 
               // No clear win from prefetching the next 1..3 rows. clflush of
               // &row[i] is slow; clflushopt is less so, but does not help
               // either.
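               // ReorderWidenMulAccumulate widens bf16 lanes to f32 and adds
               // the products into sum0 and (via its last parameter) sum1, in
               // an unspecified lane order; this is fine because only the
               // total matters, which ReduceSum below computes.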
 |                HWY_UNROLL(1) | 
 |                for (; i + 2 * N <= kInner; i += 2 * N) { | 
 |                  const V16 b0 = LoadU(d16, row + i + 0 * N); | 
 |                  const V16 b1 = LoadU(d16, row + i + 1 * N); | 
 |                  const V16 v0 = LoadU(d16, vec + i + 0 * N); | 
 |                  const V16 v1 = LoadU(d16, vec + i + 1 * N); | 
 |                  sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1); | 
 |                  sum2 = ReorderWidenMulAccumulate(df, b1, v1, sum2, sum3); | 
 |                } | 
 |                // Last entire vector | 
 |                for (; i + N <= kInner; i += N) { | 
 |                  const V16 b0 = LoadU(d16, row + i); | 
 |                  const V16 v0 = LoadU(d16, vec + i); | 
 |                  sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1); | 
 |                } | 
 |                const size_t remainder = kInner - i; | 
 |                if (remainder != 0) { | 
 |                  const V16 b0 = LoadN(d16, row + i, remainder); | 
 |                  const V16 v0 = LoadN(d16, vec + i, remainder); | 
 |                  sum2 = ReorderWidenMulAccumulate(df, b0, v0, sum2, sum3); | 
 |                } | 
 |                // Reduction tree: sum of all accumulators, then their lanes | 
 |                sum0 = Add(sum0, sum1); | 
 |                sum2 = Add(sum2, sum3); | 
 |                sum0 = Add(sum0, sum2); | 
 |                buf[idx_row] = ReduceSum(df, sum0); | 
 |                HWY_IF_CONSTEXPR(kAdd) { | 
 |                  buf[idx_row] = AddScalar(buf[idx_row], add[begin + idx_row]); | 
 |                } | 
 |              }  // idx_row | 
 |              HWY_UNROLL(4)  // 1..4 iterations | 
 |              for (size_t i = 0; i != kChunkSize; i += N / 2) { | 
 |                Stream(Load(df, buf + i), df, out + begin + i); | 
 |              } | 
 |            }); | 
 |   hwy::FlushStream(); | 
 |  | 
 |   // Handle remainder rows which are not a multiple of the chunk size. | 
 |   for (size_t r = num_chunks * kChunkSize; r < kOuter; ++r) { | 
 |     auto sum0 = Zero(df); | 
 |     auto sum1 = Zero(df); | 
 |  | 
 |     const hwy::bfloat16_t* HWY_RESTRICT row = &mat[r * kInner]; | 
 |     size_t i = 0; | 
 |     HWY_UNROLL(1) | 
 |     for (; i + N <= kInner; i += N) { | 
 |       const V16 b0 = LoadU(d16, row + i); | 
 |       const V16 v0 = LoadU(d16, vec + i); | 
 |       sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1); | 
 |     } | 
 |     const size_t remainder = kInner - i; | 
 |     if (remainder != 0) { | 
 |       const V16 b0 = LoadN(d16, row + i, remainder); | 
 |       const V16 v0 = LoadN(d16, vec + i, remainder); | 
 |       sum0 = ReorderWidenMulAccumulate(df, b0, v0, sum0, sum1); | 
 |     } | 
 |     out[r] = ReduceSum(df, Add(sum0, sum1)); | 
 |     HWY_IF_CONSTEXPR(kAdd) { out[r] = AddScalar(out[r], add[r]); } | 
 |   }  // r | 
 | } | 
 |  | 
 | template <size_t kOuter, size_t kInner> | 
 | HWY_NOINLINE void MatVecAdd(const hwy::bfloat16_t* HWY_RESTRICT mat, | 
 |                             const hwy::bfloat16_t* HWY_RESTRICT vec, | 
 |                             const hwy::bfloat16_t* HWY_RESTRICT add, | 
 |                             float* HWY_RESTRICT out, hwy::ThreadPool& pool) { | 
 |   MatVecAddImpl<kOuter, kInner, true>(mat, vec, add, out, pool); | 
 | } | 
 |  | 
 | template <size_t kOuter, size_t kInner> | 
 | HWY_NOINLINE void MatVec(const hwy::bfloat16_t* HWY_RESTRICT mat, | 
 |                          const hwy::bfloat16_t* HWY_RESTRICT vec, | 
 |                          float* HWY_RESTRICT out, hwy::ThreadPool& pool) { | 
 |   MatVecAddImpl<kOuter, kInner, false>(mat, vec, /*add=*/nullptr, out, pool); | 
 | } | 
 |  | 
 | #endif  // HWY_TARGET != HWY_SCALAR | 
 |  | 
 | // NOLINTNEXTLINE(google-readability-namespace-comments) | 
 | }  // namespace HWY_NAMESPACE | 
 | }  // namespace hwy | 
 | HWY_AFTER_NAMESPACE(); | 
 |  | 
 | #endif  // HIGHWAY_HWY_CONTRIB_MATVEC_MATVEC_INL_H_ |