| // Copyright 2022 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| // Per-target include guard |
| #if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \ |
| defined(HWY_TARGET_TOGGLE) // NOLINT |
| #ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ |
| #undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ |
| #else |
| #define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ |
| #endif |
| |
| #include <stddef.h> |
| #include <stdint.h> |
| |
| #include "third_party/highway/hwy/highway.h" |
| |
| HWY_BEFORE_NAMESPACE(); |
| namespace hwy { |
| namespace HWY_NAMESPACE { |
| |
| // These functions avoid having to write a loop plus remainder handling in the |
| // (unfortunately still common) case where arrays are not aligned/padded. If the |
| // inputs are known to be aligned/padded, it is more efficient to write a single |
| // loop using Load(). We do not provide a CopyAlignedPadded because it |
| // would be more verbose than such a loop. |
| |
| // Fills `to`[0, `count`) with `value`. |
| template <class D, typename T = TFromD<D>> |
| void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) { |
| const size_t N = Lanes(d); |
| const Vec<D> v = Set(d, value); |
| |
| size_t idx = 0; |
| if (count >= N) { |
| for (; idx <= count - N; idx += N) { |
| StoreU(v, d, to + idx); |
| } |
| } |
| |
| // `count` was a multiple of the vector length `N`: already done. |
| if (HWY_UNLIKELY(idx == count)) return; |
| |
| const size_t remaining = count - idx; |
| HWY_DASSERT(0 != remaining && remaining < N); |
| SafeFillN(remaining, value, d, to + idx); |
| } |
| |
| // Copies `from`[0, `count`) to `to`, which must not overlap `from`. |
| template <class D, typename T = TFromD<D>> |
| void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) { |
| const size_t N = Lanes(d); |
| |
| size_t idx = 0; |
| if (count >= N) { |
| for (; idx <= count - N; idx += N) { |
| const Vec<D> v = LoadU(d, from + idx); |
| StoreU(v, d, to + idx); |
| } |
| } |
| |
| // `count` was a multiple of the vector length `N`: already done. |
| if (HWY_UNLIKELY(idx == count)) return; |
| |
| const size_t remaining = count - idx; |
| HWY_DASSERT(0 != remaining && remaining < N); |
| SafeCopyN(remaining, d, from + idx, to + idx); |
| } |
| |
| // For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the |
| // corresponding mask element of `func(d, v)` is true. Returns the STL-style end |
| // of the newly written elements in `to`. |
| // |
| // `func` is either a functor with a templated operator()(d, v) returning a |
| // mask, or a generic lambda if using C++14. Due to apparent limitations of |
| // Clang on Windows, it is currently necessary to add HWY_ATTR before the |
| // opening { of the lambda to avoid errors about "function .. requires target". |
| // |
| // NOTE: this is only supported for 16-, 32- or 64-bit types. |
| // NOTE: Func may be called a second time for elements it has already seen, but |
| // these elements will not be written to `to` again. |
| template <class D, class Func, typename T = TFromD<D>> |
| T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to, |
| const Func& func) { |
| const size_t N = Lanes(d); |
| |
| size_t idx = 0; |
| if (count >= N) { |
| for (; idx <= count - N; idx += N) { |
| const Vec<D> v = LoadU(d, from + idx); |
| to += CompressBlendedStore(v, func(d, v), d, to); |
| } |
| } |
| |
| // `count` was a multiple of the vector length `N`: already done. |
| if (HWY_UNLIKELY(idx == count)) return to; |
| |
| #if HWY_MEM_OPS_MIGHT_FAULT |
| // Proceed one by one. |
| const CappedTag<T, 1> d1; |
| for (; idx < count; ++idx) { |
| using V1 = Vec<decltype(d1)>; |
| // Workaround for -Waggressive-loop-optimizations on GCC 8 |
| // (iteration 2305843009213693951 invokes undefined behavior for T=i64) |
| const uintptr_t addr = reinterpret_cast<uintptr_t>(from); |
| const T* HWY_RESTRICT from_idx = |
| reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T))); |
| const V1 v = LoadU(d1, from_idx); |
| // Avoid storing to `to` unless we know it should be kept - otherwise, we |
| // might overrun the end if it was allocated for the exact count. |
| if (CountTrue(d1, func(d1, v)) == 0) continue; |
| StoreU(v, d1, to); |
| to += 1; |
| } |
| #else |
| // Start index of the last unaligned whole vector, ending at the array end. |
| const size_t last = count - N; |
| // Number of elements before `from` or already written. |
| const size_t invalid = idx - last; |
| HWY_DASSERT(0 != invalid && invalid < N); |
| const Mask<D> mask = Not(FirstN(d, invalid)); |
| const Vec<D> v = MaskedLoad(mask, d, from + last); |
| to += CompressBlendedStore(v, And(mask, func(d, v)), d, to); |
| #endif |
| return to; |
| } |
| |
| // NOLINTNEXTLINE(google-readability-namespace-comments) |
| } // namespace HWY_NAMESPACE |
| } // namespace hwy |
| HWY_AFTER_NAMESPACE(); |
| |
| #endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_ |