| // Copyright 2020 Google LLC |
| // SPDX-License-Identifier: Apache-2.0 |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #ifndef HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_ |
| #define HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_ |
| |
| // SIMD/multicore-friendly planar image representation with row accessors. |
| |
| #include <string.h> |
| |
| #include <utility> // std::move |
| |
| #include "third_party/highway/hwy/aligned_allocator.h" |
| #include "third_party/highway/hwy/base.h" |
| |
| namespace hwy { |
| |
| // Type-independent parts of Image<> - reduces code duplication and facilitates |
| // moving member function implementations to cc file. |
| struct HWY_CONTRIB_DLLEXPORT ImageBase { |
| // Returns required alignment in bytes for externally allocated memory. |
| static size_t VectorSize(); |
| |
| // Returns distance [bytes] between the start of two consecutive rows, a |
| // multiple of VectorSize but NOT kAlias (see implementation). |
| static size_t BytesPerRow(size_t xsize, size_t sizeof_t); |
| |
| // No allocation (for output params or unused images) |
| ImageBase() |
| : xsize_(0), |
| ysize_(0), |
| bytes_per_row_(0), |
| bytes_(nullptr, AlignedFreer(&AlignedFreer::DoNothing, nullptr)) {} |
| |
| // Allocates memory (this is the common case) |
| ImageBase(size_t xsize, size_t ysize, size_t sizeof_t); |
| |
| // References but does not take ownership of external memory. Useful for |
| // interoperability with other libraries. `aligned` must be aligned to a |
| // multiple of VectorSize() and `bytes_per_row` must also be a multiple of |
| // VectorSize() or preferably equal to BytesPerRow(). |
| ImageBase(size_t xsize, size_t ysize, size_t bytes_per_row, void* aligned); |
| |
| // Copy construction/assignment is forbidden to avoid inadvertent copies, |
| // which can be very expensive. Use CopyImageTo() instead. |
| ImageBase(const ImageBase& other) = delete; |
| ImageBase& operator=(const ImageBase& other) = delete; |
| |
| // Move constructor (required for returning Image from function) |
| ImageBase(ImageBase&& other) noexcept = default; |
| |
| // Move assignment (required for std::vector) |
| ImageBase& operator=(ImageBase&& other) noexcept = default; |
| |
| void Swap(ImageBase& other); |
| |
| // Useful for pre-allocating image with some padding for alignment purposes |
| // and later reporting the actual valid dimensions. Caller is responsible |
| // for ensuring xsize/ysize are <= the original dimensions. |
| void ShrinkTo(const size_t xsize, const size_t ysize) { |
| xsize_ = static_cast<uint32_t>(xsize); |
| ysize_ = static_cast<uint32_t>(ysize); |
| // NOTE: we can't recompute bytes_per_row for more compact storage and |
| // better locality because that would invalidate the image contents. |
| } |
| |
| // How many pixels. |
| HWY_INLINE size_t xsize() const { return xsize_; } |
| HWY_INLINE size_t ysize() const { return ysize_; } |
| |
| // NOTE: do not use this for copying rows - the valid xsize may be much less. |
| HWY_INLINE size_t bytes_per_row() const { return bytes_per_row_; } |
| |
| // Raw access to byte contents, for interfacing with other libraries. |
| // Unsigned char instead of char to avoid surprises (sign extension). |
| HWY_INLINE uint8_t* bytes() { |
| void* p = bytes_.get(); |
| return static_cast<uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64)); |
| } |
| HWY_INLINE const uint8_t* bytes() const { |
| const void* p = bytes_.get(); |
| return static_cast<const uint8_t * HWY_RESTRICT>(HWY_ASSUME_ALIGNED(p, 64)); |
| } |
| |
| protected: |
| // Returns pointer to the start of a row. |
| HWY_INLINE void* VoidRow(const size_t y) const { |
| #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN |
| if (y >= ysize_) { |
| HWY_ABORT("Row(%d) >= %u\n", static_cast<int>(y), ysize_); |
| } |
| #endif |
| |
| void* row = bytes_.get() + y * bytes_per_row_; |
| return HWY_ASSUME_ALIGNED(row, 64); |
| } |
| |
| enum class Padding { |
| // Allow Load(d, row + x) for x = 0; x < xsize(); x += Lanes(d). Default. |
| kRoundUp, |
| // Allow LoadU(d, row + x) for x <= xsize() - 1. This requires an extra |
| // vector to be initialized. If done by default, this would suppress |
| // legitimate msan warnings. We therefore require users to explicitly call |
| // InitializePadding before using unaligned loads (e.g. convolution). |
| kUnaligned |
| }; |
| |
| // Initializes the minimum bytes required to suppress msan warnings from |
| // legitimate (according to Padding mode) vector loads/stores on the right |
| // border, where some lanes are uninitialized and assumed to be unused. |
| void InitializePadding(size_t sizeof_t, Padding padding); |
| |
| // (Members are non-const to enable assignment during move-assignment.) |
| uint32_t xsize_; // In valid pixels, not including any padding. |
| uint32_t ysize_; |
| size_t bytes_per_row_; // Includes padding. |
| AlignedFreeUniquePtr<uint8_t[]> bytes_; |
| }; |
| |
| // Single channel, aligned rows separated by padding. T must be POD. |
| // |
| // 'Single channel' (one 2D array per channel) simplifies vectorization |
| // (repeating the same operation on multiple adjacent components) without the |
| // complexity of a hybrid layout (8 R, 8 G, 8 B, ...). In particular, clients |
| // can easily iterate over all components in a row and Image requires no |
| // knowledge of the pixel format beyond the component type "T". |
| // |
| // 'Aligned' means each row is aligned to the L1 cache line size. This prevents |
| // false sharing between two threads operating on adjacent rows. |
| // |
| // 'Padding' is still relevant because vectors could potentially be larger than |
| // a cache line. By rounding up row sizes to the vector size, we allow |
| // reading/writing ALIGNED vectors whose first lane is a valid sample. This |
| // avoids needing a separate loop to handle remaining unaligned lanes. |
| // |
| // This image layout could also be achieved with a vector and a row accessor |
| // function, but a class wrapper with support for "deleter" allows wrapping |
| // existing memory allocated by clients without copying the pixels. It also |
| // provides convenient accessors for xsize/ysize, which shortens function |
| // argument lists. Supports move-construction so it can be stored in containers. |
| template <typename ComponentType> |
| class Image : public ImageBase { |
| public: |
| using T = ComponentType; |
| |
| Image() = default; |
| Image(const size_t xsize, const size_t ysize) |
| : ImageBase(xsize, ysize, sizeof(T)) {} |
| Image(const size_t xsize, const size_t ysize, size_t bytes_per_row, |
| void* aligned) |
| : ImageBase(xsize, ysize, bytes_per_row, aligned) {} |
| |
| void InitializePaddingForUnalignedAccesses() { |
| InitializePadding(sizeof(T), Padding::kUnaligned); |
| } |
| |
| HWY_INLINE const T* ConstRow(const size_t y) const { |
| return static_cast<const T*>(VoidRow(y)); |
| } |
| HWY_INLINE const T* ConstRow(const size_t y) { |
| return static_cast<const T*>(VoidRow(y)); |
| } |
| |
| // Returns pointer to non-const. This allows passing const Image* parameters |
| // when the callee is only supposed to fill the pixels, as opposed to |
| // allocating or resizing the image. |
| HWY_INLINE T* MutableRow(const size_t y) const { |
| return static_cast<T*>(VoidRow(y)); |
| } |
| HWY_INLINE T* MutableRow(const size_t y) { |
| return static_cast<T*>(VoidRow(y)); |
| } |
| |
| // Returns number of pixels (some of which are padding) per row. Useful for |
| // computing other rows via pointer arithmetic. WARNING: this must |
| // NOT be used to determine xsize. |
| HWY_INLINE intptr_t PixelsPerRow() const { |
| return static_cast<intptr_t>(bytes_per_row_ / sizeof(T)); |
| } |
| }; |
| |
// Shorthand for the most common instantiation: single-channel float samples.
using ImageF = Image<float>;
| |
| // A bundle of 3 same-sized images. To fill an existing Image3 using |
| // single-channel producers, we also need access to each const Image*. Const |
| // prevents breaking the same-size invariant, while still allowing pixels to be |
| // changed via MutableRow. |
| template <typename ComponentType> |
| class Image3 { |
| public: |
| using T = ComponentType; |
| using ImageT = Image<T>; |
| static constexpr size_t kNumPlanes = 3; |
| |
| Image3() : planes_{ImageT(), ImageT(), ImageT()} {} |
| |
| Image3(const size_t xsize, const size_t ysize) |
| : planes_{ImageT(xsize, ysize), ImageT(xsize, ysize), |
| ImageT(xsize, ysize)} {} |
| |
| Image3(Image3&& other) noexcept { |
| for (size_t i = 0; i < kNumPlanes; i++) { |
| planes_[i] = std::move(other.planes_[i]); |
| } |
| } |
| |
| Image3(ImageT&& plane0, ImageT&& plane1, ImageT&& plane2) { |
| if (!SameSize(plane0, plane1) || !SameSize(plane0, plane2)) { |
| HWY_ABORT( |
| "Not same size: %d x %d, %d x %d, %d x %d\n", |
| static_cast<int>(plane0.xsize()), static_cast<int>(plane0.ysize()), |
| static_cast<int>(plane1.xsize()), static_cast<int>(plane1.ysize()), |
| static_cast<int>(plane2.xsize()), static_cast<int>(plane2.ysize())); |
| } |
| planes_[0] = std::move(plane0); |
| planes_[1] = std::move(plane1); |
| planes_[2] = std::move(plane2); |
| } |
| |
| // Copy construction/assignment is forbidden to avoid inadvertent copies, |
| // which can be very expensive. Use CopyImageTo instead. |
| Image3(const Image3& other) = delete; |
| Image3& operator=(const Image3& other) = delete; |
| |
| Image3& operator=(Image3&& other) noexcept { |
| for (size_t i = 0; i < kNumPlanes; i++) { |
| planes_[i] = std::move(other.planes_[i]); |
| } |
| return *this; |
| } |
| |
| HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) const { |
| return static_cast<const T*>(VoidPlaneRow(c, y)); |
| } |
| HWY_INLINE const T* ConstPlaneRow(const size_t c, const size_t y) { |
| return static_cast<const T*>(VoidPlaneRow(c, y)); |
| } |
| |
| HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) const { |
| return static_cast<T*>(VoidPlaneRow(c, y)); |
| } |
| HWY_INLINE T* MutablePlaneRow(const size_t c, const size_t y) { |
| return static_cast<T*>(VoidPlaneRow(c, y)); |
| } |
| |
| HWY_INLINE const ImageT& Plane(size_t idx) const { return planes_[idx]; } |
| |
| void Swap(Image3& other) { |
| for (size_t c = 0; c < 3; ++c) { |
| other.planes_[c].Swap(planes_[c]); |
| } |
| } |
| |
| void ShrinkTo(const size_t xsize, const size_t ysize) { |
| for (ImageT& plane : planes_) { |
| plane.ShrinkTo(xsize, ysize); |
| } |
| } |
| |
| // Sizes of all three images are guaranteed to be equal. |
| HWY_INLINE size_t xsize() const { return planes_[0].xsize(); } |
| HWY_INLINE size_t ysize() const { return planes_[0].ysize(); } |
| // Returns offset [bytes] from one row to the next row of the same plane. |
| // WARNING: this must NOT be used to determine xsize, nor for copying rows - |
| // the valid xsize may be much less. |
| HWY_INLINE size_t bytes_per_row() const { return planes_[0].bytes_per_row(); } |
| // Returns number of pixels (some of which are padding) per row. Useful for |
| // computing other rows via pointer arithmetic. WARNING: this must NOT be used |
| // to determine xsize. |
| HWY_INLINE intptr_t PixelsPerRow() const { return planes_[0].PixelsPerRow(); } |
| |
| private: |
| // Returns pointer to the start of a row. |
| HWY_INLINE void* VoidPlaneRow(const size_t c, const size_t y) const { |
| #if HWY_IS_ASAN || HWY_IS_MSAN || HWY_IS_TSAN |
| if (c >= kNumPlanes || y >= ysize()) { |
| HWY_ABORT("PlaneRow(%d, %d) >= %d\n", static_cast<int>(c), |
| static_cast<int>(y), static_cast<int>(ysize())); |
| } |
| #endif |
| // Use the first plane's stride because the compiler might not realize they |
| // are all equal. Thus we only need a single multiplication for all planes. |
| const size_t row_offset = y * planes_[0].bytes_per_row(); |
| const void* row = planes_[c].bytes() + row_offset; |
| return static_cast<const T * HWY_RESTRICT>( |
| HWY_ASSUME_ALIGNED(row, HWY_ALIGNMENT)); |
| } |
| |
| private: |
| ImageT planes_[kNumPlanes]; |
| }; |
| |
// Shorthand for a bundle of three same-sized float planes (see Image3).
using Image3F = Image3<float>;
| |
| // Rectangular region in image(s). Factoring this out of Image instead of |
| // shifting the pointer by x0/y0 allows this to apply to multiple images with |
| // different resolutions. Can compare size via SameSize(rect1, rect2). |
| class Rect { |
| public: |
| // Most windows are xsize_max * ysize_max, except those on the borders where |
| // begin + size_max > end. |
| constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize_max, |
| size_t ysize_max, size_t xend, size_t yend) |
| : x0_(xbegin), |
| y0_(ybegin), |
| xsize_(ClampedSize(xbegin, xsize_max, xend)), |
| ysize_(ClampedSize(ybegin, ysize_max, yend)) {} |
| |
| // Construct with origin and known size (typically from another Rect). |
| constexpr Rect(size_t xbegin, size_t ybegin, size_t xsize, size_t ysize) |
| : x0_(xbegin), y0_(ybegin), xsize_(xsize), ysize_(ysize) {} |
| |
| // Construct a rect that covers a whole image. |
| template <typename Image> |
| explicit Rect(const Image& image) |
| : Rect(0, 0, image.xsize(), image.ysize()) {} |
| |
| Rect() : Rect(0, 0, 0, 0) {} |
| |
| Rect(const Rect&) = default; |
| Rect& operator=(const Rect&) = default; |
| |
| Rect Subrect(size_t xbegin, size_t ybegin, size_t xsize_max, |
| size_t ysize_max) { |
| return Rect(x0_ + xbegin, y0_ + ybegin, xsize_max, ysize_max, x0_ + xsize_, |
| y0_ + ysize_); |
| } |
| |
| template <typename T> |
| const T* ConstRow(const Image<T>* image, size_t y) const { |
| return image->ConstRow(y + y0_) + x0_; |
| } |
| |
| template <typename T> |
| T* MutableRow(const Image<T>* image, size_t y) const { |
| return image->MutableRow(y + y0_) + x0_; |
| } |
| |
| template <typename T> |
| const T* ConstPlaneRow(const Image3<T>& image, size_t c, size_t y) const { |
| return image.ConstPlaneRow(c, y + y0_) + x0_; |
| } |
| |
| template <typename T> |
| T* MutablePlaneRow(Image3<T>* image, const size_t c, size_t y) const { |
| return image->MutablePlaneRow(c, y + y0_) + x0_; |
| } |
| |
| // Returns true if this Rect fully resides in the given image. ImageT could be |
| // Image<T> or Image3<T>; however if ImageT is Rect, results are nonsensical. |
| template <class ImageT> |
| bool IsInside(const ImageT& image) const { |
| return (x0_ + xsize_ <= image.xsize()) && (y0_ + ysize_ <= image.ysize()); |
| } |
| |
| size_t x0() const { return x0_; } |
| size_t y0() const { return y0_; } |
| size_t xsize() const { return xsize_; } |
| size_t ysize() const { return ysize_; } |
| |
| private: |
| // Returns size_max, or whatever is left in [begin, end). |
| static constexpr size_t ClampedSize(size_t begin, size_t size_max, |
| size_t end) { |
| return (begin + size_max <= end) ? size_max |
| : (end > begin ? end - begin : 0); |
| } |
| |
| size_t x0_; |
| size_t y0_; |
| |
| size_t xsize_; |
| size_t ysize_; |
| }; |
| |
| // Works for any image-like input type(s). |
| template <class Image1, class Image2> |
| HWY_MAYBE_UNUSED bool SameSize(const Image1& image1, const Image2& image2) { |
| return image1.xsize() == image2.xsize() && image1.ysize() == image2.ysize(); |
| } |
| |
| // Mirrors out of bounds coordinates and returns valid coordinates unchanged. |
| // We assume the radius (distance outside the image) is small compared to the |
| // image size, otherwise this might not terminate. |
| // The mirror is outside the last column (border pixel is also replicated). |
| static HWY_INLINE HWY_MAYBE_UNUSED size_t Mirror(int64_t x, |
| const int64_t xsize) { |
| HWY_DASSERT(xsize != 0); |
| |
| // TODO(janwas): replace with branchless version |
| while (x < 0 || x >= xsize) { |
| if (x < 0) { |
| x = -x - 1; |
| } else { |
| x = 2 * xsize - 1 - x; |
| } |
| } |
| return static_cast<size_t>(x); |
| } |
| |
| // Wrap modes for ensuring X/Y coordinates are in the valid range [0, size): |
| |
| // Mirrors (repeating the edge pixel once). Useful for convolutions. |
| struct WrapMirror { |
| HWY_INLINE size_t operator()(const int64_t coord, const size_t size) const { |
| return Mirror(coord, static_cast<int64_t>(size)); |
| } |
| }; |
| |
| // Returns the same coordinate, for when we know "coord" is already valid (e.g. |
| // interior of an image). |
| struct WrapUnchanged { |
| HWY_INLINE size_t operator()(const int64_t coord, size_t /*size*/) const { |
| return static_cast<size_t>(coord); |
| } |
| }; |
| |
| // Similar to Wrap* but for row pointers (reduces Row() multiplications). |
| |
| class WrapRowMirror { |
| public: |
| template <class View> |
| WrapRowMirror(const View& image, size_t ysize) |
| : first_row_(image.ConstRow(0)), last_row_(image.ConstRow(ysize - 1)) {} |
| |
| const float* operator()(const float* const HWY_RESTRICT row, |
| const int64_t stride) const { |
| if (row < first_row_) { |
| const int64_t num_before = first_row_ - row; |
| // Mirrored; one row before => row 0, two before = row 1, ... |
| return first_row_ + num_before - stride; |
| } |
| if (row > last_row_) { |
| const int64_t num_after = row - last_row_; |
| // Mirrored; one row after => last row, two after = last - 1, ... |
| return last_row_ - num_after + stride; |
| } |
| return row; |
| } |
| |
| private: |
| const float* const HWY_RESTRICT first_row_; |
| const float* const HWY_RESTRICT last_row_; |
| }; |
| |
| struct WrapRowUnchanged { |
| HWY_INLINE const float* operator()(const float* const HWY_RESTRICT row, |
| int64_t /*stride*/) const { |
| return row; |
| } |
| }; |
| |
| } // namespace hwy |
| |
| #endif // HIGHWAY_HWY_CONTRIB_IMAGE_IMAGE_H_ |