av1/common/arm/resize_neon_dotprod.c - aom - Git at Google

 /*
  * Copyright (c) 2024, Alliance for Open Media. All rights reserved.
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
  * was not distributed with this source code in the LICENSE file, you can
  * obtain it at www.aomedia.org/license/software. If the Alliance for Open
  * Media Patent License 1.0 was not distributed with this source code in the
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */

 #include <arm_neon.h>
 #include <assert.h>

 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"
 #include "av1/common/arm/resize_neon.h"
 #include "av1/common/resize.h"
 #include "config/aom_scale_rtcd.h"
 #include "config/av1_rtcd.h"

 // clang-format off
 DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
   0, 1, 2, 3, 2, 3, 4, 5, 4, 5,  6,  7,  6,  7,  8,  9,
   4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
 };
 DECLARE_ALIGNED(16, static const uint8_t, kScale4DotProdPermuteTbl[16]) = {
   0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11
 };
 // clang-format on

 static inline uint8x8_t scale_2_to_1_filter8_8(const uint8x16_t s0,
                                                const uint8x16_t s1,
                                                const uint8x16x2_t permute_tbl,
                                                const int8x8_t filter) {
   // Transform sample range to [-128, 127] for 8-bit signed dot product.
   int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128)));
   int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128)));

   // Permute samples ready for dot product.
   int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl.val[0]),
                                 vqtbl1q_s8(s0_128, permute_tbl.val[1]),
                                 vqtbl1q_s8(s1_128, permute_tbl.val[0]),
                                 vqtbl1q_s8(s1_128, permute_tbl.val[1]) };

   // Dot product constant:
   // The shim of 128 << FILTER_BITS is needed because we are subtracting 128
   // from every source value. The additional right shift by one is needed
   // because we halve the filter values.
   const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 1);

   // First 4 output values.
   int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0);
   sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1);
   // Second 4 output values.
   int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[2], filter, 0);
   sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filter, 1);

   int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));

   // We halved the filter values so -1 from right shift.
   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }

 static inline void scale_2_to_1_horiz_8tap(const uint8_t *src,
                                            const int src_stride, int w, int h,
                                            uint8_t *dst, const int dst_stride,
                                            const int16x8_t filters) {
   const int8x8_t filter = vmovn_s16(filters);
   const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl);

   do {
     const uint8_t *s = src;
     uint8_t *d = dst;
     int width = w;
     do {
       uint8x16_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
       load_u8_16x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
                    &s5[0], &s6[0], &s7[0]);
       load_u8_16x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
                    &s5[1], &s6[1], &s7[1]);

       uint8x8_t d0 = scale_2_to_1_filter8_8(s0[0], s0[1], permute_tbl, filter);
       uint8x8_t d1 = scale_2_to_1_filter8_8(s1[0], s1[1], permute_tbl, filter);
       uint8x8_t d2 = scale_2_to_1_filter8_8(s2[0], s2[1], permute_tbl, filter);
       uint8x8_t d3 = scale_2_to_1_filter8_8(s3[0], s3[1], permute_tbl, filter);

       uint8x8_t d4 = scale_2_to_1_filter8_8(s4[0], s4[1], permute_tbl, filter);
       uint8x8_t d5 = scale_2_to_1_filter8_8(s5[0], s5[1], permute_tbl, filter);
       uint8x8_t d6 = scale_2_to_1_filter8_8(s6[0], s6[1], permute_tbl, filter);
       uint8x8_t d7 = scale_2_to_1_filter8_8(s7[0], s7[1], permute_tbl, filter);

       store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

       d += 8;
       s += 16;
       width -= 8;
     } while (width > 0);

     dst += 8 * dst_stride;
     src += 8 * src_stride;
     h -= 8;
   } while (h > 0);
 }

 static inline void scale_plane_2_to_1_8tap(const uint8_t *src,
                                            const int src_stride, uint8_t *dst,
                                            const int dst_stride, const int w,
                                            const int h,
                                            const int16_t *const filter_ptr,
                                            uint8_t *const im_block) {
   assert(w > 0 && h > 0);

   const int im_h = 2 * h + SUBPEL_TAPS - 3;
   const int im_stride = (w + 7) & ~7;
   // All filter values are even, halve them to fit in int8_t when applying
   // horizontal filter and stay in 16-bit elements when applying vertical
   // filter.
   const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1);

   const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
   const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;

   scale_2_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h,
                           im_block, im_stride, filters);

   // We can specialise the vertical filtering for 6-tap filters given that the
   // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded.
   scale_2_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride,
                          filters);
 }

 static inline uint8x8_t scale_4_to_1_filter8_8(
     const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2,
     const uint8x16_t s3, const uint8x16_t permute_tbl, const int8x8_t filter) {
   int8x16_t filters = vcombine_s8(filter, filter);

   // Transform sample range to [-128, 127] for 8-bit signed dot product.
   int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128)));
   int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128)));
   int8x16_t s2_128 = vreinterpretq_s8_u8(vsubq_u8(s2, vdupq_n_u8(128)));
   int8x16_t s3_128 = vreinterpretq_s8_u8(vsubq_u8(s3, vdupq_n_u8(128)));

   int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl),
                                 vqtbl1q_s8(s1_128, permute_tbl),
                                 vqtbl1q_s8(s2_128, permute_tbl),
                                 vqtbl1q_s8(s3_128, permute_tbl) };

   // Dot product constant:
   // The shim of 128 << FILTER_BITS is needed because we are subtracting 128
   // from every source value. The additional right shift by one is needed
   // because we halved the filter values and will use a pairwise add.
   const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 2);

   int32x4_t sum0 = vdotq_s32(acc, perm_samples[0], filters);
   int32x4_t sum1 = vdotq_s32(acc, perm_samples[1], filters);
   int32x4_t sum2 = vdotq_s32(acc, perm_samples[2], filters);
   int32x4_t sum3 = vdotq_s32(acc, perm_samples[3], filters);

   int32x4_t sum01 = vpaddq_s32(sum0, sum1);
   int32x4_t sum23 = vpaddq_s32(sum2, sum3);

   int16x8_t sum = vcombine_s16(vmovn_s32(sum01), vmovn_s32(sum23));

   // We halved the filter values so -1 from right shift.
   return vqrshrun_n_s16(sum, FILTER_BITS - 1);
 }

 static inline void scale_4_to_1_horiz_8tap(const uint8_t *src,
                                            const int src_stride, int w, int h,
                                            uint8_t *dst, const int dst_stride,
                                            const int16x8_t filters) {
   const int8x8_t filter = vmovn_s16(filters);
   const uint8x16_t permute_tbl = vld1q_u8(kScale4DotProdPermuteTbl);

   do {
     const uint8_t *s = src;
     uint8_t *d = dst;
     int width = w;

     do {
       uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
       load_u8_16x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

       uint8x8_t d0 =
           scale_4_to_1_filter8_8(s0, s1, s2, s3, permute_tbl, filter);
       uint8x8_t d1 =
           scale_4_to_1_filter8_8(s4, s5, s6, s7, permute_tbl, filter);

       store_u8x2_strided_x4(d + 0 * dst_stride, dst_stride, d0);
       store_u8x2_strided_x4(d + 4 * dst_stride, dst_stride, d1);

       d += 2;
       s += 8;
       width -= 2;
     } while (width > 0);

     dst += 8 * dst_stride;
     src += 8 * src_stride;
     h -= 8;
   } while (h > 0);
 }

 static inline void scale_plane_4_to_1_8tap(const uint8_t *src,
                                            const int src_stride, uint8_t *dst,
                                            const int dst_stride, const int w,
                                            const int h,
                                            const int16_t *const filter_ptr,
                                            uint8_t *const im_block) {
   assert(w > 0 && h > 0);
   const int im_h = 4 * h + SUBPEL_TAPS - 2;
   const int im_stride = (w + 1) & ~1;
   // All filter values are even, halve them to fit in int8_t when applying
   // horizontal filter and stay in 16-bit elements when applying vertical
   // filter.
   const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1);

   const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
   const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;

   scale_4_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h,
                           im_block, im_stride, filters);

   // We can specialise the vertical filtering for 6-tap filters given that the
   // EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded.
   scale_4_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride,
                          filters);
 }

 static inline bool has_normative_scaler_neon_dotprod(const int src_width,
                                                      const int src_height,
                                                      const int dst_width,
                                                      const int dst_height) {
   return (2 * dst_width == src_width && 2 * dst_height == src_height) ||
          (4 * dst_width == src_width && 4 * dst_height == src_height);
 }

 void av1_resize_and_extend_frame_neon_dotprod(const YV12_BUFFER_CONFIG *src,
                                               YV12_BUFFER_CONFIG *dst,
                                               const InterpFilter filter,
                                               const int phase,
                                               const int num_planes) {
   assert(filter == BILINEAR || filter == EIGHTTAP_SMOOTH ||
          filter == EIGHTTAP_REGULAR);

   bool has_normative_scaler =
       has_normative_scaler_neon_dotprod(src->y_crop_width, src->y_crop_height,
                                         dst->y_crop_width, dst->y_crop_height);

   if (num_planes > 1) {
     has_normative_scaler =
         has_normative_scaler && has_normative_scaler_neon_dotprod(
                                     src->uv_crop_width, src->uv_crop_height,
                                     dst->uv_crop_width, dst->uv_crop_height);
   }

   if (!has_normative_scaler || filter == BILINEAR || phase == 0) {
     av1_resize_and_extend_frame_neon(src, dst, filter, phase, num_planes);
     return;
   }

   // We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
   // the static analysis warnings.
   int malloc_failed = 0;
   for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
     const int is_uv = i > 0;
     const int src_w = src->crop_widths[is_uv];
     const int src_h = src->crop_heights[is_uv];
     const int dst_w = dst->crop_widths[is_uv];
     const int dst_h = dst->crop_heights[is_uv];
     const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
     const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;

     if (2 * dst_w == src_w && 2 * dst_h == src_h) {
       const int buffer_stride = (dst_y_w + 7) & ~7;
       const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
       uint8_t *const temp_buffer =
           (uint8_t *)malloc(buffer_stride * buffer_height);
       if (!temp_buffer) {
         malloc_failed = 1;
         break;
       }
       const InterpKernel *interp_kernel =
           (const InterpKernel *)av1_interp_filter_params_list[filter]
               .filter_ptr;
       scale_plane_2_to_1_8tap(src->buffers[i], src->strides[is_uv],
                               dst->buffers[i], dst->strides[is_uv], dst_w,
                               dst_h, interp_kernel[phase], temp_buffer);
       free(temp_buffer);
     } else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
       const int buffer_stride = (dst_y_w + 1) & ~1;
       const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
       uint8_t *const temp_buffer =
           (uint8_t *)malloc(buffer_stride * buffer_height);
       if (!temp_buffer) {
         malloc_failed = 1;
         break;
       }
       const InterpKernel *interp_kernel =
           (const InterpKernel *)av1_interp_filter_params_list[filter]
               .filter_ptr;
       scale_plane_4_to_1_8tap(src->buffers[i], src->strides[is_uv],
                               dst->buffers[i], dst->strides[is_uv], dst_w,
                               dst_h, interp_kernel[phase], temp_buffer);
       free(temp_buffer);
     }
   }

   if (malloc_failed) {
     av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
   } else {
     aom_extend_frame_borders(dst, num_planes);
   }
 }
	/*
	* Copyright (c) 2024, Alliance for Open Media. All rights reserved.
	*
	* This source code is subject to the terms of the BSD 2 Clause License and
	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
	* was not distributed with this source code in the LICENSE file, you can
	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
	* Media Patent License 1.0 was not distributed with this source code in the
	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
	*/

	#include <arm_neon.h>
	#include <assert.h>

	#include "aom_dsp/arm/mem_neon.h"
	#include "aom_dsp/arm/transpose_neon.h"
	#include "av1/common/arm/resize_neon.h"
	#include "av1/common/resize.h"
	#include "config/aom_scale_rtcd.h"
	#include "config/av1_rtcd.h"

	// clang-format off
	DECLARE_ALIGNED(16, static const uint8_t, kScale2DotProdPermuteTbl[32]) = {
	0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9,
	4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13
	};
	DECLARE_ALIGNED(16, static const uint8_t, kScale4DotProdPermuteTbl[16]) = {
	0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 8, 9, 10, 11
	};
	// clang-format on

	static inline uint8x8_t scale_2_to_1_filter8_8(const uint8x16_t s0,
	const uint8x16_t s1,
	const uint8x16x2_t permute_tbl,
	const int8x8_t filter) {
	// Transform sample range to [-128, 127] for 8-bit signed dot product.
	int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128)));
	int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128)));

	// Permute samples ready for dot product.
	int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl.val[0]),
	vqtbl1q_s8(s0_128, permute_tbl.val[1]),
	vqtbl1q_s8(s1_128, permute_tbl.val[0]),
	vqtbl1q_s8(s1_128, permute_tbl.val[1]) };

	// Dot product constant:
	// The shim of 128 << FILTER_BITS is needed because we are subtracting 128
	// from every source value. The additional right shift by one is needed
	// because we halve the filter values.
	const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 1);

	// First 4 output values.
	int32x4_t sum0123 = vdotq_lane_s32(acc, perm_samples[0], filter, 0);
	sum0123 = vdotq_lane_s32(sum0123, perm_samples[1], filter, 1);
	// Second 4 output values.
	int32x4_t sum4567 = vdotq_lane_s32(acc, perm_samples[2], filter, 0);
	sum4567 = vdotq_lane_s32(sum4567, perm_samples[3], filter, 1);

	int16x8_t sum = vcombine_s16(vmovn_s32(sum0123), vmovn_s32(sum4567));

	// We halved the filter values so -1 from right shift.
	return vqrshrun_n_s16(sum, FILTER_BITS - 1);
	}

	static inline void scale_2_to_1_horiz_8tap(const uint8_t *src,
	const int src_stride, int w, int h,
	uint8_t *dst, const int dst_stride,
	const int16x8_t filters) {
	const int8x8_t filter = vmovn_s16(filters);
	const uint8x16x2_t permute_tbl = vld1q_u8_x2(kScale2DotProdPermuteTbl);

	do {
	const uint8_t *s = src;
	uint8_t *d = dst;
	int width = w;
	do {
	uint8x16_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
	load_u8_16x8(s, src_stride, &s0[0], &s1[0], &s2[0], &s3[0], &s4[0],
	&s5[0], &s6[0], &s7[0]);
	load_u8_16x8(s + 8, src_stride, &s0[1], &s1[1], &s2[1], &s3[1], &s4[1],
	&s5[1], &s6[1], &s7[1]);

	uint8x8_t d0 = scale_2_to_1_filter8_8(s0[0], s0[1], permute_tbl, filter);
	uint8x8_t d1 = scale_2_to_1_filter8_8(s1[0], s1[1], permute_tbl, filter);
	uint8x8_t d2 = scale_2_to_1_filter8_8(s2[0], s2[1], permute_tbl, filter);
	uint8x8_t d3 = scale_2_to_1_filter8_8(s3[0], s3[1], permute_tbl, filter);

	uint8x8_t d4 = scale_2_to_1_filter8_8(s4[0], s4[1], permute_tbl, filter);
	uint8x8_t d5 = scale_2_to_1_filter8_8(s5[0], s5[1], permute_tbl, filter);
	uint8x8_t d6 = scale_2_to_1_filter8_8(s6[0], s6[1], permute_tbl, filter);
	uint8x8_t d7 = scale_2_to_1_filter8_8(s7[0], s7[1], permute_tbl, filter);

	store_u8_8x8(d, dst_stride, d0, d1, d2, d3, d4, d5, d6, d7);

	d += 8;
	s += 16;
	width -= 8;
	} while (width > 0);

	dst += 8 * dst_stride;
	src += 8 * src_stride;
	h -= 8;
	} while (h > 0);
	}

	static inline void scale_plane_2_to_1_8tap(const uint8_t *src,
	const int src_stride, uint8_t *dst,
	const int dst_stride, const int w,
	const int h,
	const int16_t *const filter_ptr,
	uint8_t *const im_block) {
	assert(w > 0 && h > 0);

	const int im_h = 2 * h + SUBPEL_TAPS - 3;
	const int im_stride = (w + 7) & ~7;
	// All filter values are even, halve them to fit in int8_t when applying
	// horizontal filter and stay in 16-bit elements when applying vertical
	// filter.
	const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1);

	const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
	const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;

	scale_2_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h,
	im_block, im_stride, filters);

	// We can specialise the vertical filtering for 6-tap filters given that the
	// EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded.
	scale_2_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride,
	filters);
	}

	static inline uint8x8_t scale_4_to_1_filter8_8(
	const uint8x16_t s0, const uint8x16_t s1, const uint8x16_t s2,
	const uint8x16_t s3, const uint8x16_t permute_tbl, const int8x8_t filter) {
	int8x16_t filters = vcombine_s8(filter, filter);

	// Transform sample range to [-128, 127] for 8-bit signed dot product.
	int8x16_t s0_128 = vreinterpretq_s8_u8(vsubq_u8(s0, vdupq_n_u8(128)));
	int8x16_t s1_128 = vreinterpretq_s8_u8(vsubq_u8(s1, vdupq_n_u8(128)));
	int8x16_t s2_128 = vreinterpretq_s8_u8(vsubq_u8(s2, vdupq_n_u8(128)));
	int8x16_t s3_128 = vreinterpretq_s8_u8(vsubq_u8(s3, vdupq_n_u8(128)));

	int8x16_t perm_samples[4] = { vqtbl1q_s8(s0_128, permute_tbl),
	vqtbl1q_s8(s1_128, permute_tbl),
	vqtbl1q_s8(s2_128, permute_tbl),
	vqtbl1q_s8(s3_128, permute_tbl) };

	// Dot product constant:
	// The shim of 128 << FILTER_BITS is needed because we are subtracting 128
	// from every source value. The additional right shift by one is needed
	// because we halved the filter values and will use a pairwise add.
	const int32x4_t acc = vdupq_n_s32((128 << FILTER_BITS) >> 2);

	int32x4_t sum0 = vdotq_s32(acc, perm_samples[0], filters);
	int32x4_t sum1 = vdotq_s32(acc, perm_samples[1], filters);
	int32x4_t sum2 = vdotq_s32(acc, perm_samples[2], filters);
	int32x4_t sum3 = vdotq_s32(acc, perm_samples[3], filters);

	int32x4_t sum01 = vpaddq_s32(sum0, sum1);
	int32x4_t sum23 = vpaddq_s32(sum2, sum3);

	int16x8_t sum = vcombine_s16(vmovn_s32(sum01), vmovn_s32(sum23));

	// We halved the filter values so -1 from right shift.
	return vqrshrun_n_s16(sum, FILTER_BITS - 1);
	}

	static inline void scale_4_to_1_horiz_8tap(const uint8_t *src,
	const int src_stride, int w, int h,
	uint8_t *dst, const int dst_stride,
	const int16x8_t filters) {
	const int8x8_t filter = vmovn_s16(filters);
	const uint8x16_t permute_tbl = vld1q_u8(kScale4DotProdPermuteTbl);

	do {
	const uint8_t *s = src;
	uint8_t *d = dst;
	int width = w;

	do {
	uint8x16_t s0, s1, s2, s3, s4, s5, s6, s7;
	load_u8_16x8(s, src_stride, &s0, &s1, &s2, &s3, &s4, &s5, &s6, &s7);

	uint8x8_t d0 =
	scale_4_to_1_filter8_8(s0, s1, s2, s3, permute_tbl, filter);
	uint8x8_t d1 =
	scale_4_to_1_filter8_8(s4, s5, s6, s7, permute_tbl, filter);

	store_u8x2_strided_x4(d + 0 * dst_stride, dst_stride, d0);
	store_u8x2_strided_x4(d + 4 * dst_stride, dst_stride, d1);

	d += 2;
	s += 8;
	width -= 2;
	} while (width > 0);

	dst += 8 * dst_stride;
	src += 8 * src_stride;
	h -= 8;
	} while (h > 0);
	}

	static inline void scale_plane_4_to_1_8tap(const uint8_t *src,
	const int src_stride, uint8_t *dst,
	const int dst_stride, const int w,
	const int h,
	const int16_t *const filter_ptr,
	uint8_t *const im_block) {
	assert(w > 0 && h > 0);
	const int im_h = 4 * h + SUBPEL_TAPS - 2;
	const int im_stride = (w + 1) & ~1;
	// All filter values are even, halve them to fit in int8_t when applying
	// horizontal filter and stay in 16-bit elements when applying vertical
	// filter.
	const int16x8_t filters = vshrq_n_s16(vld1q_s16(filter_ptr), 1);

	const ptrdiff_t horiz_offset = SUBPEL_TAPS / 2 - 1;
	const ptrdiff_t vert_offset = (SUBPEL_TAPS / 2 - 1) * src_stride;

	scale_4_to_1_horiz_8tap(src - horiz_offset - vert_offset, src_stride, w, im_h,
	im_block, im_stride, filters);

	// We can specialise the vertical filtering for 6-tap filters given that the
	// EIGHTTAP_SMOOTH and EIGHTTAP_REGULAR filters are 0-padded.
	scale_4_to_1_vert_6tap(im_block + im_stride, im_stride, w, h, dst, dst_stride,
	filters);
	}

	static inline bool has_normative_scaler_neon_dotprod(const int src_width,
	const int src_height,
	const int dst_width,
	const int dst_height) {
	return (2 * dst_width == src_width && 2 * dst_height == src_height) \|\|
	(4 * dst_width == src_width && 4 * dst_height == src_height);
	}

	void av1_resize_and_extend_frame_neon_dotprod(const YV12_BUFFER_CONFIG *src,
	YV12_BUFFER_CONFIG *dst,
	const InterpFilter filter,
	const int phase,
	const int num_planes) {
	assert(filter == BILINEAR \|\| filter == EIGHTTAP_SMOOTH \|\|
	filter == EIGHTTAP_REGULAR);

	bool has_normative_scaler =
	has_normative_scaler_neon_dotprod(src->y_crop_width, src->y_crop_height,
	dst->y_crop_width, dst->y_crop_height);

	if (num_planes > 1) {
	has_normative_scaler =
	has_normative_scaler && has_normative_scaler_neon_dotprod(
	src->uv_crop_width, src->uv_crop_height,
	dst->uv_crop_width, dst->uv_crop_height);
	}

	if (!has_normative_scaler \|\| filter == BILINEAR \|\| phase == 0) {
	av1_resize_and_extend_frame_neon(src, dst, filter, phase, num_planes);
	return;
	}

	// We use AOMMIN(num_planes, MAX_MB_PLANE) instead of num_planes to quiet
	// the static analysis warnings.
	int malloc_failed = 0;
	for (int i = 0; i < AOMMIN(num_planes, MAX_MB_PLANE); ++i) {
	const int is_uv = i > 0;
	const int src_w = src->crop_widths[is_uv];
	const int src_h = src->crop_heights[is_uv];
	const int dst_w = dst->crop_widths[is_uv];
	const int dst_h = dst->crop_heights[is_uv];
	const int dst_y_w = (dst->crop_widths[0] + 1) & ~1;
	const int dst_y_h = (dst->crop_heights[0] + 1) & ~1;

	if (2 * dst_w == src_w && 2 * dst_h == src_h) {
	const int buffer_stride = (dst_y_w + 7) & ~7;
	const int buffer_height = (2 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
	uint8_t *const temp_buffer =
	(uint8_t )malloc(buffer_stride buffer_height);
	if (!temp_buffer) {
	malloc_failed = 1;
	break;
	}
	const InterpKernel *interp_kernel =
	(const InterpKernel *)av1_interp_filter_params_list[filter]
	.filter_ptr;
	scale_plane_2_to_1_8tap(src->buffers[i], src->strides[is_uv],
	dst->buffers[i], dst->strides[is_uv], dst_w,
	dst_h, interp_kernel[phase], temp_buffer);
	free(temp_buffer);
	} else if (4 * dst_w == src_w && 4 * dst_h == src_h) {
	const int buffer_stride = (dst_y_w + 1) & ~1;
	const int buffer_height = (4 * dst_y_h + SUBPEL_TAPS - 2 + 7) & ~7;
	uint8_t *const temp_buffer =
	(uint8_t )malloc(buffer_stride buffer_height);
	if (!temp_buffer) {
	malloc_failed = 1;
	break;
	}
	const InterpKernel *interp_kernel =
	(const InterpKernel *)av1_interp_filter_params_list[filter]
	.filter_ptr;
	scale_plane_4_to_1_8tap(src->buffers[i], src->strides[is_uv],
	dst->buffers[i], dst->strides[is_uv], dst_w,
	dst_h, interp_kernel[phase], temp_buffer);
	free(temp_buffer);
	}
	}

	if (malloc_failed) {
	av1_resize_and_extend_frame_c(src, dst, filter, phase, num_planes);
	} else {
	aom_extend_frame_borders(dst, num_planes);
	}
	}