/*
 * Copyright (c) 2023, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_dsp/arm/transpose_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/convolve.h"
#include "av1/common/filter.h"
#include "av1/common/arm/highbd_convolve_neon.h"

#define UPSCALE_NORMATIVE_TAPS 8

void av1_highbd_convolve_horiz_rs_neon(const uint16_t *src, int src_stride,
                                       uint16_t *dst, int dst_stride, int w,
                                       int h, const int16_t *x_filters,
                                       int x0_qn, int x_step_qn, int bd) {
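  // Step the source pointer back by (taps / 2 - 1) samples so that the 8-tap
  // window for each output position also covers the taps to its left.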
  const int horiz_offset = UPSCALE_NORMATIVE_TAPS / 2 - 1;

  static const int32_t kIdx[4] = { 0, 1, 2, 3 };
  const int32x4_t idx = vld1q_s32(kIdx);
  const int32x4_t subpel_mask = vdupq_n_s32(RS_SCALE_SUBPEL_MASK);
  const int32x4_t shift_s32 = vdupq_n_s32(-FILTER_BITS);
  const int32x4_t offset_s32 = vdupq_n_s32(0);
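  // Largest representable pixel value at this bit depth; the filtered output
  // is clamped to this value below.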
  const uint16x4_t max = vdup_n_u16((1 << bd) - 1);

  const uint16_t *src_ptr = src - horiz_offset;
  uint16_t *dst_ptr = dst;

  if (w <= 4) {
    int height = h;
    uint16_t *d = dst_ptr;

    do {
      int x_qn = x0_qn;

      // Compute the source pointers for 4 output columns at a time. They may
      // point at the same data, but the indices have to be calculated either
      // way, and doing it in SIMD and then storing them is faster than
      // evaluating &src_ptr[(x_qn + i * x_step_qn) >> RS_SCALE_SUBPEL_BITS]
      // four times in scalar code. Ideally this would be a gather load using
      // the indices, but Neon has no gather, so it is emulated here.
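      // For reference, the scalar equivalent for column i (0 <= i < 4) is:
      //   src4_ptr[i] = (int16_t *)
      //       &src_ptr[(x_qn + i * x_step_qn) >> RS_SCALE_SUBPEL_BITS];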
      const int32x4_t xqn_idx = vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
      // Multiply by 2 to turn the element indices into byte offsets, since
      // sizeof(uint16_t) == 2.
      const int32x4_t src_idx =
          vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);
      // Similarly, calculate the filter indices for the 4 columns. First
      // compute the indices:
      //   (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
      // Then form the actual pointers by multiplying by
      // UPSCALE_NORMATIVE_TAPS and, as above, shifting left by 1 to convert
      // to byte offsets.
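      // For reference, the scalar equivalent for column i (0 <= i < 4) is:
      //   x_filter4_ptr[i] =
      //       &x_filters[(((x_qn + i * x_step_qn) & RS_SCALE_SUBPEL_MASK) >>
      //                   RS_SCALE_EXTRA_BITS) *
      //                  UPSCALE_NORMATIVE_TAPS];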
      const int32x4_t x_filter4_idx = vshlq_n_s32(
          vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS), 1);
      // Even though pointers are unsigned 32/64-bit ints, we do signed
      // addition. The reason for this is that x_qn can be negative, leading
      // to negative offsets. Argon test
      // profile0_core/streams/test10573_11003.obu was failing because of
      // this.
#if AOM_ARCH_AARCH64
      uint64x2_t tmp4[2];
      tmp4[0] = vreinterpretq_u64_s64(vaddw_s32(
          vdupq_n_s64((const int64_t)src_ptr), vget_low_s32(src_idx)));
      tmp4[1] = vreinterpretq_u64_s64(vaddw_s32(
          vdupq_n_s64((const int64_t)src_ptr), vget_high_s32(src_idx)));
      int16_t *src4_ptr[4];
      uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
      vst1q_u64(tmp_ptr, tmp4[0]);
      vst1q_u64(tmp_ptr + 2, tmp4[1]);

      // filter vectors
      tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
          vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
      tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
          vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
          vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));

      const int16_t *x_filter4_ptr[4];
      tmp_ptr = (uint64_t *)&x_filter4_ptr;
      vst1q_u64(tmp_ptr, tmp4[0]);
      vst1q_u64(tmp_ptr + 2, tmp4[1]);
#else
      uint32x4_t tmp4;
      tmp4 = vreinterpretq_u32_s32(
          vaddq_s32(vdupq_n_s32((const int32_t)src_ptr), src_idx));
      int16_t *src4_ptr[4];
      uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
      vst1q_u32(tmp_ptr, tmp4);

      // filter vectors
      tmp4 = vreinterpretq_u32_s32(
          vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
                    vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));

      const int16_t *x_filter4_ptr[4];
      tmp_ptr = (uint32_t *)&x_filter4_ptr;
      vst1q_u32(tmp_ptr, tmp4);
#endif  // AOM_ARCH_AARCH64
      // Load source
      int16x8_t s0 = vld1q_s16(src4_ptr[0]);
      int16x8_t s1 = vld1q_s16(src4_ptr[1]);
      int16x8_t s2 = vld1q_s16(src4_ptr[2]);
      int16x8_t s3 = vld1q_s16(src4_ptr[3]);

      // Actually load the filters
      const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
      const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
      const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
      const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);

      // Group low and high parts and transpose
      int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
                                 vget_low_s16(x_filter1),
                                 vget_low_s16(x_filter2),
                                 vget_low_s16(x_filter3) };
      int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
                                 vget_high_s16(x_filter1),
                                 vget_high_s16(x_filter2),
                                 vget_high_s16(x_filter3) };
      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
      transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);
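      // After the transpose, filters_lo[k] and filters_hi[k] hold tap k and
      // tap k + 4, respectively, of the filters for all four output columns.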

      // Run the 2D Scale convolution
      uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
          s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);

      d0 = vmin_u16(d0, max);

      if (w == 2) {
        store_u16_2x1(d, d0);
      } else {
        vst1_u16(d, d0);
      }

      src_ptr += src_stride;
      d += dst_stride;
      height--;
    } while (height > 0);
  } else {
    int height = h;

    do {
      int width = w;
      int x_qn = x0_qn;
      uint16_t *d = dst_ptr;
      const uint16_t *s = src_ptr;

      do {
        // Compute the source pointers for 4 output columns at a time. They
        // may point at the same data, but the indices have to be calculated
        // either way, and doing it in SIMD and then storing them is faster
        // than evaluating &s[(x_qn + i * x_step_qn) >> RS_SCALE_SUBPEL_BITS]
        // four times in scalar code. Ideally this would be a gather load
        // using the indices, but Neon has no gather, so it is emulated here.
        const int32x4_t xqn_idx =
            vmlaq_n_s32(vdupq_n_s32(x_qn), idx, x_step_qn);
        // Multiply by 2 to turn the element indices into byte offsets, since
        // sizeof(uint16_t) == 2.
        const int32x4_t src_idx =
            vshlq_n_s32(vshrq_n_s32(xqn_idx, RS_SCALE_SUBPEL_BITS), 1);

        // Similarly, calculate the filter indices for the 4 columns. First
        // compute the indices:
        //   (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS
        // Then form the actual pointers by multiplying by
        // UPSCALE_NORMATIVE_TAPS and, as above, shifting left by 1 to convert
        // to byte offsets.
        const int32x4_t x_filter4_idx = vshlq_n_s32(
            vshrq_n_s32(vandq_s32(xqn_idx, subpel_mask), RS_SCALE_EXTRA_BITS),
            1);
        // Even though pointers are unsigned 32/64-bit ints, we do signed
        // addition. The reason for this is that x_qn can be negative, leading
        // to negative offsets. Argon test
        // profile0_core/streams/test10573_11003.obu was failing because of
        // this.
#if AOM_ARCH_AARCH64
        uint64x2_t tmp4[2];
        tmp4[0] = vreinterpretq_u64_s64(
            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_low_s32(src_idx)));
        tmp4[1] = vreinterpretq_u64_s64(
            vaddw_s32(vdupq_n_s64((const int64_t)s), vget_high_s32(src_idx)));
        int16_t *src4_ptr[4];
        uint64_t *tmp_ptr = (uint64_t *)&src4_ptr;
        vst1q_u64(tmp_ptr, tmp4[0]);
        vst1q_u64(tmp_ptr + 2, tmp4[1]);

        // filter vectors
        tmp4[0] = vreinterpretq_u64_s64(vmlal_s32(
            vdupq_n_s64((const int64_t)x_filters), vget_low_s32(x_filter4_idx),
            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));
        tmp4[1] = vreinterpretq_u64_s64(vmlal_s32(
            vdupq_n_s64((const int64_t)x_filters), vget_high_s32(x_filter4_idx),
            vdup_n_s32(UPSCALE_NORMATIVE_TAPS)));

        const int16_t *x_filter4_ptr[4];
        tmp_ptr = (uint64_t *)&x_filter4_ptr;
        vst1q_u64(tmp_ptr, tmp4[0]);
        vst1q_u64(tmp_ptr + 2, tmp4[1]);
#else
        uint32x4_t tmp4;
        tmp4 = vreinterpretq_u32_s32(
            vaddq_s32(vdupq_n_s32((const int32_t)s), src_idx));
        int16_t *src4_ptr[4];
        uint32_t *tmp_ptr = (uint32_t *)&src4_ptr;
        vst1q_u32(tmp_ptr, tmp4);

        // filter vectors
        tmp4 = vreinterpretq_u32_s32(
            vmlaq_s32(vdupq_n_s32((const int32_t)x_filters), x_filter4_idx,
                      vdupq_n_s32(UPSCALE_NORMATIVE_TAPS)));

        const int16_t *x_filter4_ptr[4];
        tmp_ptr = (uint32_t *)&x_filter4_ptr;
        vst1q_u32(tmp_ptr, tmp4);
#endif  // AOM_ARCH_AARCH64

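        // As in the w <= 4 path, src4_ptr[i] and x_filter4_ptr[i] now address
        // the source window and the filter taps for output column i.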
        // Load source
        int16x8_t s0 = vld1q_s16(src4_ptr[0]);
        int16x8_t s1 = vld1q_s16(src4_ptr[1]);
        int16x8_t s2 = vld1q_s16(src4_ptr[2]);
        int16x8_t s3 = vld1q_s16(src4_ptr[3]);

        // Actually load the filters
        const int16x8_t x_filter0 = vld1q_s16(x_filter4_ptr[0]);
        const int16x8_t x_filter1 = vld1q_s16(x_filter4_ptr[1]);
        const int16x8_t x_filter2 = vld1q_s16(x_filter4_ptr[2]);
        const int16x8_t x_filter3 = vld1q_s16(x_filter4_ptr[3]);

        // Group low and high parts and transpose
        int16x4_t filters_lo[] = { vget_low_s16(x_filter0),
                                   vget_low_s16(x_filter1),
                                   vget_low_s16(x_filter2),
                                   vget_low_s16(x_filter3) };
        int16x4_t filters_hi[] = { vget_high_s16(x_filter0),
                                   vget_high_s16(x_filter1),
                                   vget_high_s16(x_filter2),
                                   vget_high_s16(x_filter3) };
        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_lo);
        transpose_array_inplace_u16_4x4((uint16x4_t *)filters_hi);

        // Run the 2D Scale X convolution
        uint16x4_t d0 = highbd_convolve8_2d_scale_horiz4x8_s32_s16(
            s0, s1, s2, s3, filters_lo, filters_hi, shift_s32, offset_s32);

        d0 = vmin_u16(d0, max);
        vst1_u16(d, d0);

        x_qn += 4 * x_step_qn;
        d += 4;
        width -= 4;
      } while (width > 0);

      src_ptr += src_stride;
      dst_ptr += dst_stride;
      height--;
    } while (height > 0);
  }
}