aom_dsp/arm/highbd_convolve8_neon.h - aom - Git at Google

 /*
  *  Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
  *
  *  Use of this source code is governed by a BSD-style license
  *  that can be found in the LICENSE file in the root of the source
  *  tree. An additional intellectual property rights grant can be found
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */

 #ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
 #define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_

 #include <arm_neon.h>

 #include "config/aom_config.h"
 #include "aom_dsp/arm/mem_neon.h"

 static INLINE void highbd_convolve8_horiz_2tap_neon(
     const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
     ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
   // Bilinear filter values are all positive and multiples of 8. Divide by 8 to
   // reduce intermediate precision requirements and allow the use of non
   // widening multiply.
   const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
   const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);

   const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

   if (w == 4) {
     do {
       uint16x8_t s0 =
           load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride);
       uint16x8_t s1 =
           load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride);
       uint16x8_t s2 =
           load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride);
       uint16x8_t s3 =
           load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride);

       uint16x8_t sum01 = vmulq_u16(s0, f0);
       sum01 = vmlaq_u16(sum01, s1, f1);
       uint16x8_t sum23 = vmulq_u16(s2, f0);
       sum23 = vmlaq_u16(sum23, s3, f1);

       // We divided filter taps by 8 so subtract 3 from right shift.
       sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
       sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);

       sum01 = vminq_u16(sum01, max);
       sum23 = vminq_u16(sum23, max);

       store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
       store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);

       src_ptr += 4 * src_stride;
       dst_ptr += 4 * dst_stride;
       h -= 4;
     } while (h > 0);
   } else {
     do {
       int width = w;
       const uint16_t *s = src_ptr;
       uint16_t *d = dst_ptr;

       do {
         uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0);
         uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1);
         uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0);
         uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1);

         uint16x8_t sum01 = vmulq_u16(s0, f0);
         sum01 = vmlaq_u16(sum01, s1, f1);
         uint16x8_t sum23 = vmulq_u16(s2, f0);
         sum23 = vmlaq_u16(sum23, s3, f1);

         // We divided filter taps by 8 so subtract 3 from right shift.
         sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
         sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);

         sum01 = vminq_u16(sum01, max);
         sum23 = vminq_u16(sum23, max);

         vst1q_u16(d + 0 * dst_stride, sum01);
         vst1q_u16(d + 1 * dst_stride, sum23);

         s += 8;
         d += 8;
         width -= 8;
       } while (width != 0);
       src_ptr += 2 * src_stride;
       dst_ptr += 2 * dst_stride;
       h -= 2;
     } while (h > 0);
   }
 }

 #endif  // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
	/*
	* Copyright (c) 2024, Alliance for Open Media. All Rights Reserved.
	*
	* Use of this source code is governed by a BSD-style license
	* that can be found in the LICENSE file in the root of the source
	* tree. An additional intellectual property rights grant can be found
	* in the file PATENTS. All contributing project authors may
	* be found in the AUTHORS file in the root of the source tree.
	*/

	#ifndef AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_
	#define AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_

	#include <arm_neon.h>

	#include "config/aom_config.h"
	#include "aom_dsp/arm/mem_neon.h"

	static INLINE void highbd_convolve8_horiz_2tap_neon(
	const uint16_t src_ptr, ptrdiff_t src_stride, uint16_t dst_ptr,
	ptrdiff_t dst_stride, const int16_t *x_filter_ptr, int w, int h, int bd) {
	// Bilinear filter values are all positive and multiples of 8. Divide by 8 to
	// reduce intermediate precision requirements and allow the use of non
	// widening multiply.
	const uint16x8_t f0 = vdupq_n_u16((uint16_t)x_filter_ptr[3] / 8);
	const uint16x8_t f1 = vdupq_n_u16((uint16_t)x_filter_ptr[4] / 8);

	const uint16x8_t max = vdupq_n_u16((1 << bd) - 1);

	if (w == 4) {
	do {
	uint16x8_t s0 =
	load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 0, (int)src_stride);
	uint16x8_t s1 =
	load_unaligned_u16_4x2(src_ptr + 0 * src_stride + 1, (int)src_stride);
	uint16x8_t s2 =
	load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 0, (int)src_stride);
	uint16x8_t s3 =
	load_unaligned_u16_4x2(src_ptr + 2 * src_stride + 1, (int)src_stride);

	uint16x8_t sum01 = vmulq_u16(s0, f0);
	sum01 = vmlaq_u16(sum01, s1, f1);
	uint16x8_t sum23 = vmulq_u16(s2, f0);
	sum23 = vmlaq_u16(sum23, s3, f1);

	// We divided filter taps by 8 so subtract 3 from right shift.
	sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
	sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);

	sum01 = vminq_u16(sum01, max);
	sum23 = vminq_u16(sum23, max);

	store_u16x4_strided_x2(dst_ptr + 0 * dst_stride, (int)dst_stride, sum01);
	store_u16x4_strided_x2(dst_ptr + 2 * dst_stride, (int)dst_stride, sum23);

	src_ptr += 4 * src_stride;
	dst_ptr += 4 * dst_stride;
	h -= 4;
	} while (h > 0);
	} else {
	do {
	int width = w;
	const uint16_t *s = src_ptr;
	uint16_t *d = dst_ptr;

	do {
	uint16x8_t s0 = vld1q_u16(s + 0 * src_stride + 0);
	uint16x8_t s1 = vld1q_u16(s + 0 * src_stride + 1);
	uint16x8_t s2 = vld1q_u16(s + 1 * src_stride + 0);
	uint16x8_t s3 = vld1q_u16(s + 1 * src_stride + 1);

	uint16x8_t sum01 = vmulq_u16(s0, f0);
	sum01 = vmlaq_u16(sum01, s1, f1);
	uint16x8_t sum23 = vmulq_u16(s2, f0);
	sum23 = vmlaq_u16(sum23, s3, f1);

	// We divided filter taps by 8 so subtract 3 from right shift.
	sum01 = vrshrq_n_u16(sum01, FILTER_BITS - 3);
	sum23 = vrshrq_n_u16(sum23, FILTER_BITS - 3);

	sum01 = vminq_u16(sum01, max);
	sum23 = vminq_u16(sum23, max);

	vst1q_u16(d + 0 * dst_stride, sum01);
	vst1q_u16(d + 1 * dst_stride, sum23);

	s += 8;
	d += 8;
	width -= 8;
	} while (width != 0);
	src_ptr += 2 * src_stride;
	dst_ptr += 2 * dst_stride;
	h -= 2;
	} while (h > 0);
	}
	}

	#endif // AOM_AOM_DSP_ARM_HIGHBD_CONVOLVE8_NEON_H_