/* av1/common/arm/convolve_neon.c - avm - Git at Google */

 /*
  *
  * Copyright (c) 2021, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 3-Clause Clear License
  * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
  * License was not distributed with this source code in the LICENSE file, you
  * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/.  If the
  * Alliance for Open Media Patent License 1.0 was not distributed with this
  * source code in the PATENTS file, you can obtain it at
  * aomedia.org/license/patent-license/.
  */

 #include <assert.h>
 #include <arm_neon.h>

 #include "config/av1_rtcd.h"

 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "av1/common/arm/convolve_neon.h"
 #include "aom_dsp/arm/mem_neon.h"
 #include "aom_dsp/arm/transpose_neon.h"

 // Horizontal half-sample filter: every output pixel is the rounding average
 // (vrhadd) of the source pixel at the same position and the pixel immediately
 // to its right. The asserts pin the only configuration handled here: a 2-tap
 // filter at subpel_x_qn == 8 with round_0 + round_1 == 2 * FILTER_BITS; the
 // filter and rounding arguments are otherwise unused.
 //
 // Stepping: the w <= 4 and w == 8 paths consume two rows per iteration (h is
 // decremented by 2 until it reaches 0). Every other width walks one row at a
 // time in 16-pixel steps (the column counter is decremented by 16 until it
 // reaches 0).
 void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_x,
                                     const int subpel_x_qn,
                                     ConvolveParams *conv_params) {
   assert(subpel_x_qn == 8);
   assert(filter_params_x->taps == 2);
   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   (void)filter_params_x;
   (void)subpel_x_qn;
   (void)conv_params;

   if (w <= 4) {
     // Narrow blocks. vld1_u8 always reads 8 bytes; only the leading 2 or 4
     // result lanes are handed to the store helpers.
     const int is_two_wide = (w == 2);
     do {
       const uint8_t *const top = src;
       const uint8_t *const bottom = src + src_stride;

       // All four loads are issued before any store, as in the two-row layout.
       const uint8x8_t top_left = vld1_u8(top);
       const uint8x8_t top_right = vld1_u8(top + 1);
       const uint8x8_t bottom_left = vld1_u8(bottom);
       const uint8x8_t bottom_right = vld1_u8(bottom + 1);

       const uint8x8_t avg_top = vrhadd_u8(top_left, top_right);
       const uint8x8_t avg_bottom = vrhadd_u8(bottom_left, bottom_right);

       if (is_two_wide) {
         store_u8_2x1(dst, avg_top);
         store_u8_2x1(dst + dst_stride, avg_bottom);
       } else {
         store_u8_4x1(dst, avg_top);
         store_u8_4x1(dst + dst_stride, avg_bottom);
       }

       src += 2 * src_stride;
       dst += 2 * dst_stride;
       h -= 2;
     } while (h != 0);
     return;
   }

   if (w == 8) {
     // Exactly one 8-lane vector per row, two rows per iteration.
     do {
       const uint8_t *const top = src;
       const uint8_t *const bottom = src + src_stride;

       const uint8x8_t top_left = vld1_u8(top);
       const uint8x8_t top_right = vld1_u8(top + 1);
       const uint8x8_t bottom_left = vld1_u8(bottom);
       const uint8x8_t bottom_right = vld1_u8(bottom + 1);

       const uint8x8_t avg_top = vrhadd_u8(top_left, top_right);
       const uint8x8_t avg_bottom = vrhadd_u8(bottom_left, bottom_right);

       vst1_u8(dst, avg_top);
       vst1_u8(dst + dst_stride, avg_bottom);

       src += 2 * src_stride;
       dst += 2 * dst_stride;
       h -= 2;
     } while (h != 0);
     return;
   }

   // Remaining widths: one row per outer iteration, 16 pixels per inner step.
   do {
     const uint8_t *in = src;
     uint8_t *out = dst;
     int cols_left = w;

     do {
       const uint8x16_t left = vld1q_u8(in);
       const uint8x16_t right = vld1q_u8(in + 1);
       const uint8x16_t avg = vrhaddq_u8(left, right);
       vst1q_u8(out, avg);

       in += 16;
       out += 16;
       cols_left -= 16;
     } while (cols_left != 0);

     src += src_stride;
     dst += dst_stride;
   } while (--h != 0);
 }

 void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                     uint8_t *dst, int dst_stride, int w, int h,
                                     const InterpFilterParams *filter_params_y,
                                     const int subpel_y_qn) {
   assert(subpel_y_qn == 8);
   assert(filter_params_y->taps == 2);
   (void)filter_params_y;
   (void)subpel_y_qn;
   if (w <= 4) {
     do {
       uint8x8_t s0 = load_unaligned_u8_4x1(src);
       uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
       uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
       uint8x8_t d0 = vrhadd_u8(s0, s1);
       uint8x8_t d1 = vrhadd_u8(s1, s2);
       if (w == 2) {
         store_u8_2x1(dst + 0 * dst_stride, d0);
         store_u8_2x1(dst + 1 * dst_stride, d1);
       } else {
         store_u8_4x1(dst + 0 * dst_stride, d0);
         store_u8_4x1(dst + 1 * dst_stride, d1);
       }
       src += 2 * src_stride;
       dst += 2 * dst_stride;
       h -= 2;
     } while (h != 0);
   } else if (w == 8) {
     do {
       uint8x8_t s0 = vld1_u8(src);
       uint8x8_t s1 = vld1_u8(src + src_stride);
       uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
       uint8x8_t d0 = vrhadd_u8(s0, s1);
       uint8x8_t d1 = vrhadd_u8(s1, s2);
       vst1_u8(dst, d0);
       vst1_u8(dst + dst_stride, d1);
       src += 2 * src_stride;
       dst += 2 * dst_stride;
       h -= 2;
     } while (h != 0);
   } else {
     do {
       const uint8_t *src_ptr = src;
       uint8_t *dst_ptr = dst;
       int height = h;
       do {
         uint8x16_t s0 = vld1q_u8(src_ptr);
         uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
         uint8x16_t d0 = vrhaddq_u8(s0, s1);
         vst1q_u8(dst_ptr, d0);
         src_ptr += src_stride;
         dst_ptr += dst_stride;
       } while (--height != 0);
       src += 16;
       dst += 16;
       w -= 16;
     } while (w != 0);
   }
 }

 // Combined horizontal + vertical half-sample filter, done in two passes over
 // a 16-bit intermediate buffer on the stack:
 //   1. Horizontal pass: for h + 1 source rows, store the widened (unrounded)
 //      sum of each pixel and its right neighbour, using a row stride of w.
 //   2. Vertical pass: add each intermediate row to the row below it and
 //      narrow with a saturating rounding right shift by 2 (vqrshrn_n_u16).
 // The asserts pin the only configuration handled here: 2-tap filters in both
 // directions at subpel position 8, round_0 + round_1 == 2 * FILTER_BITS, and
 // w, h no larger than MAX_SB_SIZE. The filter and rounding arguments are
 // otherwise unused.
 void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
                                      uint8_t *dst, int dst_stride, int w, int h,
                                      const InterpFilterParams *filter_params_x,
                                      const InterpFilterParams *filter_params_y,
                                      const int subpel_x_qn,
                                      const int subpel_y_qn,
                                      ConvolveParams *conv_params) {
   assert(subpel_x_qn == 8);
   assert(subpel_y_qn == 8);
   assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
   assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
   (void)filter_params_x;
   (void)subpel_x_qn;
   (void)filter_params_y;
   (void)subpel_y_qn;
   (void)conv_params;

   uint16_t hsum[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
   const int hsum_stride = w;
   // The vertical 2-tap filter needs one row more than it outputs.
   int hsum_rows_left = h + 1;
   assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

   // ---- Horizontal pass ----
   uint16_t *hsum_wr = hsum;
   if (w <= 4) {
     do {
       const uint8x8_t left = vld1_u8(src);
       const uint8x8_t right = vld1_u8(src + 1);
       const uint16x4_t pair_sum = vget_low_u16(vaddl_u8(left, right));

       // Four lanes are written even when w < 4; the buffer has room for the
       // spill, and the next row's store overwrites the extra lanes.
       vst1_u16(hsum_wr, pair_sum);

       src += src_stride;
       hsum_wr += hsum_stride;
     } while (--hsum_rows_left != 0);
   } else {
     do {
       const uint8_t *in = src;
       uint16_t *out = hsum_wr;
       int cols_left = w;

       do {
         const uint8x8_t left = vld1_u8(in);
         const uint8x8_t right = vld1_u8(in + 1);
         const uint16x8_t pair_sum = vaddl_u8(left, right);
         vst1q_u16(out, pair_sum);

         in += 8;
         out += 8;
         cols_left -= 8;
       } while (cols_left != 0);

       src += src_stride;
       hsum_wr += hsum_stride;
     } while (--hsum_rows_left != 0);
   }

   // ---- Vertical pass ----
   const uint16_t *hsum_rd = hsum;
   if (w <= 4) {
     // Upper half fed to the narrowing shift; only the low lanes are stored.
     const uint16x4_t zero = vdup_n_u16(0);
     const int is_two_wide = (w == 2);

     do {
       // Three intermediate rows yield two output rows.
       const uint16x4_t upper = vld1_u16(hsum_rd);
       const uint16x4_t middle = vld1_u16(hsum_rd + hsum_stride);
       const uint16x4_t lower = vld1_u16(hsum_rd + 2 * hsum_stride);

       const uint16x4_t quad_upper = vadd_u16(upper, middle);
       const uint16x4_t quad_lower = vadd_u16(middle, lower);

       const uint8x8_t px_upper =
           vqrshrn_n_u16(vcombine_u16(quad_upper, zero), 2);
       const uint8x8_t px_lower =
           vqrshrn_n_u16(vcombine_u16(quad_lower, zero), 2);

       if (is_two_wide) {
         store_u8_2x1(dst, px_upper);
         store_u8_2x1(dst + dst_stride, px_lower);
       } else {
         store_u8_4x1(dst, px_upper);
         store_u8_4x1(dst + dst_stride, px_lower);
       }

       hsum_rd += 2 * hsum_stride;
       dst += 2 * dst_stride;
       h -= 2;
     } while (h != 0);
   } else {
     // Walk down one 8-pixel column strip at a time, then step right.
     do {
       const uint16_t *in = hsum_rd;
       uint8_t *out = dst;
       int rows_left = h;

       do {
         const uint16x8_t above = vld1q_u16(in);
         const uint16x8_t below = vld1q_u16(in + hsum_stride);
         const uint16x8_t quad_sum = vaddq_u16(above, below);
         const uint8x8_t px = vqrshrn_n_u16(quad_sum, 2);
         vst1_u8(out, px);

         in += hsum_stride;
         out += dst_stride;
       } while (--rows_left != 0);

       hsum_rd += 8;
       dst += 8;
       w -= 8;
     } while (w != 0);
   }
 }
	/*
	*
	* Copyright (c) 2021, Alliance for Open Media. All rights reserved
	*
	* This source code is subject to the terms of the BSD 3-Clause Clear License
	* and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
	* License was not distributed with this source code in the LICENSE file, you
	* can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
	* Alliance for Open Media Patent License 1.0 was not distributed with this
	* source code in the PATENTS file, you can obtain it at
	* aomedia.org/license/patent-license/.
	*/

	#include <assert.h>
	#include <arm_neon.h>

	#include "config/av1_rtcd.h"

	#include "aom_dsp/aom_dsp_common.h"
	#include "aom_ports/mem.h"
	#include "av1/common/convolve.h"
	#include "av1/common/filter.h"
	#include "av1/common/arm/convolve_neon.h"
	#include "aom_dsp/arm/mem_neon.h"
	#include "aom_dsp/arm/transpose_neon.h"

	// Horizontal half-sample filter: every output pixel is the rounding average
	// (vrhadd) of the source pixel at the same position and the pixel immediately
	// to its right. The asserts pin the only configuration handled here: a 2-tap
	// filter at subpel_x_qn == 8 with round_0 + round_1 == 2 * FILTER_BITS; the
	// filter and rounding arguments are otherwise unused.
	//
	// Stepping: the w <= 4 and w == 8 paths consume two rows per iteration (h is
	// decremented by 2 until it reaches 0). Every other width walks one row at a
	// time in 16-pixel steps (the column counter is decremented by 16 until it
	// reaches 0).
	void av1_convolve_x_sr_intrabc_neon(const uint8_t *src, int src_stride,
	                                    uint8_t *dst, int dst_stride, int w, int h,
	                                    const InterpFilterParams *filter_params_x,
	                                    const int subpel_x_qn,
	                                    ConvolveParams *conv_params) {
	  assert(subpel_x_qn == 8);
	  assert(filter_params_x->taps == 2);
	  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
	  (void)filter_params_x;
	  (void)subpel_x_qn;
	  (void)conv_params;

	  if (w <= 4) {
	    // Narrow blocks. vld1_u8 always reads 8 bytes; only the leading 2 or 4
	    // result lanes are handed to the store helpers.
	    const int is_two_wide = (w == 2);
	    do {
	      const uint8_t *const top = src;
	      const uint8_t *const bottom = src + src_stride;

	      // All four loads are issued before any store.
	      const uint8x8_t top_left = vld1_u8(top);
	      const uint8x8_t top_right = vld1_u8(top + 1);
	      const uint8x8_t bottom_left = vld1_u8(bottom);
	      const uint8x8_t bottom_right = vld1_u8(bottom + 1);

	      const uint8x8_t avg_top = vrhadd_u8(top_left, top_right);
	      const uint8x8_t avg_bottom = vrhadd_u8(bottom_left, bottom_right);

	      if (is_two_wide) {
	        store_u8_2x1(dst, avg_top);
	        store_u8_2x1(dst + dst_stride, avg_bottom);
	      } else {
	        store_u8_4x1(dst, avg_top);
	        store_u8_4x1(dst + dst_stride, avg_bottom);
	      }

	      src += 2 * src_stride;
	      dst += 2 * dst_stride;
	      h -= 2;
	    } while (h != 0);
	    return;
	  }

	  if (w == 8) {
	    // Exactly one 8-lane vector per row, two rows per iteration.
	    do {
	      const uint8_t *const top = src;
	      const uint8_t *const bottom = src + src_stride;

	      const uint8x8_t top_left = vld1_u8(top);
	      const uint8x8_t top_right = vld1_u8(top + 1);
	      const uint8x8_t bottom_left = vld1_u8(bottom);
	      const uint8x8_t bottom_right = vld1_u8(bottom + 1);

	      const uint8x8_t avg_top = vrhadd_u8(top_left, top_right);
	      const uint8x8_t avg_bottom = vrhadd_u8(bottom_left, bottom_right);

	      vst1_u8(dst, avg_top);
	      vst1_u8(dst + dst_stride, avg_bottom);

	      src += 2 * src_stride;
	      dst += 2 * dst_stride;
	      h -= 2;
	    } while (h != 0);
	    return;
	  }

	  // Remaining widths: one row per outer iteration, 16 pixels per inner step.
	  do {
	    const uint8_t *in = src;
	    uint8_t *out = dst;
	    int cols_left = w;

	    do {
	      const uint8x16_t left = vld1q_u8(in);
	      const uint8x16_t right = vld1q_u8(in + 1);
	      const uint8x16_t avg = vrhaddq_u8(left, right);
	      vst1q_u8(out, avg);

	      in += 16;
	      out += 16;
	      cols_left -= 16;
	    } while (cols_left != 0);

	    src += src_stride;
	    dst += dst_stride;
	  } while (--h != 0);
	}

	void av1_convolve_y_sr_intrabc_neon(const uint8_t *src, int src_stride,
	uint8_t *dst, int dst_stride, int w, int h,
	const InterpFilterParams *filter_params_y,
	const int subpel_y_qn) {
	assert(subpel_y_qn == 8);
	assert(filter_params_y->taps == 2);
	(void)filter_params_y;
	(void)subpel_y_qn;
	if (w <= 4) {
	do {
	uint8x8_t s0 = load_unaligned_u8_4x1(src);
	uint8x8_t s1 = load_unaligned_u8_4x1(src + src_stride);
	uint8x8_t s2 = load_unaligned_u8_4x1(src + 2 * src_stride);
	uint8x8_t d0 = vrhadd_u8(s0, s1);
	uint8x8_t d1 = vrhadd_u8(s1, s2);
	if (w == 2) {
	store_u8_2x1(dst + 0 * dst_stride, d0);
	store_u8_2x1(dst + 1 * dst_stride, d1);
	} else {
	store_u8_4x1(dst + 0 * dst_stride, d0);
	store_u8_4x1(dst + 1 * dst_stride, d1);
	}
	src += 2 * src_stride;
	dst += 2 * dst_stride;
	h -= 2;
	} while (h != 0);
	} else if (w == 8) {
	do {
	uint8x8_t s0 = vld1_u8(src);
	uint8x8_t s1 = vld1_u8(src + src_stride);
	uint8x8_t s2 = vld1_u8(src + 2 * src_stride);
	uint8x8_t d0 = vrhadd_u8(s0, s1);
	uint8x8_t d1 = vrhadd_u8(s1, s2);
	vst1_u8(dst, d0);
	vst1_u8(dst + dst_stride, d1);
	src += 2 * src_stride;
	dst += 2 * dst_stride;
	h -= 2;
	} while (h != 0);
	} else {
	do {
	const uint8_t *src_ptr = src;
	uint8_t *dst_ptr = dst;
	int height = h;
	do {
	uint8x16_t s0 = vld1q_u8(src_ptr);
	uint8x16_t s1 = vld1q_u8(src_ptr + src_stride);
	uint8x16_t d0 = vrhaddq_u8(s0, s1);
	vst1q_u8(dst_ptr, d0);
	src_ptr += src_stride;
	dst_ptr += dst_stride;
	} while (--height != 0);
	src += 16;
	dst += 16;
	w -= 16;
	} while (w != 0);
	}
	}

	// Combined horizontal + vertical half-sample filter, done in two passes over
	// a 16-bit intermediate buffer on the stack:
	//   1. Horizontal pass: for h + 1 source rows, store the widened (unrounded)
	//      sum of each pixel and its right neighbour, using a row stride of w.
	//   2. Vertical pass: add each intermediate row to the row below it and
	//      narrow with a saturating rounding right shift by 2 (vqrshrn_n_u16).
	// The asserts pin the only configuration handled here: 2-tap filters in both
	// directions at subpel position 8, round_0 + round_1 == 2 * FILTER_BITS, and
	// w, h no larger than MAX_SB_SIZE. The filter and rounding arguments are
	// otherwise unused.
	void av1_convolve_2d_sr_intrabc_neon(const uint8_t *src, int src_stride,
	                                     uint8_t *dst, int dst_stride, int w, int h,
	                                     const InterpFilterParams *filter_params_x,
	                                     const InterpFilterParams *filter_params_y,
	                                     const int subpel_x_qn,
	                                     const int subpel_y_qn,
	                                     ConvolveParams *conv_params) {
	  assert(subpel_x_qn == 8);
	  assert(subpel_y_qn == 8);
	  assert(filter_params_x->taps == 2 && filter_params_y->taps == 2);
	  assert((conv_params->round_0 + conv_params->round_1) == 2 * FILTER_BITS);
	  (void)filter_params_x;
	  (void)subpel_x_qn;
	  (void)filter_params_y;
	  (void)subpel_y_qn;
	  (void)conv_params;

	  uint16_t hsum[(MAX_SB_SIZE + MAX_FILTER_TAP - 1) * MAX_SB_SIZE];
	  const int hsum_stride = w;
	  // The vertical 2-tap filter needs one row more than it outputs.
	  int hsum_rows_left = h + 1;
	  assert(w <= MAX_SB_SIZE && h <= MAX_SB_SIZE);

	  // ---- Horizontal pass ----
	  uint16_t *hsum_wr = hsum;
	  if (w <= 4) {
	    do {
	      const uint8x8_t left = vld1_u8(src);
	      const uint8x8_t right = vld1_u8(src + 1);
	      const uint16x4_t pair_sum = vget_low_u16(vaddl_u8(left, right));

	      // Four lanes are written even when w < 4; the buffer has room for the
	      // spill, and the next row's store overwrites the extra lanes.
	      vst1_u16(hsum_wr, pair_sum);

	      src += src_stride;
	      hsum_wr += hsum_stride;
	    } while (--hsum_rows_left != 0);
	  } else {
	    do {
	      const uint8_t *in = src;
	      uint16_t *out = hsum_wr;
	      int cols_left = w;

	      do {
	        const uint8x8_t left = vld1_u8(in);
	        const uint8x8_t right = vld1_u8(in + 1);
	        const uint16x8_t pair_sum = vaddl_u8(left, right);
	        vst1q_u16(out, pair_sum);

	        in += 8;
	        out += 8;
	        cols_left -= 8;
	      } while (cols_left != 0);

	      src += src_stride;
	      hsum_wr += hsum_stride;
	    } while (--hsum_rows_left != 0);
	  }

	  // ---- Vertical pass ----
	  const uint16_t *hsum_rd = hsum;
	  if (w <= 4) {
	    // Upper half fed to the narrowing shift; only the low lanes are stored.
	    const uint16x4_t zero = vdup_n_u16(0);
	    const int is_two_wide = (w == 2);

	    do {
	      // Three intermediate rows yield two output rows.
	      const uint16x4_t upper = vld1_u16(hsum_rd);
	      const uint16x4_t middle = vld1_u16(hsum_rd + hsum_stride);
	      const uint16x4_t lower = vld1_u16(hsum_rd + 2 * hsum_stride);

	      const uint16x4_t quad_upper = vadd_u16(upper, middle);
	      const uint16x4_t quad_lower = vadd_u16(middle, lower);

	      const uint8x8_t px_upper =
	          vqrshrn_n_u16(vcombine_u16(quad_upper, zero), 2);
	      const uint8x8_t px_lower =
	          vqrshrn_n_u16(vcombine_u16(quad_lower, zero), 2);

	      if (is_two_wide) {
	        store_u8_2x1(dst, px_upper);
	        store_u8_2x1(dst + dst_stride, px_lower);
	      } else {
	        store_u8_4x1(dst, px_upper);
	        store_u8_4x1(dst + dst_stride, px_lower);
	      }

	      hsum_rd += 2 * hsum_stride;
	      dst += 2 * dst_stride;
	      h -= 2;
	    } while (h != 0);
	  } else {
	    // Walk down one 8-pixel column strip at a time, then step right.
	    do {
	      const uint16_t *in = hsum_rd;
	      uint8_t *out = dst;
	      int rows_left = h;

	      do {
	        const uint16x8_t above = vld1q_u16(in);
	        const uint16x8_t below = vld1q_u16(in + hsum_stride);
	        const uint16x8_t quad_sum = vaddq_u16(above, below);
	        const uint8x8_t px = vqrshrn_n_u16(quad_sum, 2);
	        vst1_u8(out, px);

	        in += hsum_stride;
	        out += dst_stride;
	      } while (--rows_left != 0);

	      hsum_rd += 8;
	      dst += 8;
	      w -= 8;
	    } while (w != 0);
	  }
	}