/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <arm_neon.h>
#include <assert.h>

#include "aom/aom_integer.h"
#include "aom_dsp/blend.h"
#include "aom_dsp/arm/mem_neon.h"
#include "aom_ports/mem.h"
#include "av1/common/blockd.h"
#include "config/av1_rtcd.h"

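// NEON counterpart of the scalar diffwtd_mask_d16() in
// av1/common/reconinter.c. For each pixel, the absolute difference of the
// two intermediate (CONV_BUF_TYPE) predictors is rounded back to pixel
// precision, and the mask value becomes
//   m = min(38 + diff / DIFF_FACTOR, AOM_BLEND_A64_MAX_ALPHA)
// where DIFF_FACTOR = 1 << DIFF_FACTOR_LOG2. DIFFWTD_38 stores m directly;
// DIFFWTD_38_INV stores AOM_BLEND_A64_MAX_ALPHA - m.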
void av1_build_compound_diffwtd_mask_d16_neon(
    uint8_t *mask, DIFFWTD_MASK_TYPE mask_type, const CONV_BUF_TYPE *src0,
    int src0_stride, const CONV_BUF_TYPE *src1, int src1_stride, int h, int w,
    ConvolveParams *conv_params, int bd) {
  assert(h >= 4);
  assert(w >= 4);
  assert((mask_type == DIFFWTD_38_INV) || (mask_type == DIFFWTD_38));
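  // The intermediate predictors retain 2 * FILTER_BITS - round_0 - round_1
  // bits of extra precision, plus (bd - 8) bits for high bit-depth input;
  // 'round' removes all of it before the mask is derived.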
  const int round =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1 + (bd - 8);
  uint16x8_t diff_q, tmp0, tmp1;
  uint8x8_t diff_d, diff_select;
  const CONV_BUF_TYPE *src0_1, *src1_1;
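  // NEON has no variable right-shift, so the rounding right shift by 'round'
  // is done with vrshlq_u16 and a negated (signed) shift amount.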
  const int16x8_t dup_round = vdupq_n_s16((int16_t)(-round));
  const uint8x8_t dup_38 = vdup_n_u8(38);
  const uint8x8_t dup_64 = vdup_n_u8(AOM_BLEND_A64_MAX_ALPHA);
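  // Lane selector for the final vbsl_u8: all ones keeps m (DIFFWTD_38),
  // all zeros picks the inverted AOM_BLEND_A64_MAX_ALPHA - m variant.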
  if (mask_type == DIFFWTD_38) {
    diff_select = vdup_n_u8(255);
  } else {
    diff_select = vdup_n_u8(0);
  }
  if (w >= 8) {
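    // Wide blocks: process each row eight pixels at a time.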
    for (int i = 0; i < h; ++i) {
      src0_1 = src0;
      src1_1 = src1;
      for (int j = 0; j < w; j += 8) {
        __builtin_prefetch(src0_1);
        __builtin_prefetch(src1_1);
        diff_q = vabdq_u16(vld1q_u16(src0_1), vld1q_u16(src1_1));
        diff_q = vrshlq_u16(diff_q, dup_round);
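        // Narrow to 8 bits, dividing by DIFF_FACTOR (1 << DIFF_FACTOR_LOG2)
        // in the same instruction.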
        diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
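        // m = min(38 + diff, 64); diff is non-negative, so only the upper
        // clamp is needed.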
        diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
        diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
        vst1_u8(mask, diff_d);
        src0_1 += 8;
        src1_1 += 8;
        mask += 8;
      }
      src0 += src0_stride;
      src1 += src1_stride;
    }
  } else if (w == 4) {
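    // Narrow blocks: pack two 4-pixel rows into one 8-lane vector and reuse
    // the full-width pipeline; the output rows are contiguous since the mask
    // stride equals w.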
    for (int i = 0; i < h; i += 2) {
      src0_1 = src0;
      src1_1 = src1;
      __builtin_prefetch(src0_1 + 0 * src0_stride);
      __builtin_prefetch(src0_1 + 1 * src0_stride);
      __builtin_prefetch(src1_1 + 0 * src1_stride);
      __builtin_prefetch(src1_1 + 1 * src1_stride);
      tmp0 = vcombine_u16(vld1_u16(src0_1 + (0 * src0_stride)),
                          vld1_u16(src0_1 + (1 * src0_stride)));
      tmp1 = vcombine_u16(vld1_u16(src1_1 + (0 * src1_stride)),
                          vld1_u16(src1_1 + (1 * src1_stride)));
      diff_q = vabdq_u16(tmp0, tmp1);
      diff_q = vrshlq_u16(diff_q, dup_round);
      diff_d = vshrn_n_u16(diff_q, DIFF_FACTOR_LOG2);
      diff_d = vmin_u8(vadd_u8(diff_d, dup_38), dup_64);
      diff_d = vbsl_u8(diff_select, diff_d, vsub_u8(dup_64, diff_d));
      vst1_u8(mask, diff_d);
      src0 += src0_stride * 2;
      src1 += src1_stride * 2;
      mask += w * 2;
    }
  }
}
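
#if 0
// Scalar model of the kernel above, kept for illustration only: a sketch
// derived from this file, not the canonical reference (that is
// diffwtd_mask_d16() in av1/common/reconinter.c). 'invert' corresponds to
// mask_type == DIFFWTD_38_INV. The name diffwtd_mask_d16_model is
// hypothetical.
static void diffwtd_mask_d16_model(uint8_t *mask, int invert,
                                   const CONV_BUF_TYPE *src0, int src0_stride,
                                   const CONV_BUF_TYPE *src1, int src1_stride,
                                   int h, int w, int round) {
  for (int i = 0; i < h; ++i) {
    for (int j = 0; j < w; ++j) {
      // Absolute difference, rounded back to pixel precision, matching the
      // vabdq_u16/vrshlq_u16 pair above.
      int diff = (int)src0[i * src0_stride + j] - (int)src1[i * src1_stride + j];
      if (diff < 0) diff = -diff;
      diff = ROUND_POWER_OF_TWO(diff, round);
      const int m =
          AOMMIN(38 + (diff >> DIFF_FACTOR_LOG2), AOM_BLEND_A64_MAX_ALPHA);
      mask[i * w + j] = (uint8_t)(invert ? AOM_BLEND_A64_MAX_ALPHA - m : m);
    }
  }
}
#endif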