Blame - aom_dsp/arm/sum_squares_neon.c - aom

blob: 1ce12ec23f69ce6bad3a8bb3e2f4d9486b6f0349 [file] [log] [blame]

Vitalii Dziumenko	22980c7	2020-04-17 15:50:12 +0300	[diff] [blame]	1	/*
				2	* Copyright (c) 2020, Alliance for Open Media. All rights reserved
				3	*
				4	* This source code is subject to the terms of the BSD 2 Clause License and
				5	* the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
				6	* was not distributed with this source code in the LICENSE file, you can
				7	* obtain it at www.aomedia.org/license/software. If the Alliance for Open
				8	* Media Patent License 1.0 was not distributed with this source code in the
				9	* PATENTS file, you can obtain it at www.aomedia.org/license/patent.
				10	*/
				11
				12	#include <arm_neon.h>
				13	#include <assert.h>
				14
				15	#include "av1/common/arm/mem_neon.h"
				16	#include "config/aom_dsp_rtcd.h"
				17
				18	static INLINE uint32x4_t sum_squares_i16_4x4_neon(const int16_t *src,
				19	int stride) {
				20	const int16x4_t v_val_01_lo = vld1_s16(src + 0 * stride);
				21	const int16x4_t v_val_01_hi = vld1_s16(src + 1 * stride);
				22	const int16x4_t v_val_23_lo = vld1_s16(src + 2 * stride);
				23	const int16x4_t v_val_23_hi = vld1_s16(src + 3 * stride);
				24	int32x4_t v_sq_01_d = vmull_s16(v_val_01_lo, v_val_01_lo);
				25	v_sq_01_d = vmlal_s16(v_sq_01_d, v_val_01_hi, v_val_01_hi);
				26	int32x4_t v_sq_23_d = vmull_s16(v_val_23_lo, v_val_23_lo);
				27	v_sq_23_d = vmlal_s16(v_sq_23_d, v_val_23_hi, v_val_23_hi);
				28	#if defined(__aarch64__)
				29	return vreinterpretq_u32_s32(vpaddq_s32(v_sq_01_d, v_sq_23_d));
				30	#else
				31	return vreinterpretq_u32_s32(vcombine_s32(
				32	vqmovn_s64(vpaddlq_s32(v_sq_01_d)), vqmovn_s64(vpaddlq_s32(v_sq_23_d))));
				33	#endif
				34	}
				35
				36	uint64_t aom_sum_squares_2d_i16_4x4_neon(const int16_t *src, int stride) {
				37	const uint32x4_t v_sum_0123_d = sum_squares_i16_4x4_neon(src, stride);
				38	#if defined(__aarch64__)
				39	return (uint64_t)vaddvq_u32(v_sum_0123_d);
				40	#else
				41	uint64x2_t v_sum_d = vpaddlq_u32(v_sum_0123_d);
				42	v_sum_d = vaddq_u64(v_sum_d, vextq_u64(v_sum_d, v_sum_d, 1));
				43	return vgetq_lane_u64(v_sum_d, 0);
				44	#endif
				45	}
				46
				47	uint64_t aom_sum_squares_2d_i16_4xn_neon(const int16_t *src, int stride,
				48	int height) {
				49	int r = 0;
				50	uint32x4_t v_acc_q = vdupq_n_u32(0);
				51	do {
				52	const uint32x4_t v_acc_d = sum_squares_i16_4x4_neon(src, stride);
				53	v_acc_q = vaddq_u32(v_acc_q, v_acc_d);
				54	src += stride << 2;
				55	r += 4;
				56	} while (r < height);
				57
				58	uint64x2_t v_acc_64 = vpaddlq_u32(v_acc_q);
				59	#if defined(__aarch64__)
				60	return vaddvq_u64(v_acc_64);
				61	#else
				62	v_acc_64 = vaddq_u64(v_acc_64, vextq_u64(v_acc_64, v_acc_64, 1));
				63	return vgetq_lane_u64(v_acc_64, 0);
				64	#endif
				65	}
				66
				67	uint64_t aom_sum_squares_2d_i16_nxn_neon(const int16_t *src, int stride,
				68	int width, int height) {
				69	int r = 0;
				70	const int32x4_t zero = vdupq_n_s32(0);
				71	uint64x2_t v_acc_q = vreinterpretq_u64_s32(zero);
				72	do {
				73	int32x4_t v_sum = zero;
				74	int c = 0;
				75	do {
				76	const int16_t *b = src + c;
				77	const int16x8_t v_val_0 = vld1q_s16(b + 0 * stride);
				78	const int16x8_t v_val_1 = vld1q_s16(b + 1 * stride);
				79	const int16x8_t v_val_2 = vld1q_s16(b + 2 * stride);
				80	const int16x8_t v_val_3 = vld1q_s16(b + 3 * stride);
				81	const int16x4_t v_val_0_lo = vget_low_s16(v_val_0);
				82	const int16x4_t v_val_1_lo = vget_low_s16(v_val_1);
				83	const int16x4_t v_val_2_lo = vget_low_s16(v_val_2);
				84	const int16x4_t v_val_3_lo = vget_low_s16(v_val_3);
				85	int32x4_t v_sum_01 = vmull_s16(v_val_0_lo, v_val_0_lo);
				86	v_sum_01 = vmlal_s16(v_sum_01, v_val_1_lo, v_val_1_lo);
				87	int32x4_t v_sum_23 = vmull_s16(v_val_2_lo, v_val_2_lo);
				88	v_sum_23 = vmlal_s16(v_sum_23, v_val_3_lo, v_val_3_lo);
				89	#if defined(__aarch64__)
				90	v_sum_01 = vmlal_high_s16(v_sum_01, v_val_0, v_val_0);
				91	v_sum_01 = vmlal_high_s16(v_sum_01, v_val_1, v_val_1);
				92	v_sum_23 = vmlal_high_s16(v_sum_23, v_val_2, v_val_2);
				93	v_sum_23 = vmlal_high_s16(v_sum_23, v_val_3, v_val_3);
				94	v_sum = vaddq_s32(v_sum, vpaddq_s32(v_sum_01, v_sum_23));
				95	#else
				96	const int16x4_t v_val_0_hi = vget_high_s16(v_val_0);
				97	const int16x4_t v_val_1_hi = vget_high_s16(v_val_1);
				98	const int16x4_t v_val_2_hi = vget_high_s16(v_val_2);
				99	const int16x4_t v_val_3_hi = vget_high_s16(v_val_3);
				100	v_sum_01 = vmlal_s16(v_sum_01, v_val_0_hi, v_val_0_hi);
				101	v_sum_01 = vmlal_s16(v_sum_01, v_val_1_hi, v_val_1_hi);
				102	v_sum_23 = vmlal_s16(v_sum_23, v_val_2_hi, v_val_2_hi);
				103	v_sum_23 = vmlal_s16(v_sum_23, v_val_3_hi, v_val_3_hi);
				104	v_sum = vaddq_s32(v_sum, vcombine_s32(vqmovn_s64(vpaddlq_s32(v_sum_01)),
				105	vqmovn_s64(vpaddlq_s32(v_sum_23))));
				106	#endif
				107	c += 8;
				108	} while (c < width);
				109
				110	v_acc_q = vpadalq_u32(v_acc_q, vreinterpretq_u32_s32(v_sum));
				111
				112	src += 4 * stride;
				113	r += 4;
				114	} while (r < height);
				115	#if defined(__aarch64__)
				116	return vaddvq_u64(v_acc_q);
				117	#else
				118	v_acc_q = vaddq_u64(v_acc_q, vextq_u64(v_acc_q, v_acc_q, 1));
				119	return vgetq_lane_u64(v_acc_q, 0);
				120	#endif
				121	}
				122
				123	uint64_t aom_sum_squares_2d_i16_neon(const int16_t *src, int stride, int width,
				124	int height) {
				125	// 4 elements per row only requires half an SIMD register, so this
				126	// must be a special case, but also note that over 75% of all calls
				127	// are with size == 4, so it is also the common case.
				128	if (LIKELY(width == 4 && height == 4)) {
				129	return aom_sum_squares_2d_i16_4x4_neon(src, stride);
				130	} else if (LIKELY(width == 4 && (height & 3) == 0)) {
				131	return aom_sum_squares_2d_i16_4xn_neon(src, stride, height);
				132	} else if (LIKELY((width & 7) == 0 && (height & 3) == 0)) {
				133	// Generic case
				134	return aom_sum_squares_2d_i16_nxn_neon(src, stride, width, height);
				135	} else {
				136	return aom_sum_squares_2d_i16_c(src, stride, width, height);
				137	}
				138	}