/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_dsp_rtcd.h"
#include "aom/aom_integer.h"
#include "aom_dsp/x86/bitdepth_conversion_sse2.h"
#include "aom_ports/mem.h"

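// One pass of the 8x8 Hadamard transform over eight __m128i registers, each
// holding one row of eight int16 values; the three butterfly stages therefore
// operate along the columns. Both passes end with a permuted butterfly. For
// iter == 0 the block is additionally transposed in place so that a second
// call (iter == 1) transforms the other dimension and leaves the final,
// permuted result in `in`.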
static void hadamard_col8_sse2(__m128i *in, int iter) {
  __m128i a0 = in[0];
  __m128i a1 = in[1];
  __m128i a2 = in[2];
  __m128i a3 = in[3];
  __m128i a4 = in[4];
  __m128i a5 = in[5];
  __m128i a6 = in[6];
  __m128i a7 = in[7];

  __m128i b0 = _mm_add_epi16(a0, a1);
  __m128i b1 = _mm_sub_epi16(a0, a1);
  __m128i b2 = _mm_add_epi16(a2, a3);
  __m128i b3 = _mm_sub_epi16(a2, a3);
  __m128i b4 = _mm_add_epi16(a4, a5);
  __m128i b5 = _mm_sub_epi16(a4, a5);
  __m128i b6 = _mm_add_epi16(a6, a7);
  __m128i b7 = _mm_sub_epi16(a6, a7);

  a0 = _mm_add_epi16(b0, b2);
  a1 = _mm_add_epi16(b1, b3);
  a2 = _mm_sub_epi16(b0, b2);
  a3 = _mm_sub_epi16(b1, b3);
  a4 = _mm_add_epi16(b4, b6);
  a5 = _mm_add_epi16(b5, b7);
  a6 = _mm_sub_epi16(b4, b6);
  a7 = _mm_sub_epi16(b5, b7);

  if (iter == 0) {
    b0 = _mm_add_epi16(a0, a4);
    b7 = _mm_add_epi16(a1, a5);
    b3 = _mm_add_epi16(a2, a6);
    b4 = _mm_add_epi16(a3, a7);
    b2 = _mm_sub_epi16(a0, a4);
    b6 = _mm_sub_epi16(a1, a5);
    b1 = _mm_sub_epi16(a2, a6);
    b5 = _mm_sub_epi16(a3, a7);

    a0 = _mm_unpacklo_epi16(b0, b1);
    a1 = _mm_unpacklo_epi16(b2, b3);
    a2 = _mm_unpackhi_epi16(b0, b1);
    a3 = _mm_unpackhi_epi16(b2, b3);
    a4 = _mm_unpacklo_epi16(b4, b5);
    a5 = _mm_unpacklo_epi16(b6, b7);
    a6 = _mm_unpackhi_epi16(b4, b5);
    a7 = _mm_unpackhi_epi16(b6, b7);

    b0 = _mm_unpacklo_epi32(a0, a1);
    b1 = _mm_unpacklo_epi32(a4, a5);
    b2 = _mm_unpackhi_epi32(a0, a1);
    b3 = _mm_unpackhi_epi32(a4, a5);
    b4 = _mm_unpacklo_epi32(a2, a3);
    b5 = _mm_unpacklo_epi32(a6, a7);
    b6 = _mm_unpackhi_epi32(a2, a3);
    b7 = _mm_unpackhi_epi32(a6, a7);

    in[0] = _mm_unpacklo_epi64(b0, b1);
    in[1] = _mm_unpackhi_epi64(b0, b1);
    in[2] = _mm_unpacklo_epi64(b2, b3);
    in[3] = _mm_unpackhi_epi64(b2, b3);
    in[4] = _mm_unpacklo_epi64(b4, b5);
    in[5] = _mm_unpackhi_epi64(b4, b5);
    in[6] = _mm_unpacklo_epi64(b6, b7);
    in[7] = _mm_unpackhi_epi64(b6, b7);
  } else {
    in[0] = _mm_add_epi16(a0, a4);
    in[7] = _mm_add_epi16(a1, a5);
    in[3] = _mm_add_epi16(a2, a6);
    in[4] = _mm_add_epi16(a3, a7);
    in[2] = _mm_sub_epi16(a0, a4);
    in[6] = _mm_sub_epi16(a1, a5);
    in[1] = _mm_sub_epi16(a2, a6);
    in[5] = _mm_sub_epi16(a3, a7);
  }
}

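// Full 8x8 Hadamard transform of an int16 residual block. The rows are read
// with aligned loads, so src_diff and src_stride (counted in int16 elements)
// must keep every row 16-byte aligned. Two passes of hadamard_col8_sse2()
// transform both dimensions (the first pass transposes). With is_final != 0
// the result is written as tran_low_t via store_tran_low(); otherwise it is
// written as raw int16 so the 16x16/32x32 functions below can reload it
// cheaply.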
static INLINE void hadamard_8x8_sse2(const int16_t *src_diff,
                                     ptrdiff_t src_stride, tran_low_t *coeff,
                                     int is_final) {
  __m128i src[8];
  src[0] = _mm_load_si128((const __m128i *)src_diff);
  src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
  src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));

  hadamard_col8_sse2(src, 0);
  hadamard_col8_sse2(src, 1);

  if (is_final) {
    store_tran_low(src[0], coeff);
    coeff += 8;
    store_tran_low(src[1], coeff);
    coeff += 8;
    store_tran_low(src[2], coeff);
    coeff += 8;
    store_tran_low(src[3], coeff);
    coeff += 8;
    store_tran_low(src[4], coeff);
    coeff += 8;
    store_tran_low(src[5], coeff);
    coeff += 8;
    store_tran_low(src[6], coeff);
    coeff += 8;
    store_tran_low(src[7], coeff);
  } else {
    int16_t *coeff16 = (int16_t *)coeff;
    _mm_store_si128((__m128i *)coeff16, src[0]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[1]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[2]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[3]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[4]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[5]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[6]);
    coeff16 += 8;
    _mm_store_si128((__m128i *)coeff16, src[7]);
  }
}

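// Public entry point: 8x8 Hadamard transform with tran_low_t output.
// Illustrative usage only; the buffer names below are hypothetical:
//   DECLARE_ALIGNED(16, int16_t, diff[8 * 8]);      // source minus prediction
//   DECLARE_ALIGNED(32, tran_low_t, hcoeff[8 * 8]);
//   aom_hadamard_8x8_sse2(diff, 8, hcoeff);  // stride 8 keeps rows aligned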
void aom_hadamard_8x8_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                           tran_low_t *coeff) {
  hadamard_8x8_sse2(src_diff, src_stride, coeff, 1);
}

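// 16x16 Hadamard transform built from four 8x8 transforms: each 8x8 quadrant
// of the input is transformed into its own 64-coefficient slice of
// temp_coeff, and the second loop applies a 4-point Hadamard butterfly across
// the four quadrant results, shifting right by 1 to compensate for the growth
// in dynamic range. is_final selects tran_low_t output versus a raw int16
// buffer, as in hadamard_8x8_sse2().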
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

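  // Cross-quadrant butterfly: for each group of eight coefficients, combine
  // the corresponding values from the four 8x8 results (t_coeff, +64, +128,
  // +192) and write the four outputs to the matching quarters of the
  // destination.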
  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}

void aom_hadamard_16x16_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  hadamard_16x16_sse2(src_diff, src_stride, coeff, 1);
}

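// 32x32 Hadamard transform, assembled from four 16x16 transforms in the same
// way the 16x16 version is assembled from 8x8 transforms, except that the
// butterfly results are shifted right by 2 before the final combine.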
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

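  // Combine the four 16x16 results (t_coeff, +256, +512, +768) with a 4-point
  // butterfly and write each output to the matching quarter of coeff.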
  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}

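// Sum of absolute values of the transform coefficients, e.g.
// aom_satd_sse2(coeff, 64) for an 8x8 Hadamard block. The loop has no tail
// handling, so length must be a multiple of 8.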
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

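  // Each iteration loads eight coefficients (packed to int16 by
  // load_tran_low), takes abs() as max(x, -x), widens the non-negative
  // results to 32 bit by unpacking with zero, and adds them to the four
  // 32-bit lanes of accum. max(x, -x) is exact for every int16 value except
  // INT16_MIN, which is not expected from the Hadamard paths above.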
  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}