Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 1 | /* |
Yaowu Xu | bde4ac8 | 2016-11-28 15:26:06 -0800 | [diff] [blame] | 2 | * Copyright (c) 2016, Alliance for Open Media. All rights reserved |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 3 | * |
Yaowu Xu | bde4ac8 | 2016-11-28 15:26:06 -0800 | [diff] [blame] | 4 | * This source code is subject to the terms of the BSD 2 Clause License and |
| 5 | * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License |
| 6 | * was not distributed with this source code in the LICENSE file, you can |
| 7 | * obtain it at www.aomedia.org/license/software. If the Alliance for Open |
| 8 | * Media Patent License 1.0 was not distributed with this source code in the |
| 9 | * PATENTS file, you can obtain it at www.aomedia.org/license/patent. |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 10 | */ |
| 11 | |
| 12 | #include <stdlib.h> |
| 13 | #include <emmintrin.h> |
| 14 | #include <tmmintrin.h> |
| 15 | |
| 16 | #include "aom_ports/mem.h" |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 17 | #include "./aom_config.h" |
| 18 | #include "aom/aom_integer.h" |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 19 | |
| 20 | static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) { |
| 21 | __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); |
| 22 | __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); |
| 23 | return _mm_unpacklo_epi64(temp1, temp2); |
| 24 | } |
| 25 | |
| 26 | static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) { |
| 27 | __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t *)ptr); |
| 28 | __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride)); |
| 29 | __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2); |
| 30 | temp1 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 2)); |
| 31 | temp2 = _mm_cvtsi32_si128(*(const uint32_t *)(ptr + stride * 3)); |
| 32 | temp1 = _mm_unpacklo_epi32(temp1, temp2); |
| 33 | return _mm_unpacklo_epi64(temp3, temp1); |
| 34 | } |
| 35 | |
// Forward declarations of the kernels defined below. Each computes a
// mask-weighted SAD between two pixel blocks:
//   sum over all pixels of m * |a - b|, then rounded and scaled by 1/64
// (mask values are assumed to be <= 64).

// Generic kernel for widths that are a multiple of 16.
static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height);

// Specialization for 8-pixel-wide blocks (2 rows per iteration).
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);

// Specialization for 4-pixel-wide blocks (4 rows per iteration).
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);
| 48 | |
// Emits the public entry point aom_masked_sad<m>x<n>_ssse3() for a block of
// width m (a multiple of 16) and height n, forwarding to the generic kernel.
#define MASKSADMXN_SSSE3(m, n)                                                 \
  unsigned int aom_masked_sad##m##x##n##_ssse3(                                \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride,  \
      const uint8_t *msk, int msk_stride) {                                    \
    return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
                            m, n);                                             \
  }

// Instantiate the exported functions for every supported block size with
// width >= 16.
#if CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(128, 128)
MASKSADMXN_SSSE3(128, 64)
MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
MASKSADMXN_SSSE3(64, 64)
MASKSADMXN_SSSE3(64, 32)
MASKSADMXN_SSSE3(32, 64)
MASKSADMXN_SSSE3(32, 32)
MASKSADMXN_SSSE3(32, 16)
MASKSADMXN_SSSE3(16, 32)
MASKSADMXN_SSSE3(16, 16)
MASKSADMXN_SSSE3(16, 8)
| 70 | |
// Emits the public entry point aom_masked_sad8x<n>_ssse3() for an
// 8-pixel-wide block of height n, forwarding to the width-8 kernel.
#define MASKSAD8XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad8x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD8XN_SSSE3(16)
MASKSAD8XN_SSSE3(8)
MASKSAD8XN_SSSE3(4)
| 82 | |
// Emits the public entry point aom_masked_sad4x<n>_ssse3() for a
// 4-pixel-wide block of height n, forwarding to the width-4 kernel.
#define MASKSAD4XN_SSSE3(n)                                                   \
  unsigned int aom_masked_sad4x##n##_ssse3(                                   \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,         \
                               msk_stride, n);                                \
  }

MASKSAD4XN_SSSE3(8)
MASKSAD4XN_SSSE3(4)
| 93 | |
| 94 | // For width a multiple of 16 |
| 95 | // Assumes values in m are <=64 |
| 96 | static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride, |
| 97 | const uint8_t *b_ptr, int b_stride, |
| 98 | const uint8_t *m_ptr, int m_stride, |
| 99 | int width, int height) { |
| 100 | int y, x; |
| 101 | __m128i a, b, m, temp1, temp2; |
| 102 | __m128i res = _mm_setzero_si128(); |
| 103 | __m128i one = _mm_set1_epi16(1); |
| 104 | // For each row |
| 105 | for (y = 0; y < height; y++) { |
| 106 | // Covering the full width |
| 107 | for (x = 0; x < width; x += 16) { |
| 108 | // Load a, b, m in xmm registers |
| 109 | a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); |
| 110 | b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); |
| 111 | m = _mm_loadu_si128((const __m128i *)(m_ptr + x)); |
| 112 | |
| 113 | // Calculate the difference between a & b |
| 114 | temp1 = _mm_subs_epu8(a, b); |
| 115 | temp2 = _mm_subs_epu8(b, a); |
| 116 | temp1 = _mm_or_si128(temp1, temp2); |
| 117 | |
| 118 | // Multiply by m and add together |
| 119 | temp2 = _mm_maddubs_epi16(temp1, m); |
| 120 | // Pad out row result to 32 bit integers & add to running total |
| 121 | res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one)); |
| 122 | } |
| 123 | // Move onto the next row |
| 124 | a_ptr += a_stride; |
| 125 | b_ptr += b_stride; |
| 126 | m_ptr += m_stride; |
| 127 | } |
| 128 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 129 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 130 | // sad = (sad + 31) >> 6; |
| 131 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 132 | } |
| 133 | |
| 134 | static INLINE unsigned int masked_sad8xh_ssse3( |
| 135 | const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, |
| 136 | const uint8_t *m_ptr, int m_stride, int height) { |
| 137 | int y; |
| 138 | __m128i a, b, m, temp1, temp2, row_res; |
| 139 | __m128i res = _mm_setzero_si128(); |
| 140 | __m128i one = _mm_set1_epi16(1); |
| 141 | // Add the masked SAD for 2 rows at a time |
| 142 | for (y = 0; y < height; y += 2) { |
| 143 | // Load a, b, m in xmm registers |
| 144 | a = width8_load_2rows(a_ptr, a_stride); |
| 145 | b = width8_load_2rows(b_ptr, b_stride); |
| 146 | m = width8_load_2rows(m_ptr, m_stride); |
| 147 | |
| 148 | // Calculate the difference between a & b |
| 149 | temp1 = _mm_subs_epu8(a, b); |
| 150 | temp2 = _mm_subs_epu8(b, a); |
| 151 | temp1 = _mm_or_si128(temp1, temp2); |
| 152 | |
| 153 | // Multiply by m and add together |
| 154 | row_res = _mm_maddubs_epi16(temp1, m); |
| 155 | |
| 156 | // Pad out row result to 32 bit integers & add to running total |
| 157 | res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); |
| 158 | |
| 159 | // Move onto the next rows |
| 160 | a_ptr += a_stride * 2; |
| 161 | b_ptr += b_stride * 2; |
| 162 | m_ptr += m_stride * 2; |
| 163 | } |
| 164 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 165 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 166 | // sad = (sad + 31) >> 6; |
| 167 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 168 | } |
| 169 | |
| 170 | static INLINE unsigned int masked_sad4xh_ssse3( |
| 171 | const uint8_t *a_ptr, int a_stride, const uint8_t *b_ptr, int b_stride, |
| 172 | const uint8_t *m_ptr, int m_stride, int height) { |
| 173 | int y; |
| 174 | __m128i a, b, m, temp1, temp2, row_res; |
| 175 | __m128i res = _mm_setzero_si128(); |
| 176 | __m128i one = _mm_set1_epi16(1); |
| 177 | // Add the masked SAD for 4 rows at a time |
| 178 | for (y = 0; y < height; y += 4) { |
| 179 | // Load a, b, m in xmm registers |
| 180 | a = width4_load_4rows(a_ptr, a_stride); |
| 181 | b = width4_load_4rows(b_ptr, b_stride); |
| 182 | m = width4_load_4rows(m_ptr, m_stride); |
| 183 | |
| 184 | // Calculate the difference between a & b |
| 185 | temp1 = _mm_subs_epu8(a, b); |
| 186 | temp2 = _mm_subs_epu8(b, a); |
| 187 | temp1 = _mm_or_si128(temp1, temp2); |
| 188 | |
| 189 | // Multiply by m and add together |
| 190 | row_res = _mm_maddubs_epi16(temp1, m); |
| 191 | |
| 192 | // Pad out row result to 32 bit integers & add to running total |
| 193 | res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one)); |
| 194 | |
| 195 | // Move onto the next rows |
| 196 | a_ptr += a_stride * 4; |
| 197 | b_ptr += b_stride * 4; |
| 198 | m_ptr += m_stride * 4; |
| 199 | } |
| 200 | // Pad out row result to 32 bit integers & add to running total |
| 201 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 202 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 203 | // sad = (sad + 31) >> 6; |
| 204 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 205 | } |
| 206 | |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 207 | #if CONFIG_AOM_HIGHBITDEPTH |
Yaowu Xu | c27fc14 | 2016-08-22 16:08:15 -0700 | [diff] [blame] | 208 | static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr, |
| 209 | int stride) { |
| 210 | __m128i temp1 = _mm_loadl_epi64((const __m128i *)ptr); |
| 211 | __m128i temp2 = _mm_loadl_epi64((const __m128i *)(ptr + stride)); |
| 212 | return _mm_unpacklo_epi64(temp1, temp2); |
| 213 | } |
| 214 | |
// Forward declarations of the high-bitdepth kernels defined below.
// Pixel pointers are passed as uint8_t* and converted to uint16_t* with
// CONVERT_TO_SHORTPTR inside the kernels; the mask remains 8-bit.

// Generic high-bitdepth kernel for widths that are a multiple of 8.
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int width, int height);

// Specialization for 4-pixel-wide blocks (2 rows per iteration).
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride,
    const uint8_t *m_ptr, int m_stride, int height);
| 222 | |
// Emits the public entry point aom_highbd_masked_sad<m>x<n>_ssse3() for a
// block of width m (a multiple of 8) and height n, forwarding to the
// generic high-bitdepth kernel.
#define HIGHBD_MASKSADMXN_SSSE3(m, n)                                         \
  unsigned int aom_highbd_masked_sad##m##x##n##_ssse3(                        \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk,     \
                                   msk_stride, m, n);                         \
  }

// Instantiate the exported functions for every supported block size with
// width >= 8.
#if CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(128, 128)
HIGHBD_MASKSADMXN_SSSE3(128, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 128)
#endif  // CONFIG_EXT_PARTITION
HIGHBD_MASKSADMXN_SSSE3(64, 64)
HIGHBD_MASKSADMXN_SSSE3(64, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 64)
HIGHBD_MASKSADMXN_SSSE3(32, 32)
HIGHBD_MASKSADMXN_SSSE3(32, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 32)
HIGHBD_MASKSADMXN_SSSE3(16, 16)
HIGHBD_MASKSADMXN_SSSE3(16, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 16)
HIGHBD_MASKSADMXN_SSSE3(8, 8)
HIGHBD_MASKSADMXN_SSSE3(8, 4)
| 247 | |
// Emits the public entry point aom_highbd_masked_sad4x<n>_ssse3() for a
// 4-pixel-wide block of height n, forwarding to the width-4 kernel.
#define HIGHBD_MASKSAD4XN_SSSE3(n)                                            \
  unsigned int aom_highbd_masked_sad4x##n##_ssse3(                            \
      const uint8_t *src, int src_stride, const uint8_t *ref, int ref_stride, \
      const uint8_t *msk, int msk_stride) {                                   \
    return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk,  \
                                      msk_stride, n);                         \
  }

HIGHBD_MASKSAD4XN_SSSE3(8)
HIGHBD_MASKSAD4XN_SSSE3(4)
| 258 | |
| 259 | // For width a multiple of 8 |
| 260 | // Assumes values in m are <=64 |
| 261 | static INLINE unsigned int highbd_masked_sad_ssse3( |
| 262 | const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, |
| 263 | const uint8_t *m_ptr, int m_stride, int width, int height) { |
| 264 | int y, x; |
| 265 | __m128i a, b, m, temp1, temp2; |
| 266 | const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); |
| 267 | const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); |
| 268 | __m128i res = _mm_setzero_si128(); |
| 269 | // For each row |
| 270 | for (y = 0; y < height; y++) { |
| 271 | // Covering the full width |
| 272 | for (x = 0; x < width; x += 8) { |
| 273 | // Load a, b, m in xmm registers |
| 274 | a = _mm_loadu_si128((const __m128i *)(a_ptr + x)); |
| 275 | b = _mm_loadu_si128((const __m128i *)(b_ptr + x)); |
| 276 | m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(m_ptr + x)), |
| 277 | _mm_setzero_si128()); |
| 278 | |
| 279 | // Calculate the difference between a & b |
| 280 | temp1 = _mm_subs_epu16(a, b); |
| 281 | temp2 = _mm_subs_epu16(b, a); |
| 282 | temp1 = _mm_or_si128(temp1, temp2); |
| 283 | |
| 284 | // Add result of multiplying by m and add pairs together to running total |
| 285 | res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); |
| 286 | } |
| 287 | // Move onto the next row |
| 288 | a_ptr += a_stride; |
| 289 | b_ptr += b_stride; |
| 290 | m_ptr += m_stride; |
| 291 | } |
| 292 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 293 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 294 | // sad = (sad + 31) >> 6; |
| 295 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 296 | } |
| 297 | |
| 298 | static INLINE unsigned int highbd_masked_sad4xh_ssse3( |
| 299 | const uint8_t *a8_ptr, int a_stride, const uint8_t *b8_ptr, int b_stride, |
| 300 | const uint8_t *m_ptr, int m_stride, int height) { |
| 301 | int y; |
| 302 | __m128i a, b, m, temp1, temp2; |
| 303 | const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr); |
| 304 | const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr); |
| 305 | __m128i res = _mm_setzero_si128(); |
| 306 | // Add the masked SAD for 2 rows at a time |
| 307 | for (y = 0; y < height; y += 2) { |
| 308 | // Load a, b, m in xmm registers |
| 309 | a = highbd_width4_load_2rows(a_ptr, a_stride); |
| 310 | b = highbd_width4_load_2rows(b_ptr, b_stride); |
| 311 | temp1 = _mm_loadl_epi64((const __m128i *)m_ptr); |
| 312 | temp2 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)); |
| 313 | m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2), |
| 314 | _mm_setzero_si128()); |
| 315 | |
| 316 | // Calculate the difference between a & b |
| 317 | temp1 = _mm_subs_epu16(a, b); |
| 318 | temp2 = _mm_subs_epu16(b, a); |
| 319 | temp1 = _mm_or_si128(temp1, temp2); |
| 320 | |
| 321 | // Multiply by m and add together |
| 322 | res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m)); |
| 323 | |
| 324 | // Move onto the next rows |
| 325 | a_ptr += a_stride * 2; |
| 326 | b_ptr += b_stride * 2; |
| 327 | m_ptr += m_stride * 2; |
| 328 | } |
| 329 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 330 | res = _mm_hadd_epi32(res, _mm_setzero_si128()); |
| 331 | // sad = (sad + 31) >> 6; |
| 332 | return (_mm_cvtsi128_si32(res) + 31) >> 6; |
| 333 | } |
Yaowu Xu | f883b42 | 2016-08-30 14:01:10 -0700 | [diff] [blame] | 334 | #endif // CONFIG_AOM_HIGHBITDEPTH |