/*
 * Copyright (c) 2017, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/intrapred_common.h"

// -----------------------------------------------------------------------------
// PAETH_PRED

// Return 8 16-bit pixels in one row
static INLINE __m128i paeth_8x1_pred(const __m128i *left, const __m128i *top,
                                     const __m128i *topleft) {
  const __m128i base = _mm_sub_epi16(_mm_add_epi16(*top, *left), *topleft);

  __m128i pl = _mm_abs_epi16(_mm_sub_epi16(base, *left));
  __m128i pt = _mm_abs_epi16(_mm_sub_epi16(base, *top));
  __m128i ptl = _mm_abs_epi16(_mm_sub_epi16(base, *topleft));

  __m128i mask1 = _mm_cmpgt_epi16(pl, pt);
  mask1 = _mm_or_si128(mask1, _mm_cmpgt_epi16(pl, ptl));
  __m128i mask2 = _mm_cmpgt_epi16(pt, ptl);

  pl = _mm_andnot_si128(mask1, *left);

  ptl = _mm_and_si128(mask2, *topleft);
  pt = _mm_andnot_si128(mask2, *top);
  pt = _mm_or_si128(pt, ptl);
  pt = _mm_and_si128(mask1, pt);

  return _mm_or_si128(pl, pt);
}

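// For reference only: an unused scalar sketch of the Paeth selection that the
// masks above implement (the helper name is illustrative, not an AV1 API).
// mask1 rejects 'left' whenever pl > pt or pl > ptl, and mask2 chooses between
// top and top-left, so ties resolve in the order left, top, top-left.
static INLINE uint8_t paeth_scalar_sketch(uint8_t left, uint8_t top,
                                          uint8_t topleft) {
  const int base = (int)top + (int)left - (int)topleft;
  const int dl = base - (int)left;
  const int dt = base - (int)top;
  const int dtl = base - (int)topleft;
  const int pl = dl < 0 ? -dl : dl;
  const int pt = dt < 0 ? -dt : dt;
  const int ptl = dtl < 0 ? -dtl : dtl;
  if (pl <= pt && pl <= ptl) return left;
  return (pt <= ptl) ? top : topleft;
}
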
void aom_paeth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
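  // 'rep' is a shuffle control for _mm_shuffle_epi8: in every 16-bit lane the
  // low byte is the index of the left-column pixel to broadcast and the high
  // byte is 0x80, which zero-fills that byte. Adding 'one' per row advances
  // the broadcast from left[0] to left[1], and so on. The same pattern is
  // reused by the Paeth and smooth predictors below.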
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    *(uint32_t *)dst = _mm_cvtsi128_si32(_mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                   const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

    _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i t = _mm_loadl_epi64((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i t16 = _mm_unpacklo_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_8x1_pred(&l16, &t16, &tl16);

      _mm_storel_epi64((__m128i *)dst, _mm_packus_epi16(row, row));
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

// Return 16 8-bit pixels in one row
static INLINE __m128i paeth_16x1_pred(const __m128i *left, const __m128i *top0,
                                      const __m128i *top1,
                                      const __m128i *topleft) {
  const __m128i p0 = paeth_8x1_pred(left, top0, topleft);
  const __m128i p1 = paeth_8x1_pred(left, top1, topleft);
  return _mm_packus_epi16(p0, p1);
}

void aom_paeth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  for (int i = 0; i < 4; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i l = _mm_loadl_epi64((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 8; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);

  int i;
  for (i = 0; i < 16; ++i) {
    const __m128i l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i l = _mm_load_si128((const __m128i *)left);
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  l = _mm_load_si128((const __m128i *)(left + 16));
  rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);

    _mm_store_si128((__m128i *)dst, row);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i t = _mm_load_si128((const __m128i *)above);
  const __m128i zero = _mm_setzero_si128();
  const __m128i top0 = _mm_unpacklo_epi8(t, zero);
  const __m128i top1 = _mm_unpackhi_epi8(t, zero);
  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);

  for (int j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (int i = 0; i < 16; ++i) {
      const __m128i l16 = _mm_shuffle_epi8(l, rep);
      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
      _mm_store_si128((__m128i *)dst, row);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  const __m128i l = _mm_loadl_epi64((const __m128i *)left);
  __m128i l16;

  for (int i = 0; i < 8; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  __m128i rep = _mm_set1_epi16(0x8000);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i l16;

  int i;
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }

  rep = _mm_set1_epi16(0x8000);
  l = _mm_load_si128((const __m128i *)(left + 16));
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

    _mm_store_si128((__m128i *)dst, r32l);
    _mm_store_si128((__m128i *)(dst + 16), r32h);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

void aom_paeth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r32l = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r32h = paeth_16x1_pred(&l16, &bl, &bh, &tl16);

      _mm_store_si128((__m128i *)dst, r32l);
      _mm_store_si128((__m128i *)(dst + 16), r32h);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 2; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i, j;
  for (j = 0; j < 4; ++j) {
    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
    __m128i rep = _mm_set1_epi16(0x8000);
    for (i = 0; i < 16; ++i) {
      l16 = _mm_shuffle_epi8(l, rep);
      const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
      const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
      const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
      const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

      _mm_store_si128((__m128i *)dst, r0);
      _mm_store_si128((__m128i *)(dst + 16), r1);
      _mm_store_si128((__m128i *)(dst + 32), r2);
      _mm_store_si128((__m128i *)(dst + 48), r3);
      dst += stride;
      rep = _mm_add_epi16(rep, one);
    }
  }
}

void aom_paeth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  const __m128i a = _mm_load_si128((const __m128i *)above);
  const __m128i b = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i c = _mm_load_si128((const __m128i *)(above + 32));
  const __m128i d = _mm_load_si128((const __m128i *)(above + 48));
  const __m128i zero = _mm_setzero_si128();
  const __m128i al = _mm_unpacklo_epi8(a, zero);
  const __m128i ah = _mm_unpackhi_epi8(a, zero);
  const __m128i bl = _mm_unpacklo_epi8(b, zero);
  const __m128i bh = _mm_unpackhi_epi8(b, zero);
  const __m128i cl = _mm_unpacklo_epi8(c, zero);
  const __m128i ch = _mm_unpackhi_epi8(c, zero);
  const __m128i dl = _mm_unpacklo_epi8(d, zero);
  const __m128i dh = _mm_unpackhi_epi8(d, zero);

  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);
  const __m128i one = _mm_set1_epi16(1);
  __m128i l16;

  int i;
  const __m128i l = _mm_load_si128((const __m128i *)left);
  __m128i rep = _mm_set1_epi16(0x8000);
  for (i = 0; i < 16; ++i) {
    l16 = _mm_shuffle_epi8(l, rep);
    const __m128i r0 = paeth_16x1_pred(&l16, &al, &ah, &tl16);
    const __m128i r1 = paeth_16x1_pred(&l16, &bl, &bh, &tl16);
    const __m128i r2 = paeth_16x1_pred(&l16, &cl, &ch, &tl16);
    const __m128i r3 = paeth_16x1_pred(&l16, &dl, &dh, &tl16);

    _mm_store_si128((__m128i *)dst, r0);
    _mm_store_si128((__m128i *)(dst + 16), r1);
    _mm_store_si128((__m128i *)(dst + 32), r2);
    _mm_store_si128((__m128i *)(dst + 48), r3);
    dst += stride;
    rep = _mm_add_epi16(rep, one);
  }
}

// -----------------------------------------------------------------------------
// SMOOTH_PRED

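// For reference only: an unused scalar sketch of the value every SMOOTH
// predictor below computes for pixel (r, c). It blends a vertical
// interpolation (above[c] toward the bottom-left pixel) with a horizontal one
// (left[r] toward the top-right pixel) using the sm_weight_arrays tables from
// intrapred_common.h. The helper name is illustrative, not an AV1 API.
static INLINE uint8_t smooth_scalar_sketch(const uint8_t *above,
                                           const uint8_t *left, int bw, int bh,
                                           int r, int c) {
  const uint8_t *const w_h = sm_weight_arrays + bh;
  const uint8_t *const w_w = sm_weight_arrays + bw;
  const int scale = 1 << sm_weight_log2_scale;
  const int pred = w_h[r] * above[c] + (scale - w_h[r]) * left[bh - 1] +
                   w_w[c] * left[r] + (scale - w_w[c]) * above[bw - 1];
  return (uint8_t)((pred + (1 << sm_weight_log2_scale)) >>
                   (1 + sm_weight_log2_scale));
}
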
// pixels[0]: above and below_pred interleave vector
// pixels[1]: left vector
// pixels[2]: right_pred vector
static INLINE void load_pixel_w4(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  if (height == 4)
    pixels[1] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  else if (height == 8)
    pixels[1] = _mm_loadl_epi64(((const __m128i *)left));
  else
    pixels[1] = _mm_loadu_si128(((const __m128i *)left));

  pixels[2] = _mm_set1_epi16((uint16_t)above[3]);

  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  const __m128i zero = _mm_setzero_si128();
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], second half for height = 16 only
// weight_h[3]: same as [1], second half for height = 16 only
// weight_w[0]: weights_w and scale - weights_w interleave vector
static INLINE void load_weight_w4(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i t = _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
  weight_h[0] = _mm_unpacklo_epi8(t, zero);
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);

  if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  }
}

static INLINE void smooth_pred_4xh(const __m128i *pixel, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

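  // Per-row selectors: 'd' picks the 16-bit height weight for row i out of
  // wh[0]/wh[1] ('inc' bumps the byte indices by 2 each row), 'rep' broadcasts
  // left[i] (or left[i + 8] for the second half) out of pixel[1], and 'gat'
  // gathers the low byte of each 32-bit sum into the four output pixels.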
  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s = _mm_madd_epi16(pixel[0], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixel[1], rep);
    b = _mm_unpacklo_epi16(b, pixel[2]);
    __m128i sum = _mm_madd_epi16(b, ww[0]);

    sum = _mm_add_epi32(s, sum);
    sum = _mm_add_epi32(sum, round);
    sum = _mm_srai_epi32(sum, 1 + sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 4, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 8, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[3];
  load_pixel_w4(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w4(sm_weight_arrays, 16, wh, ww);

  smooth_pred_4xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_4xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
// pixels[2]: left vector
// pixels[3]: right_pred vector
// pixels[4]: above and below_pred interleave vector, first half
// pixels[5]: above and below_pred interleave vector, second half
// pixels[6]: left vector + 16
// pixels[7]: right_pred vector
static INLINE void load_pixel_w8(const uint8_t *above, const uint8_t *left,
                                 int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);

  pixels[3] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[2] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[2] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[2] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[2] = _mm_load_si128((const __m128i *)left);
    pixels[4] = pixels[0];
    pixels[5] = pixels[1];
    pixels[6] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[7] = pixels[3];
  }
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_w8(const uint8_t *weight_array, int height,
                                  __m128i *weight_h, __m128i *weight_w) {
  const __m128i zero = _mm_setzero_si128();
  const int we_offset = height < 8 ? 4 : 8;
  __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[we_offset]);
  weight_h[0] = _mm_unpacklo_epi8(we, zero);
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  weight_h[1] = _mm_sub_epi16(d, weight_h[0]);

  if (height == 4) {
    we = _mm_srli_si128(we, 4);
    __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
    __m128i tmp2 = _mm_sub_epi16(d, tmp1);
    weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
    weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
  } else {
    weight_w[0] = _mm_unpacklo_epi16(weight_h[0], weight_h[1]);
    weight_w[1] = _mm_unpackhi_epi16(weight_h[0], weight_h[1]);
  }

  if (height == 16) {
    we = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(we, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(we, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else if (height == 32) {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                   const __m128i *ww, int h, uint8_t *dst,
                                   ptrdiff_t stride, int second_half) {
  const __m128i round = _mm_set1_epi32((1 << sm_weight_log2_scale));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);

  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
  __m128i d = _mm_set1_epi16(0x100);

  int i;
  for (i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    __m128i b = _mm_shuffle_epi8(pixels[2], rep);
    b = _mm_unpacklo_epi16(b, pixels[3]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    s0 = _mm_add_epi32(s0, sum0);
    s0 = _mm_add_epi32(s0, round);
    s0 = _mm_srai_epi32(s0, 1 + sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, sum1);
    s1 = _mm_add_epi32(s1, round);
    s1 = _mm_srai_epi32(s1, 1 + sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(s0, s1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 4, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 4, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 4, dst, stride, 0);
}

void aom_smooth_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                    const uint8_t *above, const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 8, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 8, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
}

void aom_smooth_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_w8(above, left, 16, pixels);

  __m128i wh[4], ww[2];
  load_weight_w8(sm_weight_arrays, 16, wh, ww);

  smooth_pred_8xh(pixels, wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(pixels, &wh[2], ww, 8, dst, stride, 1);
}

void aom_smooth_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  __m128i pixels[8];
  load_pixel_w8(above, left, 32, pixels);

  __m128i wh[8], ww[2];
  load_weight_w8(sm_weight_arrays, 32, wh, ww);

  smooth_pred_8xh(&pixels[0], wh, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[0], &wh[2], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[4], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_pred_8xh(&pixels[4], &wh[6], ww, 8, dst, stride, 1);
}

static INLINE void smooth_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left, uint32_t bw,
                                        uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i bottom_left = _mm_cvtsi32_si128((uint32_t)left[bh - 1]);
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i top_right =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)above[bw - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round = _mm_set1_epi32((uint16_t)(1 << sm_weight_log2_scale));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i scale_m_weights_y = _mm_sub_epi16(scale_value, weights_y);
    __m128i pred_scaled_bl = _mm_mullo_epi16(scale_m_weights_y, bottom_left);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, left_y), 0);
    pred_scaled_bl = _mm_add_epi32(pred_scaled_bl, round);
    pred_scaled_bl = _mm_shuffle_epi32(pred_scaled_bl, 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, weights_x);
      const __m128i tw_x_lo = _mm_unpacklo_epi8(tw_x, zero);
      const __m128i tw_x_hi = _mm_unpackhi_epi8(tw_x, zero);

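      // The madd pairs (top[x], weights_w[x]) with (weights_h[y], left[y]), so
      // each 32-bit lane holds top[x] * weights_h[y] + weights_w[x] * left[y].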
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      const __m128i scale_m_weights_x =
          _mm_sub_epi16(scale_value, _mm_unpacklo_epi8(weights_x, zero));
      const __m128i swxtr = _mm_mullo_epi16(scale_m_weights_x, top_right);
      const __m128i swxtr_lo = _mm_unpacklo_epi16(swxtr, zero);
      const __m128i swxtr_hi = _mm_unpackhi_epi16(swxtr, zero);

      pred_lo = _mm_add_epi32(pred_lo, pred_scaled_bl);
      pred_hi = _mm_add_epi32(pred_hi, pred_scaled_bl);

      pred_lo = _mm_add_epi32(pred_lo, swxtr_lo);
      pred_hi = _mm_add_epi32(pred_hi, swxtr_hi);

      pred_lo = _mm_srai_epi32(pred_lo, (1 + sm_weight_log2_scale));
      pred_hi = _mm_srai_epi32(pred_hi, (1 + sm_weight_log2_scale));

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                     const uint8_t *above,
                                     const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  smooth_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_V_PRED

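// For reference only: an unused scalar sketch of the SMOOTH_V prediction
// computed below, interpolating above[c] toward the bottom-left pixel with
// the height weights from sm_weight_arrays (the helper name is illustrative).
static INLINE uint8_t smooth_v_scalar_sketch(const uint8_t *above,
                                             const uint8_t *left, int bh,
                                             int r, int c) {
  const uint8_t *const w_h = sm_weight_arrays + bh;
  const int scale = 1 << sm_weight_log2_scale;
  const int pred = w_h[r] * above[c] + (scale - w_h[r]) * left[bh - 1];
  return (uint8_t)((pred + (1 << (sm_weight_log2_scale - 1))) >>
                   sm_weight_log2_scale);
}
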
// pixels[0]: above and below_pred interleave vector
static INLINE void load_pixel_v_w4(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_cvtsi32_si128(((const uint32_t *)above)[0]);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
}

// weights[0]: weights_h vector
// weights[1]: scale - weights_h vector
static INLINE void load_weight_v_w4(const uint8_t *weight_array, int height,
                                    __m128i *weights) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height == 4) {
    const __m128i weight =
        _mm_cvtsi32_si128(((const uint32_t *)weight_array)[1]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else if (height == 8) {
    const __m128i weight = _mm_loadl_epi64((const __m128i *)&weight_array[8]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
  } else {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weights[0] = _mm_unpacklo_epi8(weight, zero);
    weights[1] = _mm_sub_epi16(d, weights[0]);
    weights[2] = _mm_unpackhi_epi8(weight, zero);
    weights[3] = _mm_sub_epi16(d, weights[2]);
  }
}

static INLINE void smooth_v_pred_4xh(const __m128i *pixel,
                                     const __m128i *weight, int h,
                                     uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set1_epi32(0xc080400);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(weight[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(weight[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i sum = _mm_madd_epi16(pixel[0], wh_sc);
    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);
    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;
    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 4, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 4, weights);

  smooth_v_pred_4xh(&pixels, weights, 4, dst, stride);
}

void aom_smooth_v_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 8, &pixels);

  __m128i weights[2];
  load_weight_v_w4(sm_weight_arrays, 8, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
}

void aom_smooth_v_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels;
  load_pixel_v_w4(above, left, 16, &pixels);

  __m128i weights[4];
  load_weight_v_w4(sm_weight_arrays, 16, weights);

  smooth_v_pred_4xh(&pixels, weights, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_4xh(&pixels, &weights[2], 8, dst, stride);
}

// pixels[0]: above and below_pred interleave vector, first half
// pixels[1]: above and below_pred interleave vector, second half
static INLINE void load_pixel_v_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  const __m128i zero = _mm_setzero_si128();
  __m128i d = _mm_loadl_epi64((const __m128i *)above);
  const __m128i bp = _mm_set1_epi16((uint16_t)left[height - 1]);
  d = _mm_unpacklo_epi8(d, zero);
  pixels[0] = _mm_unpacklo_epi16(d, bp);
  pixels[1] = _mm_unpackhi_epi16(d, bp);
}

// weight_h[0]: weight_h vector
// weight_h[1]: scale - weight_h vector
// weight_h[2]: same as [0], offset 8
// weight_h[3]: same as [1], offset 8
// weight_h[4]: same as [0], offset 16
// weight_h[5]: same as [1], offset 16
// weight_h[6]: same as [0], offset 24
// weight_h[7]: same as [1], offset 24
static INLINE void load_weight_v_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_h) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));

  if (height < 16) {
    const int offset = height < 8 ? 4 : 8;
    const __m128i weight =
        _mm_loadu_si128((const __m128i *)&weight_array[offset]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
  } else if (height == 16) {
    const __m128i weight = _mm_loadu_si128((const __m128i *)&weight_array[16]);
    weight_h[0] = _mm_unpacklo_epi8(weight, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
  } else {
    const __m128i weight_lo =
        _mm_loadu_si128((const __m128i *)&weight_array[32]);
    weight_h[0] = _mm_unpacklo_epi8(weight_lo, zero);
    weight_h[1] = _mm_sub_epi16(d, weight_h[0]);
    weight_h[2] = _mm_unpackhi_epi8(weight_lo, zero);
    weight_h[3] = _mm_sub_epi16(d, weight_h[2]);
    const __m128i weight_hi =
        _mm_loadu_si128((const __m128i *)&weight_array[32 + 16]);
    weight_h[4] = _mm_unpacklo_epi8(weight_hi, zero);
    weight_h[5] = _mm_sub_epi16(d, weight_h[4]);
    weight_h[6] = _mm_unpackhi_epi8(weight_hi, zero);
    weight_h[7] = _mm_sub_epi16(d, weight_h[6]);
  }
}

static INLINE void smooth_v_pred_8xh(const __m128i *pixels, const __m128i *wh,
                                     int h, uint8_t *dst, ptrdiff_t stride) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i inc = _mm_set1_epi16(0x202);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i d = _mm_set1_epi16(0x100);

  for (int i = 0; i < h; ++i) {
    const __m128i wg_wg = _mm_shuffle_epi8(wh[0], d);
    const __m128i sc_sc = _mm_shuffle_epi8(wh[1], d);
    const __m128i wh_sc = _mm_unpacklo_epi16(wg_wg, sc_sc);
    __m128i s0 = _mm_madd_epi16(pixels[0], wh_sc);
    __m128i s1 = _mm_madd_epi16(pixels[1], wh_sc);

    s0 = _mm_add_epi32(s0, pred_round);
    s0 = _mm_srai_epi32(s0, sm_weight_log2_scale);

    s1 = _mm_add_epi32(s1, pred_round);
    s1 = _mm_srai_epi32(s1, sm_weight_log2_scale);

    __m128i sum01 = _mm_packus_epi16(s0, s1);
    sum01 = _mm_shuffle_epi8(sum01, gat);
    _mm_storel_epi64((__m128i *)dst, sum01);
    dst += stride;

    d = _mm_add_epi16(d, inc);
  }
}

void aom_smooth_v_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 4, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 4, wh);

  smooth_v_pred_8xh(pixels, wh, 4, dst, stride);
}

void aom_smooth_v_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 8, pixels);

  __m128i wh[2];
  load_weight_v_w8(sm_weight_arrays, 8, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
}

void aom_smooth_v_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 16, pixels);

  __m128i wh[4];
  load_weight_v_w8(sm_weight_arrays, 16, wh);

  smooth_v_pred_8xh(pixels, wh, 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
}

void aom_smooth_v_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_v_w8(above, left, 32, pixels);

  __m128i wh[8];
  load_weight_v_w8(sm_weight_arrays, 32, wh);

  smooth_v_pred_8xh(pixels, &wh[0], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[2], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[4], 8, dst, stride);
  dst += stride << 3;
  smooth_v_pred_8xh(pixels, &wh[6], 8, dst, stride);
}

static INLINE void smooth_v_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_h = sm_weight_arrays + bh;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i dup16 = _mm_set1_epi32(0x01000100);
  const __m128i bottom_left =
      _mm_shuffle_epi8(_mm_cvtsi32_si128((uint32_t)left[bh - 1]), dup16);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i round =
      _mm_set1_epi32((uint16_t)(1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i weights_y = _mm_cvtsi32_si128((uint32_t)sm_weights_h[y]);
    const __m128i scale_m_weights_y =
        _mm_shuffle_epi8(_mm_sub_epi16(scale_value, weights_y), dup16);
    const __m128i wl_y =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(weights_y, bottom_left), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i top_x = _mm_loadl_epi64((const __m128i *)(above + x));
      // 8 -> 16
      const __m128i tw_x = _mm_unpacklo_epi8(top_x, zero);
      const __m128i tw_x_lo = _mm_unpacklo_epi16(tw_x, scale_m_weights_y);
      const __m128i tw_x_hi = _mm_unpackhi_epi16(tw_x, scale_m_weights_y);
      // top_x * weights_y + scale_m_weights_y * bottom_left
      __m128i pred_lo = _mm_madd_epi16(tw_x_lo, wl_y);
      __m128i pred_hi = _mm_madd_epi16(tw_x_hi, wl_y);

      pred_lo = _mm_add_epi32(pred_lo, round);
      pred_hi = _mm_add_epi32(pred_hi, round);
      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_v_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_v_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_v_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_v_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_v_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_v_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_v_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_v_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_v_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_v_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_v_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 64, 16);
}

void aom_smooth_v_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_v_predictor_wxh(dst, stride, above, left, 16, 64);
}

// -----------------------------------------------------------------------------
// SMOOTH_H_PRED

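// For reference only: an unused scalar sketch of the SMOOTH_H prediction
// computed below, interpolating left[r] toward the top-right pixel with the
// width weights from sm_weight_arrays (the helper name is illustrative).
static INLINE uint8_t smooth_h_scalar_sketch(const uint8_t *above,
                                             const uint8_t *left, int bw,
                                             int r, int c) {
  const uint8_t *const w_w = sm_weight_arrays + bw;
  const int scale = 1 << sm_weight_log2_scale;
  const int pred = w_w[c] * left[r] + (scale - w_w[c]) * above[bw - 1];
  return (uint8_t)((pred + (1 << (sm_weight_log2_scale - 1))) >>
                   sm_weight_log2_scale);
}
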
1371// pixels[0]: left vector
1372// pixels[1]: right_pred vector
1373static INLINE void load_pixel_h_w4(const uint8_t *above, const uint8_t *left,
1374 int height, __m128i *pixels) {
1375 if (height == 4)
1376 pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
1377 else if (height == 8)
1378 pixels[0] = _mm_loadl_epi64(((const __m128i *)left));
1379 else
1380 pixels[0] = _mm_loadu_si128(((const __m128i *)left));
1381 pixels[1] = _mm_set1_epi16((uint16_t)above[3]);
1382}
1383
1384// weights[0]: weights_w and scale - weights_w interleave vector
1385static INLINE void load_weight_h_w4(const uint8_t *weight_array, int height,
1386 __m128i *weights) {
1387 (void)height;
1388 const __m128i t = _mm_loadu_si128((const __m128i *)&weight_array[4]);
1389 const __m128i zero = _mm_setzero_si128();
1390
1391 const __m128i weights_0 = _mm_unpacklo_epi8(t, zero);
1392 const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
1393 const __m128i weights_1 = _mm_sub_epi16(d, weights_0);
1394 weights[0] = _mm_unpacklo_epi16(weights_0, weights_1);
1395}
1396
1397static INLINE void smooth_h_pred_4xh(const __m128i *pixel,
1398 const __m128i *weight, int h, uint8_t *dst,
1399 ptrdiff_t stride) {
1400 const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
1401 const __m128i one = _mm_set1_epi16(1);
1402 const __m128i gat = _mm_set1_epi32(0xc080400);
1403 __m128i rep = _mm_set1_epi16(0x8000);

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixel[0], rep);
    b = _mm_unpacklo_epi16(b, pixel[1]);
    __m128i sum = _mm_madd_epi16(b, weight[0]);

    sum = _mm_add_epi32(sum, pred_round);
    sum = _mm_srai_epi32(sum, sm_weight_log2_scale);

    sum = _mm_shuffle_epi8(sum, gat);
    *(uint32_t *)dst = _mm_cvtsi128_si32(sum);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_4x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 4, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 4, &weights);

  smooth_h_pred_4xh(pixels, &weights, 4, dst, stride);
}

void aom_smooth_h_predictor_4x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 8, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

void aom_smooth_h_predictor_4x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w4(above, left, 16, pixels);

  __m128i weights;
  load_weight_h_w4(sm_weight_arrays, 8, &weights);

  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
  dst += stride << 3;

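  // Rows 8..15 reuse the same 8-row kernel: shifting the 16 left pixels down
  // by 8 bytes exposes left[8..15] at byte offsets 0..7, where the pshufb
  // selector in smooth_h_pred_4xh expects them.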
  pixels[0] = _mm_srli_si128(pixels[0], 8);
  smooth_h_pred_4xh(pixels, &weights, 8, dst, stride);
}

// pixels[0]: left vector
// pixels[1]: right_pred vector
// pixels[2]: left vector + 16
// pixels[3]: right_pred vector
static INLINE void load_pixel_h_w8(const uint8_t *above, const uint8_t *left,
                                   int height, __m128i *pixels) {
  pixels[1] = _mm_set1_epi16((uint16_t)above[7]);

  if (height == 4) {
    pixels[0] = _mm_cvtsi32_si128(((const uint32_t *)left)[0]);
  } else if (height == 8) {
    pixels[0] = _mm_loadl_epi64((const __m128i *)left);
  } else if (height == 16) {
    pixels[0] = _mm_load_si128((const __m128i *)left);
  } else {
    pixels[0] = _mm_load_si128((const __m128i *)left);
    pixels[2] = _mm_load_si128((const __m128i *)(left + 16));
    pixels[3] = pixels[1];
  }
}

// weight_w[0]: weights_w and scale - weights_w interleave vector, first half
// weight_w[1]: weights_w and scale - weights_w interleave vector, second half
static INLINE void load_weight_h_w8(const uint8_t *weight_array, int height,
                                    __m128i *weight_w) {
  (void)height;
  const __m128i zero = _mm_setzero_si128();
  const __m128i d = _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i we = _mm_loadu_si128((const __m128i *)&weight_array[8]);
  const __m128i tmp1 = _mm_unpacklo_epi8(we, zero);
  const __m128i tmp2 = _mm_sub_epi16(d, tmp1);
  weight_w[0] = _mm_unpacklo_epi16(tmp1, tmp2);
  weight_w[1] = _mm_unpackhi_epi16(tmp1, tmp2);
}

static INLINE void smooth_h_pred_8xh(const __m128i *pixels, const __m128i *ww,
                                     int h, uint8_t *dst, ptrdiff_t stride,
                                     int second_half) {
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));
  const __m128i one = _mm_set1_epi16(1);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  __m128i rep = second_half ? _mm_set1_epi16(0x8008) : _mm_set1_epi16(0x8000);
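  // For the second half of a tall block the selector starts at byte 8 (low
  // byte 0x08), so the broadcast walks left[8..15] of the same 16-byte left
  // vector instead of left[0..7].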

  for (int i = 0; i < h; ++i) {
    __m128i b = _mm_shuffle_epi8(pixels[0], rep);
    b = _mm_unpacklo_epi16(b, pixels[1]);
    __m128i sum0 = _mm_madd_epi16(b, ww[0]);
    __m128i sum1 = _mm_madd_epi16(b, ww[1]);

    sum0 = _mm_add_epi32(sum0, pred_round);
    sum0 = _mm_srai_epi32(sum0, sm_weight_log2_scale);

    sum1 = _mm_add_epi32(sum1, pred_round);
    sum1 = _mm_srai_epi32(sum1, sm_weight_log2_scale);

    sum0 = _mm_packus_epi16(sum0, sum1);
    sum0 = _mm_shuffle_epi8(sum0, gat);
    _mm_storel_epi64((__m128i *)dst, sum0);
    dst += stride;

    rep = _mm_add_epi16(rep, one);
  }
}

void aom_smooth_h_predictor_8x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 4, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 4, ww);

  smooth_h_pred_8xh(pixels, ww, 4, dst, stride, 0);
}

void aom_smooth_h_predictor_8x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 8, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 8, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
}

void aom_smooth_h_predictor_8x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[2];
  load_pixel_h_w8(above, left, 16, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 16, ww);

  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(pixels, ww, 8, dst, stride, 1);
}

void aom_smooth_h_predictor_8x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  __m128i pixels[4];
  load_pixel_h_w8(above, left, 32, pixels);

  __m128i ww[2];
  load_weight_h_w8(sm_weight_arrays, 32, ww);

  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[0], ww, 8, dst, stride, 1);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 0);
  dst += stride << 3;
  smooth_h_pred_8xh(&pixels[2], ww, 8, dst, stride, 1);
}

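// Generic kernel for the blocks that are at least 16 pixels wide, working 8
// columns at a time: each row broadcasts the (top_right, left[y]) pair into
// every 32-bit lane, and each group of 8 column weights is expanded into
// (scale - w[x], w[x]) pairs so that _mm_madd_epi16 yields
//   (scale - w[x]) * top_right + w[x] * left[y]
// per column, which is then rounded, shifted and packed back to bytes.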
static INLINE void smooth_h_predictor_wxh(uint8_t *dst, ptrdiff_t stride,
                                          const uint8_t *above,
                                          const uint8_t *left, uint32_t bw,
                                          uint32_t bh) {
  const uint8_t *const sm_weights_w = sm_weight_arrays + bw;
  const __m128i zero = _mm_setzero_si128();
  const __m128i scale_value =
      _mm_set1_epi16((uint16_t)(1 << sm_weight_log2_scale));
  const __m128i top_right = _mm_cvtsi32_si128((uint32_t)above[bw - 1]);
  const __m128i gat = _mm_set_epi32(0, 0, 0xe0c0a08, 0x6040200);
  const __m128i pred_round = _mm_set1_epi32((1 << (sm_weight_log2_scale - 1)));

  for (uint32_t y = 0; y < bh; ++y) {
    const __m128i left_y = _mm_cvtsi32_si128((uint32_t)left[y]);
    const __m128i tr_ly =
        _mm_shuffle_epi32(_mm_unpacklo_epi16(top_right, left_y), 0);

    for (uint32_t x = 0; x < bw; x += 8) {
      const __m128i weights_x =
          _mm_loadl_epi64((const __m128i *)(sm_weights_w + x));
      const __m128i weights_xw = _mm_unpacklo_epi8(weights_x, zero);
      const __m128i scale_m_weights_x = _mm_sub_epi16(scale_value, weights_xw);
      const __m128i wx_lo = _mm_unpacklo_epi16(scale_m_weights_x, weights_xw);
      const __m128i wx_hi = _mm_unpackhi_epi16(scale_m_weights_x, weights_xw);
      __m128i pred_lo = _mm_madd_epi16(wx_lo, tr_ly);
      __m128i pred_hi = _mm_madd_epi16(wx_hi, tr_ly);

      pred_lo = _mm_add_epi32(pred_lo, pred_round);
      pred_hi = _mm_add_epi32(pred_hi, pred_round);

      pred_lo = _mm_srai_epi32(pred_lo, sm_weight_log2_scale);
      pred_hi = _mm_srai_epi32(pred_hi, sm_weight_log2_scale);

      __m128i pred = _mm_packus_epi16(pred_lo, pred_hi);
      pred = _mm_shuffle_epi8(pred, gat);
      _mm_storel_epi64((__m128i *)(dst + x), pred);
    }
    dst += stride;
  }
}

void aom_smooth_h_predictor_16x4_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 4);
}

void aom_smooth_h_predictor_16x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 8);
}

void aom_smooth_h_predictor_16x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 16);
}

void aom_smooth_h_predictor_16x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 32);
}

void aom_smooth_h_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 16, 64);
}

void aom_smooth_h_predictor_32x8_ssse3(uint8_t *dst, ptrdiff_t stride,
                                       const uint8_t *above,
                                       const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 8);
}

void aom_smooth_h_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 16);
}

void aom_smooth_h_predictor_32x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 32);
}

void aom_smooth_h_predictor_32x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 32, 64);
}

void aom_smooth_h_predictor_64x64_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 64);
}

void aom_smooth_h_predictor_64x32_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 32);
}

void aom_smooth_h_predictor_64x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                        const uint8_t *above,
                                        const uint8_t *left) {
  smooth_h_predictor_wxh(dst, stride, above, left, 64, 16);
}