/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>

#include "config/aom_config.h"
#include "config/av1_rtcd.h"

#include "av1/common/restoration.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/x86/synonyms_avx2.h"

// Load 8 bytes from the possibly-misaligned pointer p, extend each byte to
// 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_8_32(const void *p) {
  return _mm256_cvtepu8_epi32(xx_loadl_64(p));
}

// Load 8 halfwords from the possibly-misaligned pointer p, extend each
// halfword to 32-bit precision and return them in an AVX2 register.
static __m256i yy256_load_extend_16_32(const void *p) {
  return _mm256_cvtepu16_epi32(xx_loadu_128(p));
}

// Compute the scan of an AVX2 register holding 8 32-bit integers. If the
// register holds x0..x7 then the scan will hold x0, x0+x1, x0+x1+x2, ...,
// x0+x1+...+x7
//
// Let [...] represent a 128-bit block, and let a, ..., h be 32-bit integers
// (assumed small enough to be able to add them without overflow).
//
// Use -> as shorthand for summing, i.e. h->a = h + g + f + e + d + c + b + a.
//
// x   = [h g f e][d c b a]
// x01 = [g f e 0][c b a 0]
// x02 = [g+h f+g e+f e][c+d b+c a+b a]
// x03 = [e+f e 0 0][a+b a 0 0]
// x04 = [e->h e->g e->f e][a->d a->c a->b a]
// s   = a->d
// s01 = [a->d a->d a->d a->d]
// s02 = [a->d a->d a->d a->d][0 0 0 0]
// ret = [a->h a->g a->f a->e][a->d a->c a->b a]
static __m256i scan_32(__m256i x) {
  const __m256i x01 = _mm256_slli_si256(x, 4);
  const __m256i x02 = _mm256_add_epi32(x, x01);
  const __m256i x03 = _mm256_slli_si256(x02, 8);
  const __m256i x04 = _mm256_add_epi32(x02, x03);
  const int32_t s = _mm256_extract_epi32(x04, 3);
  const __m128i s01 = _mm_set1_epi32(s);
  const __m256i s02 = _mm256_insertf128_si256(_mm256_setzero_si256(), s01, 1);
  return _mm256_add_epi32(x04, s02);
}
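// For reference, a scalar sketch of the same inclusive prefix sum
// (illustrative only; scan_32_scalar is not part of the build):
#if 0
static void scan_32_scalar(const int32_t x[8], int32_t out[8]) {
  out[0] = x[0];
  for (int k = 1; k < 8; ++k) out[k] = out[k - 1] + x[k];
}
#endif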

// Compute two integral images from src. B sums elements; A sums their
// squares. The images are offset by one pixel, so they have width and height
// equal to width + 1 and height + 1, and their first row and column are zero.
//
// A+1 and B+1 should be aligned to 32 bytes. buf_stride should be a multiple
// of 8.

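// memset_zero_avx fills count 32-bit words at dest with zero; *zero must be
// an all-zero AVX2 register. The bulk of the buffer is cleared with unaligned
// 256-bit stores (32 words per iteration, then 8), and any remaining words
// are written one at a time.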
static void *memset_zero_avx(int32_t *dest, const __m256i *zero, size_t count) {
  unsigned int i = 0;
  for (i = 0; i < (count & 0xffffffe0); i += 32) {
    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
    _mm256_storeu_si256((__m256i *)(dest + i + 8), *zero);
    _mm256_storeu_si256((__m256i *)(dest + i + 16), *zero);
    _mm256_storeu_si256((__m256i *)(dest + i + 24), *zero);
  }
  for (; i < (count & 0xfffffff8); i += 8) {
    _mm256_storeu_si256((__m256i *)(dest + i), *zero);
  }
  for (; i < count; i++) {
    dest[i] = 0;
  }
  return dest;
}

static void integral_images(const uint8_t *src, int src_stride, int width,
                            int height, int32_t *A, int32_t *B,
                            int buf_stride) {
  const __m256i zero = _mm256_setzero_si256();
  // Write out the zero top row
  memset_zero_avx(A, &zero, (width + 8));
  memset_zero_avx(B, &zero, (width + 8));
  for (int i = 0; i < height; ++i) {
    // Zero the left column.
    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;

    // ldiff is the difference H - D where H is the output sample immediately
    // to the left and D is the output sample above it. These are scalars,
    // replicated across the eight lanes.
    __m256i ldiff1 = zero, ldiff2 = zero;
    for (int j = 0; j < width; j += 8) {
      const int ABj = 1 + j;

      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);

      const __m256i x1 = yy256_load_extend_8_32(src + j + i * src_stride);
      const __m256i x2 = _mm256_madd_epi16(x1, x1);

      const __m256i sc1 = scan_32(x1);
      const __m256i sc2 = scan_32(x2);

      const __m256i row1 =
          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
      const __m256i row2 =
          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);

      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);

      // Calculate the new H - D.
      ldiff1 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
      ldiff2 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
    }
  }
}
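// For reference, the integral images built above satisfy the usual scalar
// recurrence (illustrative sketch only; integral_images_scalar is not part
// of the build):
#if 0
static void integral_images_scalar(const uint8_t *src, int src_stride,
                                   int width, int height, int32_t *A,
                                   int32_t *B, int buf_stride) {
  for (int j = 0; j <= width; ++j) A[j] = B[j] = 0;
  for (int i = 1; i <= height; ++i) {
    A[i * buf_stride] = B[i * buf_stride] = 0;
    for (int j = 1; j <= width; ++j) {
      const int32_t x = src[(i - 1) * src_stride + (j - 1)];
      B[i * buf_stride + j] = x + B[(i - 1) * buf_stride + j] +
                              B[i * buf_stride + j - 1] -
                              B[(i - 1) * buf_stride + j - 1];
      A[i * buf_stride + j] = x * x + A[(i - 1) * buf_stride + j] +
                              A[i * buf_stride + j - 1] -
                              A[(i - 1) * buf_stride + j - 1];
    }
  }
}
#endif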

// Compute two integral images from src. B sums elements; A sums their squares
//
// A and B should be aligned to 32 bytes. buf_stride should be a multiple of 8.
static void integral_images_highbd(const uint16_t *src, int src_stride,
                                   int width, int height, int32_t *A,
                                   int32_t *B, int buf_stride) {
  const __m256i zero = _mm256_setzero_si256();
  // Write out the zero top row
  memset_zero_avx(A, &zero, (width + 8));
  memset_zero_avx(B, &zero, (width + 8));

  for (int i = 0; i < height; ++i) {
    // Zero the left column.
    A[(i + 1) * buf_stride] = B[(i + 1) * buf_stride] = 0;

    // ldiff is the difference H - D where H is the output sample immediately
    // to the left and D is the output sample above it. These are scalars,
    // replicated across the eight lanes.
    __m256i ldiff1 = zero, ldiff2 = zero;
    for (int j = 0; j < width; j += 8) {
      const int ABj = 1 + j;

      const __m256i above1 = yy_load_256(B + ABj + i * buf_stride);
      const __m256i above2 = yy_load_256(A + ABj + i * buf_stride);

      const __m256i x1 = yy256_load_extend_16_32(src + j + i * src_stride);
      const __m256i x2 = _mm256_madd_epi16(x1, x1);

      const __m256i sc1 = scan_32(x1);
      const __m256i sc2 = scan_32(x2);

      const __m256i row1 =
          _mm256_add_epi32(_mm256_add_epi32(sc1, above1), ldiff1);
      const __m256i row2 =
          _mm256_add_epi32(_mm256_add_epi32(sc2, above2), ldiff2);

      yy_store_256(B + ABj + (i + 1) * buf_stride, row1);
      yy_store_256(A + ABj + (i + 1) * buf_stride, row2);

      // Calculate the new H - D.
      ldiff1 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row1, above1), 7));
      ldiff2 = _mm256_set1_epi32(
          _mm256_extract_epi32(_mm256_sub_epi32(row2, above2), 7));
    }
  }
}

// Compute 8 values of boxsum from the given integral image. ii should point
// at the middle of the box (for the first value). r is the box radius.
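// Equivalently, writing II for the integral image and (x, y) for the centre
// of the box, each lane computes
//   II(x + r, y + r) - II(x - r - 1, y + r)
//       - II(x + r, y - r - 1) + II(x - r - 1, y - r - 1)
// which is the sum of the (2r + 1) x (2r + 1) box of original samples.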
static INLINE __m256i boxsum_from_ii(const int32_t *ii, int stride, int r) {
  const __m256i tl = yy_loadu_256(ii - (r + 1) - (r + 1) * stride);
  const __m256i tr = yy_loadu_256(ii + (r + 0) - (r + 1) * stride);
  const __m256i bl = yy_loadu_256(ii - (r + 1) + r * stride);
  const __m256i br = yy_loadu_256(ii + (r + 0) + r * stride);
  const __m256i u = _mm256_sub_epi32(tr, tl);
  const __m256i v = _mm256_sub_epi32(br, bl);
  return _mm256_sub_epi32(v, u);
}

static __m256i round_for_shift(unsigned shift) {
  return _mm256_set1_epi32((1 << shift) >> 1);
}

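// Returns p = n * sum(x^2) - (sum x)^2, i.e. n^2 times the variance of the
// box (non-negative by the Cauchy-Schwarz inequality). For bit depths above
// 8, the two sums are first rounded and shifted down to an 8-bit-equivalent
// scale so that p stays in the same range as in the 8-bit case.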
static __m256i compute_p(__m256i sum1, __m256i sum2, int bit_depth, int n) {
  __m256i an, bb;
  if (bit_depth > 8) {
    const __m256i rounding_a = round_for_shift(2 * (bit_depth - 8));
    const __m256i rounding_b = round_for_shift(bit_depth - 8);
    const __m128i shift_a = _mm_cvtsi32_si128(2 * (bit_depth - 8));
    const __m128i shift_b = _mm_cvtsi32_si128(bit_depth - 8);
    const __m256i a =
        _mm256_srl_epi32(_mm256_add_epi32(sum2, rounding_a), shift_a);
    const __m256i b =
        _mm256_srl_epi32(_mm256_add_epi32(sum1, rounding_b), shift_b);
    // b < 2^14, so we can use a 16-bit madd rather than a 32-bit
    // mullo to square it
    bb = _mm256_madd_epi16(b, b);
    an = _mm256_max_epi32(_mm256_mullo_epi32(a, _mm256_set1_epi32(n)), bb);
  } else {
    bb = _mm256_madd_epi16(sum1, sum1);
    an = _mm256_mullo_epi32(sum2, _mm256_set1_epi32(n));
  }
  return _mm256_sub_epi32(an, bb);
}

// Assumes that C, D are integral images for the original buffer which has been
// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
// on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab(int32_t *A, int32_t *B, const int32_t *C, const int32_t *D,
                    int width, int height, int buf_stride, int bit_depth,
                    int sgr_params_idx, int radius_idx) {
  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int n = (2 * r + 1) * (2 * r + 1);
  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);

  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);

  // Set up masks
  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  __m256i mask[8];
  for (int idx = 0; idx < 8; idx++) {
    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  }

  for (int i = -1; i < height + 1; ++i) {
    for (int j = -1; j < width + 1; j += 8) {
      const int32_t *Cij = C + i * buf_stride + j;
      const int32_t *Dij = D + i * buf_stride + j;

      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);

      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
      // some uninitialised data in their upper words. We use a mask to
      // ensure that these bits are set to 0.
      int idx = AOMMIN(8, width + 1 - j);
      assert(idx >= 1);

      if (idx < 8) {
        sum1 = _mm256_and_si256(mask[idx], sum1);
        sum2 = _mm256_and_si256(mask[idx], sum2);
      }

      const __m256i p = compute_p(sum1, sum2, bit_depth, n);

      const __m256i z = _mm256_min_epi32(
          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
                            SGRPROJ_MTABLE_BITS),
          _mm256_set1_epi32(255));

      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);

      yy_storeu_256(A + i * buf_stride + j, a_res);

      const __m256i a_complement =
          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);

      // sum1 might have lanes greater than 2^15, so we can't use madd to do
      // multiplication involving sum1. However, a_complement and one_over_n
      // are both less than 256, so we can multiply them first.
      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
                                              SGRPROJ_RECIP_BITS);

      yy_storeu_256(B + i * buf_stride + j, b_res);
    }
  }
}

// Calculate 8 values of the "cross sum" starting at buf. This is a 3x3 filter
// where the outer four corners have weight 3 and all other pixels have weight
// 4.
//
// Pixels are indexed as follows:
// xtl  xt   xtr
// xl   x    xr
// xbl  xb   xbr
//
// buf points to x
//
// fives = xl + xt + xr + xb + x
// threes = xtl + xtr + xbr + xbl
// cross_sum = 4 * fours + 3 * threes
//           = 4 * (fours + threes) - threes
//           = (fours + threes) << 2 - threes
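// The nine weights sum to 5 * 4 + 4 * 3 = 32 = 2^5; the nb == 5 term in
// final_filter's rounding shift divides this back out.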
static INLINE __m256i cross_sum(const int32_t *buf, int stride) {
  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  const __m256i xt = yy_loadu_256(buf - stride);
  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  const __m256i xl = yy_loadu_256(buf - 1);
  const __m256i x = yy_loadu_256(buf);
  const __m256i xr = yy_loadu_256(buf + 1);
  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  const __m256i xb = yy_loadu_256(buf + stride);
  const __m256i xbr = yy_loadu_256(buf + 1 + stride);

  const __m256i fours = _mm256_add_epi32(
      xl, _mm256_add_epi32(xt, _mm256_add_epi32(xr, _mm256_add_epi32(xb, x))));
  const __m256i threes =
      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));

  return _mm256_sub_epi32(_mm256_slli_epi32(_mm256_add_epi32(fours, threes), 2),
                          threes);
}

// The final filter for self-guided restoration. Computes a weighted average
// across A, B with "cross sums" (see cross_sum implementation above).
static void final_filter(int32_t *dst, int dst_stride, const int32_t *A,
                         const int32_t *B, int buf_stride, const void *dgd8,
                         int dgd_stride, int width, int height, int highbd) {
  const int nb = 5;
  const __m256i rounding =
      round_for_shift(SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);
  const uint8_t *dgd_real =
      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;

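  // In scalar terms, each output sample below is
  //   dst = (a * src + b + round) >> (SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS)
  // where a and b are cross sums of the A and B coefficient planes.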
  for (int i = 0; i < height; ++i) {
    for (int j = 0; j < width; j += 8) {
      const __m256i a = cross_sum(A + i * buf_stride + j, buf_stride);
      const __m256i b = cross_sum(B + i * buf_stride + j, buf_stride);

      const __m128i raw =
          xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
      const __m256i src =
          highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

      __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
      __m256i w = _mm256_srai_epi32(_mm256_add_epi32(v, rounding),
                                    SGRPROJ_SGR_BITS + nb - SGRPROJ_RST_BITS);

      yy_storeu_256(dst + i * dst_stride + j, w);
    }
  }
}

// Assumes that C, D are integral images for the original buffer which has been
// extended to have a padding of SGRPROJ_BORDER_VERT/SGRPROJ_BORDER_HORZ pixels
// on the sides. A, B, C, D point at logical position (0, 0).
static void calc_ab_fast(int32_t *A, int32_t *B, const int32_t *C,
                         const int32_t *D, int width, int height,
                         int buf_stride, int bit_depth, int sgr_params_idx,
                         int radius_idx) {
  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  const int r = params->r[radius_idx];
  const int n = (2 * r + 1) * (2 * r + 1);
  const __m256i s = _mm256_set1_epi32(params->s[radius_idx]);
  // one_over_n[n-1] is 2^12/n, so easily fits in an int16
  const __m256i one_over_n = _mm256_set1_epi32(one_by_x[n - 1]);

  const __m256i rnd_z = round_for_shift(SGRPROJ_MTABLE_BITS);
  const __m256i rnd_res = round_for_shift(SGRPROJ_RECIP_BITS);

  // Set up masks
  const __m128i ones32 = _mm_set_epi32(0, 0, 0xffffffff, 0xffffffff);
  __m256i mask[8];
  for (int idx = 0; idx < 8; idx++) {
    const __m128i shift = _mm_cvtsi32_si128(8 * (8 - idx));
    mask[idx] = _mm256_cvtepi8_epi32(_mm_srl_epi64(ones32, shift));
  }

  for (int i = -1; i < height + 1; i += 2) {
    for (int j = -1; j < width + 1; j += 8) {
      const int32_t *Cij = C + i * buf_stride + j;
      const int32_t *Dij = D + i * buf_stride + j;

      __m256i sum1 = boxsum_from_ii(Dij, buf_stride, r);
      __m256i sum2 = boxsum_from_ii(Cij, buf_stride, r);

      // When width + 2 isn't a multiple of 8, sum1 and sum2 will contain
      // some uninitialised data in their upper words. We use a mask to
      // ensure that these bits are set to 0.
      int idx = AOMMIN(8, width + 1 - j);
      assert(idx >= 1);

      if (idx < 8) {
        sum1 = _mm256_and_si256(mask[idx], sum1);
        sum2 = _mm256_and_si256(mask[idx], sum2);
      }

      const __m256i p = compute_p(sum1, sum2, bit_depth, n);

      const __m256i z = _mm256_min_epi32(
          _mm256_srli_epi32(_mm256_add_epi32(_mm256_mullo_epi32(p, s), rnd_z),
                            SGRPROJ_MTABLE_BITS),
          _mm256_set1_epi32(255));

      const __m256i a_res = _mm256_i32gather_epi32(x_by_xplus1, z, 4);

      yy_storeu_256(A + i * buf_stride + j, a_res);

      const __m256i a_complement =
          _mm256_sub_epi32(_mm256_set1_epi32(SGRPROJ_SGR), a_res);

      // sum1 might have lanes greater than 2^15, so we can't use madd to do
      // multiplication involving sum1. However, a_complement and one_over_n
      // are both less than 256, so we can multiply them first.
      const __m256i a_comp_over_n = _mm256_madd_epi16(a_complement, one_over_n);
      const __m256i b_int = _mm256_mullo_epi32(a_comp_over_n, sum1);
      const __m256i b_res = _mm256_srli_epi32(_mm256_add_epi32(b_int, rnd_res),
                                              SGRPROJ_RECIP_BITS);

      yy_storeu_256(B + i * buf_stride + j, b_res);
    }
  }
}

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xtl  xt   xtr
//  -   buf   -
// xbl  xb   xbr
//
// Pixels are weighted like this:
//  5    6    5
//  0    0    0
//  5    6    5
//
// fives = xtl + xtr + xbl + xbr
// sixes = xt + xb
// cross_sum = 6 * sixes + 5 * fives
//           = 5 * (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
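// The weights sum to 4 * 5 + 2 * 6 = 32 = 2^5, matching the nb0 == 5 term in
// the rounding shift used for even rows in final_filter_fast below.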
static INLINE __m256i cross_sum_fast_even_row(const int32_t *buf, int stride) {
  const __m256i xtl = yy_loadu_256(buf - 1 - stride);
  const __m256i xt = yy_loadu_256(buf - stride);
  const __m256i xtr = yy_loadu_256(buf + 1 - stride);
  const __m256i xbl = yy_loadu_256(buf - 1 + stride);
  const __m256i xb = yy_loadu_256(buf + stride);
  const __m256i xbr = yy_loadu_256(buf + 1 + stride);

  const __m256i fives =
      _mm256_add_epi32(xtl, _mm256_add_epi32(xtr, _mm256_add_epi32(xbr, xbl)));
  const __m256i sixes = _mm256_add_epi32(xt, xb);
  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);

  return _mm256_add_epi32(
      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
                       fives_plus_sixes),
      sixes);
}

// Calculate 8 values of the "cross sum" starting at buf.
//
// Pixels are indexed like this:
// xl   x    xr
//
// Pixels are weighted like this:
//  5   6    5
//
// buf points to x
//
// fives = xl + xr
// sixes = x
// cross_sum = 5 * fives + 6 * sixes
//           = 4 * (fives + sixes) + (fives + sixes) + sixes
//           = (fives + sixes) << 2 + (fives + sixes) + sixes
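// Here the weights sum to 2 * 5 + 6 = 16 = 2^4, matching the nb1 == 4 term in
// the rounding shift used for odd rows in final_filter_fast below.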
static INLINE __m256i cross_sum_fast_odd_row(const int32_t *buf) {
  const __m256i xl = yy_loadu_256(buf - 1);
  const __m256i x = yy_loadu_256(buf);
  const __m256i xr = yy_loadu_256(buf + 1);

  const __m256i fives = _mm256_add_epi32(xl, xr);
  const __m256i sixes = x;

  const __m256i fives_plus_sixes = _mm256_add_epi32(fives, sixes);

  return _mm256_add_epi32(
      _mm256_add_epi32(_mm256_slli_epi32(fives_plus_sixes, 2),
                       fives_plus_sixes),
      sixes);
}

// The final filter for the self-guided restoration. Computes a
// weighted average across A, B with "cross sums" (see cross_sum_...
// implementations above).
static void final_filter_fast(int32_t *dst, int dst_stride, const int32_t *A,
                              const int32_t *B, int buf_stride,
                              const void *dgd8, int dgd_stride, int width,
                              int height, int highbd) {
  const int nb0 = 5;
  const int nb1 = 4;

  const __m256i rounding0 =
      round_for_shift(SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);
  const __m256i rounding1 =
      round_for_shift(SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);

  const uint8_t *dgd_real =
      highbd ? (const uint8_t *)CONVERT_TO_SHORTPTR(dgd8) : dgd8;

  for (int i = 0; i < height; ++i) {
    if (!(i & 1)) {  // even row
      for (int j = 0; j < width; j += 8) {
        const __m256i a =
            cross_sum_fast_even_row(A + i * buf_stride + j, buf_stride);
        const __m256i b =
            cross_sum_fast_even_row(B + i * buf_stride + j, buf_stride);

        const __m128i raw =
            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
        const __m256i src =
            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
        __m256i w =
            _mm256_srai_epi32(_mm256_add_epi32(v, rounding0),
                              SGRPROJ_SGR_BITS + nb0 - SGRPROJ_RST_BITS);

        yy_storeu_256(dst + i * dst_stride + j, w);
      }
    } else {  // odd row
      for (int j = 0; j < width; j += 8) {
        const __m256i a = cross_sum_fast_odd_row(A + i * buf_stride + j);
        const __m256i b = cross_sum_fast_odd_row(B + i * buf_stride + j);

        const __m128i raw =
            xx_loadu_128(dgd_real + ((i * dgd_stride + j) << highbd));
        const __m256i src =
            highbd ? _mm256_cvtepu16_epi32(raw) : _mm256_cvtepu8_epi32(raw);

        __m256i v = _mm256_add_epi32(_mm256_madd_epi16(a, src), b);
        __m256i w =
            _mm256_srai_epi32(_mm256_add_epi32(v, rounding1),
                              SGRPROJ_SGR_BITS + nb1 - SGRPROJ_RST_BITS);

        yy_storeu_256(dst + i * dst_stride + j, w);
      }
    }
  }
}

void av1_selfguided_restoration_avx2(const uint8_t *dgd8, int width, int height,
                                     int dgd_stride, int32_t *flt0,
                                     int32_t *flt1, int flt_stride,
                                     int sgr_params_idx, int bit_depth,
                                     int highbd) {
  // The ALIGN_POWER_OF_TWO macro here ensures that column 1 of Atl, Btl,
  // Ctl and Dtl is 32-byte aligned.
  const int buf_elts = ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3);

  DECLARE_ALIGNED(32, int32_t,
                  buf[4 * ALIGN_POWER_OF_TWO(RESTORATION_PROC_UNIT_PELS, 3)]);

  const int width_ext = width + 2 * SGRPROJ_BORDER_HORZ;
  const int height_ext = height + 2 * SGRPROJ_BORDER_VERT;

  // Adjusting the stride of A and B here appears to avoid bad cache effects,
  // leading to a significant speed improvement.
  // We also align the stride to a multiple of 32 bytes for efficiency.
  int buf_stride = ALIGN_POWER_OF_TWO(width_ext + 16, 3);

  // The "tl" pointers point at the top-left of the initialised data for the
  // array.
  int32_t *Atl = buf + 0 * buf_elts + 7;
  int32_t *Btl = buf + 1 * buf_elts + 7;
  int32_t *Ctl = buf + 2 * buf_elts + 7;
  int32_t *Dtl = buf + 3 * buf_elts + 7;

  // The "0" pointers are (- SGRPROJ_BORDER_VERT, -SGRPROJ_BORDER_HORZ). Note
  // there's a zero row and column in A, B (integral images), so we move down
  // and right one for them.
  const int buf_diag_border =
      SGRPROJ_BORDER_HORZ + buf_stride * SGRPROJ_BORDER_VERT;

  int32_t *A0 = Atl + 1 + buf_stride;
  int32_t *B0 = Btl + 1 + buf_stride;
  int32_t *C0 = Ctl + 1 + buf_stride;
  int32_t *D0 = Dtl + 1 + buf_stride;

  // Finally, A, B, C, D point at position (0, 0).
  int32_t *A = A0 + buf_diag_border;
  int32_t *B = B0 + buf_diag_border;
  int32_t *C = C0 + buf_diag_border;
  int32_t *D = D0 + buf_diag_border;

  const int dgd_diag_border =
      SGRPROJ_BORDER_HORZ + dgd_stride * SGRPROJ_BORDER_VERT;
  const uint8_t *dgd0 = dgd8 - dgd_diag_border;

  // Generate integral images from the input. C will contain sums of squares; D
  // will contain just sums
  if (highbd)
    integral_images_highbd(CONVERT_TO_SHORTPTR(dgd0), dgd_stride, width_ext,
                           height_ext, Ctl, Dtl, buf_stride);
  else
    integral_images(dgd0, dgd_stride, width_ext, height_ext, Ctl, Dtl,
                    buf_stride);

  const sgr_params_type *const params = &sgr_params[sgr_params_idx];
  // Write to flt0 and flt1
  // If params->r == 0 we skip the corresponding filter. We only allow one of
  // the radii to be 0, as having both equal to 0 would be equivalent to
  // skipping SGR entirely.
  assert(!(params->r[0] == 0 && params->r[1] == 0));
  assert(params->r[0] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));
  assert(params->r[1] < AOMMIN(SGRPROJ_BORDER_VERT, SGRPROJ_BORDER_HORZ));

  if (params->r[0] > 0) {
    calc_ab_fast(A, B, C, D, width, height, buf_stride, bit_depth,
                 sgr_params_idx, 0);
    final_filter_fast(flt0, flt_stride, A, B, buf_stride, dgd8, dgd_stride,
                      width, height, highbd);
  }

  if (params->r[1] > 0) {
    calc_ab(A, B, C, D, width, height, buf_stride, bit_depth, sgr_params_idx,
            1);
    final_filter(flt1, flt_stride, A, B, buf_stride, dgd8, dgd_stride, width,
                 height, highbd);
  }
}

void apply_selfguided_restoration_avx2(const uint8_t *dat8, int width,
                                       int height, int stride, int eps,
                                       const int *xqd, uint8_t *dst8,
                                       int dst_stride, int32_t *tmpbuf,
                                       int bit_depth, int highbd) {
  int32_t *flt0 = tmpbuf;
  int32_t *flt1 = flt0 + RESTORATION_UNITPELS_MAX;
  assert(width * height <= RESTORATION_UNITPELS_MAX);
  av1_selfguided_restoration_avx2(dat8, width, height, stride, flt0, flt1,
                                  width, eps, bit_depth, highbd);
  const sgr_params_type *const params = &sgr_params[eps];
  int xq[2];
  decode_xq(xqd, xq, params);

  __m256i xq0 = _mm256_set1_epi32(xq[0]);
  __m256i xq1 = _mm256_set1_epi32(xq[1]);

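  // In scalar terms, each output pixel below is
  //   u = src << SGRPROJ_RST_BITS
  //   v = (u << SGRPROJ_PRJ_BITS) + xq[0] * (flt0 - u) + xq[1] * (flt1 - u)
  //   dst = clamp((v + round) >> (SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS))
  // with a term dropped when the corresponding radius in params->r is zero.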
  for (int i = 0; i < height; ++i) {
    // Calculate output in batches of 16 pixels
    for (int j = 0; j < width; j += 16) {
      const int k = i * width + j;
      const int m = i * dst_stride + j;

      const uint8_t *dat8ij = dat8 + i * stride + j;
      __m256i ep_0, ep_1;
      __m128i src_0, src_1;
      if (highbd) {
        src_0 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij));
        src_1 = xx_loadu_128(CONVERT_TO_SHORTPTR(dat8ij + 8));
        ep_0 = _mm256_cvtepu16_epi32(src_0);
        ep_1 = _mm256_cvtepu16_epi32(src_1);
      } else {
        src_0 = xx_loadu_128(dat8ij);
        ep_0 = _mm256_cvtepu8_epi32(src_0);
        ep_1 = _mm256_cvtepu8_epi32(_mm_srli_si128(src_0, 8));
      }

      const __m256i u_0 = _mm256_slli_epi32(ep_0, SGRPROJ_RST_BITS);
      const __m256i u_1 = _mm256_slli_epi32(ep_1, SGRPROJ_RST_BITS);

      __m256i v_0 = _mm256_slli_epi32(u_0, SGRPROJ_PRJ_BITS);
      __m256i v_1 = _mm256_slli_epi32(u_1, SGRPROJ_PRJ_BITS);

      if (params->r[0] > 0) {
        const __m256i f1_0 = _mm256_sub_epi32(yy_loadu_256(&flt0[k]), u_0);
        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq0, f1_0));

        const __m256i f1_1 = _mm256_sub_epi32(yy_loadu_256(&flt0[k + 8]), u_1);
        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq0, f1_1));
      }

      if (params->r[1] > 0) {
        const __m256i f2_0 = _mm256_sub_epi32(yy_loadu_256(&flt1[k]), u_0);
        v_0 = _mm256_add_epi32(v_0, _mm256_mullo_epi32(xq1, f2_0));

        const __m256i f2_1 = _mm256_sub_epi32(yy_loadu_256(&flt1[k + 8]), u_1);
        v_1 = _mm256_add_epi32(v_1, _mm256_mullo_epi32(xq1, f2_1));
      }

      const __m256i rounding =
          round_for_shift(SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m256i w_0 = _mm256_srai_epi32(
          _mm256_add_epi32(v_0, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);
      const __m256i w_1 = _mm256_srai_epi32(
          _mm256_add_epi32(v_1, rounding), SGRPROJ_PRJ_BITS + SGRPROJ_RST_BITS);

      if (highbd) {
        // Pack into 16 bits and clamp to [0, 2^bit_depth)
        // Note that packing into 16 bits reorders the 64-bit chunks of the
        // result, so we use a permute to restore the correct order
        const __m256i tmp = _mm256_packus_epi32(w_0, w_1);
        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
        const __m256i max = _mm256_set1_epi16((1 << bit_depth) - 1);
        const __m256i res = _mm256_min_epi16(tmp2, max);
        yy_storeu_256(CONVERT_TO_SHORTPTR(dst8 + m), res);
      } else {
        // Pack into 8 bits and clamp to [0, 256)
        // Note that each pack reorders the 64-bit chunks of the result,
        // so we use a permute to restore the correct order
        const __m256i tmp = _mm256_packs_epi32(w_0, w_1);
        const __m256i tmp2 = _mm256_permute4x64_epi64(tmp, 0xd8);
        const __m256i res =
            _mm256_packus_epi16(tmp2, tmp2 /* "don't care" value */);
        const __m128i res2 =
            _mm256_castsi256_si128(_mm256_permute4x64_epi64(res, 0xd8));
        xx_storeu_128(dst8 + m, res2);
      }
    }
  }
}