/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <smmintrin.h>

#include "config/av1_rtcd.h"

#include "av1/common/warped_motion.h"

static const uint8_t warp_highbd_arrange_bytes[16] = { 0, 2, 4,  6, 8, 10,
                                                       12, 14, 1, 3, 5, 7,
                                                       9, 11, 13, 15 };

static const uint8_t highbd_shuffle_alpha0_mask0[16] = {
  0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3
};
static const uint8_t highbd_shuffle_alpha0_mask1[16] = {
  4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7
};
static const uint8_t highbd_shuffle_alpha0_mask2[16] = { 8, 9, 10, 11, 8, 9,
                                                         10, 11, 8, 9, 10, 11,
                                                         8, 9, 10, 11 };
static const uint8_t highbd_shuffle_alpha0_mask3[16] = { 12, 13, 14, 15, 12, 13,
                                                         14, 15, 12, 13, 14, 15,
                                                         12, 13, 14, 15 };

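// Gather the 8-tap filters for the eight horizontal sub-pixel positions
// sx, sx + alpha, ..., sx + 7 * alpha and interleave their taps so that each
// coeff[i] holds one pair of taps for four output pixels, ready to be
// multiplied against pairs of 16-bit source samples with _mm_madd_epi16.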
static INLINE void highbd_prepare_horizontal_filter_coeff(int alpha, int sx,
                                                          __m128i *coeff) {
  // Filter even-index pixels
  const __m128i tmp_0 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 0 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_2 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 2 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_4 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 4 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_6 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 6 * alpha) >> WARPEDDIFF_PREC_BITS)));

  // coeffs 0 1 0 1 2 3 2 3 for pixels 0, 2
  const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
  // coeffs 0 1 0 1 2 3 2 3 for pixels 4, 6
  const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 0, 2
  const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
  // coeffs 4 5 4 5 6 7 6 7 for pixels 4, 6
  const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

  // coeffs 0 1 0 1 0 1 0 1 for pixels 0, 2, 4, 6
  coeff[0] = _mm_unpacklo_epi64(tmp_8, tmp_10);
  // coeffs 2 3 2 3 2 3 2 3 for pixels 0, 2, 4, 6
  coeff[2] = _mm_unpackhi_epi64(tmp_8, tmp_10);
  // coeffs 4 5 4 5 4 5 4 5 for pixels 0, 2, 4, 6
  coeff[4] = _mm_unpacklo_epi64(tmp_12, tmp_14);
  // coeffs 6 7 6 7 6 7 6 7 for pixels 0, 2, 4, 6
  coeff[6] = _mm_unpackhi_epi64(tmp_12, tmp_14);

  // Filter odd-index pixels
  const __m128i tmp_1 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 1 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_3 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 3 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_5 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 5 * alpha) >> WARPEDDIFF_PREC_BITS)));
  const __m128i tmp_7 =
      _mm_loadu_si128((__m128i *)(av1_warped_filter +
                                  ((sx + 7 * alpha) >> WARPEDDIFF_PREC_BITS)));

  const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
  const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
  const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
  const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

  coeff[1] = _mm_unpacklo_epi64(tmp_9, tmp_11);
  coeff[3] = _mm_unpackhi_epi64(tmp_9, tmp_11);
  coeff[5] = _mm_unpacklo_epi64(tmp_13, tmp_15);
  coeff[7] = _mm_unpackhi_epi64(tmp_13, tmp_15);
}

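// When alpha == 0, every pixel in the row uses the same filter, so the filter
// is loaded once and each pair of taps is replicated across the register with
// the shuffle masks defined above.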
static INLINE void highbd_prepare_horizontal_filter_coeff_alpha0(
    int sx, __m128i *coeff) {
  // Filter coeff
  const __m128i tmp_0 = _mm_loadu_si128(
      (__m128i *)(av1_warped_filter + (sx >> WARPEDDIFF_PREC_BITS)));

  coeff[0] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask0));
  coeff[2] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask1));
  coeff[4] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask2));
  coeff[6] = _mm_shuffle_epi8(
      tmp_0, _mm_loadu_si128((__m128i *)highbd_shuffle_alpha0_mask3));

  coeff[1] = coeff[0];
  coeff[3] = coeff[2];
  coeff[5] = coeff[4];
  coeff[7] = coeff[6];
}

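// Apply the prepared filter taps to one row of source samples. 'src' holds
// 16-bit samples x .. x + 7 and 'src2' holds x + 8 .. x + 15; _mm_alignr_epi8
// shifts across the pair so each multiply-add sees the source window it
// needs. The rounded sums are packed into tmp[k + 7] for the vertical pass.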
static INLINE void highbd_filter_src_pixels(
    const __m128i *src, const __m128i *src2, __m128i *tmp, __m128i *coeff,
    const int offset_bits_horiz, const int reduce_bits_horiz, int k) {
  const __m128i src_1 = *src;
  const __m128i src2_1 = *src2;

  const __m128i round_const = _mm_set1_epi32((1 << offset_bits_horiz) +
                                             ((1 << reduce_bits_horiz) >> 1));

  const __m128i res_0 = _mm_madd_epi16(src_1, coeff[0]);
  const __m128i res_2 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 4), coeff[2]);
  const __m128i res_4 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 8), coeff[4]);
  const __m128i res_6 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 12), coeff[6]);

  __m128i res_even =
      _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6));
  res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const),
                           _mm_cvtsi32_si128(reduce_bits_horiz));

  const __m128i res_1 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 2), coeff[1]);
  const __m128i res_3 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 6), coeff[3]);
  const __m128i res_5 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 10), coeff[5]);
  const __m128i res_7 =
      _mm_madd_epi16(_mm_alignr_epi8(src2_1, src_1, 14), coeff[7]);

  __m128i res_odd =
      _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7));
  res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const),
                          _mm_cvtsi32_si128(reduce_bits_horiz));

  // Combine results into one register.
  // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
  // as this order helps with the vertical filter.
  tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
}

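// Prepare the coefficients for one row and filter it: the general case where
// alpha may be non-zero.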
static INLINE void highbd_horiz_filter(const __m128i *src, const __m128i *src2,
                                       __m128i *tmp, int sx, int alpha, int k,
                                       const int offset_bits_horiz,
                                       const int reduce_bits_horiz) {
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx, coeff);
  highbd_filter_src_pixels(src, src2, tmp, coeff, offset_bits_horiz,
                           reduce_bits_horiz, k);
}

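// alpha == 0 and beta == 0: the horizontal filter is the same for every row
// and every column of the block, so the coefficients are prepared once
// outside the row loop.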
static INLINE void highbd_warp_horizontal_filter_alpha0_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  (void)alpha;
  int k;

  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff_alpha0(sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

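// alpha == 0 but beta != 0: within a row all pixels share one filter, but the
// filter phase changes from row to row, so the alpha0 coefficients are
// rebuilt per row from sx = sx4 + beta * (k + 4).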
static INLINE void highbd_warp_horizontal_filter_alpha0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)alpha;
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    __m128i coeff[8];
    highbd_prepare_horizontal_filter_coeff_alpha0(sx, coeff);
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

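// beta == 0 but alpha != 0: the filter varies across the row but is the same
// for every row, so the full coefficient set is prepared once and reused for
// each input row.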
static INLINE void highbd_warp_horizontal_filter_beta0(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  (void)beta;
  int k;
  __m128i coeff[8];
  highbd_prepare_horizontal_filter_coeff(alpha, sx4, coeff);

  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));
    highbd_filter_src_pixels(&src, &src2, tmp, coeff, offset_bits_horiz,
                             reduce_bits_horiz, k);
  }
}

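// General case: both alpha and beta are non-zero, so the coefficients are
// recomputed for every row.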
static INLINE void highbd_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  int k;
  for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
    int iy = iy4 + k;
    if (iy < 0)
      iy = 0;
    else if (iy > height - 1)
      iy = height - 1;
    int sx = sx4 + beta * (k + 4);

    // Load source pixels
    const __m128i src =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
    const __m128i src2 =
        _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

    highbd_horiz_filter(&src, &src2, tmp, sx, alpha, k, offset_bits_horiz,
                        reduce_bits_horiz);
  }
}

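// Dispatch to the cheapest horizontal filter variant for the given alpha and
// beta.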
static INLINE void highbd_prepare_warp_horizontal_filter(
    const uint16_t *ref, __m128i *tmp, int stride, int32_t ix4, int32_t iy4,
    int32_t sx4, int alpha, int beta, int p_height, int height, int i,
    const int offset_bits_horiz, const int reduce_bits_horiz) {
  if (alpha == 0 && beta == 0)
    highbd_warp_horizontal_filter_alpha0_beta0(
        ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
        offset_bits_horiz, reduce_bits_horiz);

  else if (alpha == 0 && beta != 0)
    highbd_warp_horizontal_filter_alpha0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                         beta, p_height, height, i,
                                         offset_bits_horiz, reduce_bits_horiz);

  else if (alpha != 0 && beta == 0)
    highbd_warp_horizontal_filter_beta0(ref, tmp, stride, ix4, iy4, sx4, alpha,
                                        beta, p_height, height, i,
                                        offset_bits_horiz, reduce_bits_horiz);
  else
    highbd_warp_horizontal_filter(ref, tmp, stride, ix4, iy4, sx4, alpha, beta,
                                  p_height, height, i, offset_bits_horiz,
                                  reduce_bits_horiz);
}

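// High bit-depth affine warp, processed in 8x8 output tiles. For each tile
// the affine model 'mat' gives the warped source position of the tile centre;
// 15 rows of source are filtered horizontally into 'tmp' and then filtered
// vertically. In compound mode the rounded intermediate is written to
// conv_params->dst, or averaged with dst and written to 'pred' when
// do_average is set; otherwise the result is rounded, clamped to
// [0, 2^bd - 1] and stored directly in 'pred'.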
void av1_highbd_warp_affine_sse4_1(const int32_t *mat, const uint16_t *ref,
                                   int width, int height, int stride,
                                   uint16_t *pred, int p_col, int p_row,
                                   int p_width, int p_height, int p_stride,
                                   int subsampling_x, int subsampling_y, int bd,
                                   ConvolveParams *conv_params, int16_t alpha,
                                   int16_t beta, int16_t gamma, int16_t delta) {
  __m128i tmp[15];
  int i, j, k;
  const int reduce_bits_horiz =
      conv_params->round_0 +
      AOMMAX(bd + FILTER_BITS - conv_params->round_0 - 14, 0);
  const int reduce_bits_vert = conv_params->is_compound
                                   ? conv_params->round_1
                                   : 2 * FILTER_BITS - reduce_bits_horiz;
  const int offset_bits_horiz = bd + FILTER_BITS - 1;
  assert(IMPLIES(conv_params->is_compound, conv_params->dst != NULL));
  assert(!(bd == 12 && reduce_bits_horiz < 5));
  assert(IMPLIES(conv_params->do_average, conv_params->is_compound));

  const int offset_bits_vert = bd + 2 * FILTER_BITS - reduce_bits_horiz;
  const __m128i clip_pixel =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m128i reduce_bits_vert_shift = _mm_cvtsi32_si128(reduce_bits_vert);
  const __m128i reduce_bits_vert_const =
      _mm_set1_epi32(((1 << reduce_bits_vert) >> 1));
  const __m128i res_add_const = _mm_set1_epi32(1 << offset_bits_vert);
  const int round_bits =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset_bits = bd + 2 * FILTER_BITS - conv_params->round_0;
  const __m128i res_sub_const =
      _mm_set1_epi32(-(1 << (offset_bits - conv_params->round_1)) -
                     (1 << (offset_bits - conv_params->round_1 - 1)));
  __m128i round_bits_shift = _mm_cvtsi32_si128(round_bits);
  __m128i round_bits_const = _mm_set1_epi32(((1 << round_bits) >> 1));

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);

  /* Note: For this code to work, the left/right frame borders need to be
     extended by at least 13 pixels each. By the time we get here, other
     code will have set up this border, but we allow an explicit check
     for debugging purposes.
  */
  /*for (i = 0; i < height; ++i) {
    for (j = 0; j < 13; ++j) {
      assert(ref[i * stride - 13 + j] == ref[i * stride]);
      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
    }
  }*/

  for (i = 0; i < p_height; i += 8) {
    for (j = 0; j < p_width; j += 8) {
      const int32_t src_x = (p_col + j + 4) << subsampling_x;
      const int32_t src_y = (p_row + i + 4) << subsampling_y;
      const int64_t dst_x =
          (int64_t)mat[2] * src_x + (int64_t)mat[3] * src_y + (int64_t)mat[0];
      const int64_t dst_y =
          (int64_t)mat[4] * src_x + (int64_t)mat[5] * src_y + (int64_t)mat[1];
      const int64_t x4 = dst_x >> subsampling_x;
      const int64_t y4 = dst_y >> subsampling_y;

      int32_t ix4 = (int32_t)(x4 >> WARPEDMODEL_PREC_BITS);
      int32_t sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
      int32_t iy4 = (int32_t)(y4 >> WARPEDMODEL_PREC_BITS);
      int32_t sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);

      // Add in all the constant terms, including rounding and offset
      sx4 += alpha * (-4) + beta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);
      sy4 += gamma * (-4) + delta * (-4) + (1 << (WARPEDDIFF_PREC_BITS - 1)) +
             (WARPEDPIXEL_PREC_SHIFTS << WARPEDDIFF_PREC_BITS);

      sx4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);
      sy4 &= ~((1 << WARP_PARAM_REDUCE_BITS) - 1);

      // Horizontal filter
      // If the block is aligned such that, after clamping, every sample
      // would be taken from the leftmost/rightmost column, then we can
      // skip the expensive horizontal filter.
      if (ix4 <= -7) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] = _mm_set1_epi16(
              (1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
              ref[iy * stride] * (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (ix4 >= width + 6) {
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          tmp[k + 7] =
              _mm_set1_epi16((1 << (bd + FILTER_BITS - reduce_bits_horiz - 1)) +
                             ref[iy * stride + (width - 1)] *
                                 (1 << (FILTER_BITS - reduce_bits_horiz)));
        }
      } else if (((ix4 - 7) < 0) || ((ix4 + 9) > width)) {
        const int out_of_boundary_left = -(ix4 - 6);
        const int out_of_boundary_right = (ix4 + 8) - width;

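        // The block straddles the frame edge. Load the row, then use the
        // warp_pad_left / warp_pad_right shuffle tables to replicate the
        // first / last valid sample into the out-of-range columns before
        // filtering.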
        for (k = -7; k < AOMMIN(8, p_height - i); ++k) {
          int iy = iy4 + k;
          if (iy < 0)
            iy = 0;
          else if (iy > height - 1)
            iy = height - 1;
          int sx = sx4 + beta * (k + 4);

          // Load source pixels
          const __m128i src =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
          const __m128i src2 =
              _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 + 1));

          const __m128i src_01 = _mm_shuffle_epi8(
              src, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));
          const __m128i src2_01 = _mm_shuffle_epi8(
              src2, _mm_loadu_si128((__m128i *)warp_highbd_arrange_bytes));

          __m128i src_lo = _mm_unpacklo_epi64(src_01, src2_01);
          __m128i src_hi = _mm_unpackhi_epi64(src_01, src2_01);

          if (out_of_boundary_left >= 0) {
            const __m128i shuffle_reg_left =
                _mm_loadu_si128((__m128i *)warp_pad_left[out_of_boundary_left]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_left);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_left);
          }

          if (out_of_boundary_right >= 0) {
            const __m128i shuffle_reg_right = _mm_loadu_si128(
                (__m128i *)warp_pad_right[out_of_boundary_right]);
            src_lo = _mm_shuffle_epi8(src_lo, shuffle_reg_right);
            src_hi = _mm_shuffle_epi8(src_hi, shuffle_reg_right);
          }

          const __m128i src_padded = _mm_unpacklo_epi8(src_lo, src_hi);
          const __m128i src2_padded = _mm_unpackhi_epi8(src_lo, src_hi);

          highbd_horiz_filter(&src_padded, &src2_padded, tmp, sx, alpha, k,
                              offset_bits_horiz, reduce_bits_horiz);
        }
      } else {
        highbd_prepare_warp_horizontal_filter(
            ref, tmp, stride, ix4, iy4, sx4, alpha, beta, p_height, height, i,
            offset_bits_horiz, reduce_bits_horiz);
      }

      // Vertical filter
      for (k = -4; k < AOMMIN(4, p_height - i - 4); ++k) {
        int sy = sy4 + delta * (k + 4);

        // Load from tmp and rearrange pairs of consecutive rows into the
        // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
        const __m128i *src = tmp + (k + 4);
        const __m128i src_0 = _mm_unpacklo_epi16(src[0], src[1]);
        const __m128i src_2 = _mm_unpacklo_epi16(src[2], src[3]);
        const __m128i src_4 = _mm_unpacklo_epi16(src[4], src[5]);
        const __m128i src_6 = _mm_unpacklo_epi16(src[6], src[7]);

        // Filter even-index pixels
        const __m128i tmp_0 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 0 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_2 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 2 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_4 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 4 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_6 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 6 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_8 = _mm_unpacklo_epi32(tmp_0, tmp_2);
        const __m128i tmp_10 = _mm_unpacklo_epi32(tmp_4, tmp_6);
        const __m128i tmp_12 = _mm_unpackhi_epi32(tmp_0, tmp_2);
        const __m128i tmp_14 = _mm_unpackhi_epi32(tmp_4, tmp_6);

        const __m128i coeff_0 = _mm_unpacklo_epi64(tmp_8, tmp_10);
        const __m128i coeff_2 = _mm_unpackhi_epi64(tmp_8, tmp_10);
        const __m128i coeff_4 = _mm_unpacklo_epi64(tmp_12, tmp_14);
        const __m128i coeff_6 = _mm_unpackhi_epi64(tmp_12, tmp_14);

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_0);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_2);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_4);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_6);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpackhi_epi16(src[0], src[1]);
        const __m128i src_3 = _mm_unpackhi_epi16(src[2], src[3]);
        const __m128i src_5 = _mm_unpackhi_epi16(src[4], src[5]);
        const __m128i src_7 = _mm_unpackhi_epi16(src[6], src[7]);

        const __m128i tmp_1 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 1 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_3 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 3 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_5 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 5 * gamma) >> WARPEDDIFF_PREC_BITS)));
        const __m128i tmp_7 = _mm_loadu_si128(
            (__m128i *)(av1_warped_filter +
                        ((sy + 7 * gamma) >> WARPEDDIFF_PREC_BITS)));

        const __m128i tmp_9 = _mm_unpacklo_epi32(tmp_1, tmp_3);
        const __m128i tmp_11 = _mm_unpacklo_epi32(tmp_5, tmp_7);
        const __m128i tmp_13 = _mm_unpackhi_epi32(tmp_1, tmp_3);
        const __m128i tmp_15 = _mm_unpackhi_epi32(tmp_5, tmp_7);

        const __m128i coeff_1 = _mm_unpacklo_epi64(tmp_9, tmp_11);
        const __m128i coeff_3 = _mm_unpackhi_epi64(tmp_9, tmp_11);
        const __m128i coeff_5 = _mm_unpacklo_epi64(tmp_13, tmp_15);
        const __m128i coeff_7 = _mm_unpackhi_epi64(tmp_13, tmp_15);

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_1);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_3);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_5);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_7);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

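        // res_lo holds the 32-bit vertical sums for output columns 0..3 and
        // res_hi for columns 4..7. Either accumulate into the compound buffer
        // or round down to bd-bit pixels.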
        if (conv_params->is_compound) {
          __m128i *const p =
              (__m128i *)&conv_params
                  ->dst[(i + k + 4) * conv_params->dst_stride + j];
          res_lo = _mm_add_epi32(res_lo, res_add_const);
          res_lo = _mm_sra_epi32(_mm_add_epi32(res_lo, reduce_bits_vert_const),
                                 reduce_bits_vert_shift);

          if (conv_params->do_average) {
            __m128i *const dst16 = (__m128i *)&pred[(i + k + 4) * p_stride + j];
            __m128i p_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p));

            if (conv_params->use_dist_wtd_comp_avg) {
              res_lo = _mm_add_epi32(_mm_mullo_epi32(p_32, wt0),
                                     _mm_mullo_epi32(res_lo, wt1));
              res_lo = _mm_srai_epi32(res_lo, DIST_PRECISION_BITS);
            } else {
              res_lo = _mm_srai_epi32(_mm_add_epi32(p_32, res_lo), 1);
            }

            __m128i res32_lo = _mm_add_epi32(res_lo, res_sub_const);
            res32_lo = _mm_sra_epi32(_mm_add_epi32(res32_lo, round_bits_const),
                                     round_bits_shift);

            __m128i res16_lo = _mm_packus_epi32(res32_lo, res32_lo);
            res16_lo = _mm_min_epi16(res16_lo, clip_pixel);
            _mm_storel_epi64(dst16, res16_lo);
          } else {
            res_lo = _mm_packus_epi32(res_lo, res_lo);
            _mm_storel_epi64(p, res_lo);
          }
          if (p_width > 4) {
            __m128i *const p4 =
                (__m128i *)&conv_params
                    ->dst[(i + k + 4) * conv_params->dst_stride + j + 4];

            res_hi = _mm_add_epi32(res_hi, res_add_const);
            res_hi =
                _mm_sra_epi32(_mm_add_epi32(res_hi, reduce_bits_vert_const),
                              reduce_bits_vert_shift);
            if (conv_params->do_average) {
              __m128i *const dst16_4 =
                  (__m128i *)&pred[(i + k + 4) * p_stride + j + 4];
              __m128i p4_32 = _mm_cvtepu16_epi32(_mm_loadl_epi64(p4));

              if (conv_params->use_dist_wtd_comp_avg) {
                res_hi = _mm_add_epi32(_mm_mullo_epi32(p4_32, wt0),
                                       _mm_mullo_epi32(res_hi, wt1));
                res_hi = _mm_srai_epi32(res_hi, DIST_PRECISION_BITS);
              } else {
                res_hi = _mm_srai_epi32(_mm_add_epi32(p4_32, res_hi), 1);
              }

              __m128i res32_hi = _mm_add_epi32(res_hi, res_sub_const);
              res32_hi = _mm_sra_epi32(
                  _mm_add_epi32(res32_hi, round_bits_const), round_bits_shift);
              __m128i res16_hi = _mm_packus_epi32(res32_hi, res32_hi);
              res16_hi = _mm_min_epi16(res16_hi, clip_pixel);
              _mm_storel_epi64(dst16_4, res16_hi);
            } else {
              res_hi = _mm_packus_epi32(res_hi, res_hi);
              _mm_storel_epi64(p4, res_hi);
            }
          }
        } else {
          // Round and pack into bd-bit pixels
          const __m128i round_const =
              _mm_set1_epi32(-(1 << (bd + reduce_bits_vert - 1)) +
                             ((1 << reduce_bits_vert) >> 1));

          const __m128i res_lo_round = _mm_srai_epi32(
              _mm_add_epi32(res_lo, round_const), reduce_bits_vert);
          const __m128i res_hi_round = _mm_srai_epi32(
              _mm_add_epi32(res_hi, round_const), reduce_bits_vert);

          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
          // Clamp res_16bit to the range [0, 2^bd - 1]
          const __m128i max_val = _mm_set1_epi16((1 << bd) - 1);
          const __m128i zero = _mm_setzero_si128();
          res_16bit = _mm_max_epi16(_mm_min_epi16(res_16bit, max_val), zero);

          // Store, blending with 'pred' if needed
          __m128i *const p = (__m128i *)&pred[(i + k + 4) * p_stride + j];

          // Note: If we're outputting a 4x4 block, we need to be very careful
          // to only output 4 pixels at this point, to avoid encode/decode
          // mismatches when encoding with multiple threads.
          if (p_width == 4) {
            _mm_storel_epi64(p, res_16bit);
          } else {
            _mm_storeu_si128(p, res_16bit);
          }
        }
      }
    }
  }
}