/*
 * Copyright (c) 2018, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <immintrin.h>
#include <assert.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/x86/convolve_avx2.h"
#include "aom_dsp/x86/convolve_common_intrin.h"
#include "aom_dsp/x86/convolve_sse4_1.h"
#include "aom_dsp/x86/synonyms.h"
#include "aom_dsp/aom_dsp_common.h"
#include "aom_dsp/aom_filter.h"
#include "av1/common/convolve.h"

void av1_highbd_jnt_convolve_2d_copy_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i zero = _mm256_setzero_si256();
  int i, j;

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const __m256i offset_const_16b = _mm256_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits <= 4);

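  // Widths that are a multiple of 16 are copied 16 pixels (one 256-bit
  // register) per row; other widths are handled two rows at a time, up to
  // 8 pixels per iteration, with 4-wide columns written via 64-bit stores.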
  if (!(w % 16)) {
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 16) {
        const __m256i src_16bit =
            _mm256_loadu_si256((__m256i *)(&src[i * src_stride + j]));

        const __m256i res = _mm256_sll_epi16(src_16bit, left_shift);

        if (do_average) {
          const __m256i data_0 =
              _mm256_loadu_si256((__m256i *)(&dst[i * dst_stride + j]));

          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_0, zero);
          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_0, zero);

          const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
          const __m256i res_unsigned_lo =
              _mm256_add_epi32(res_32b_lo, offset_const);

          const __m256i comp_avg_res_lo = highbd_comp_avg(
              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

          const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
          const __m256i res_unsigned_hi =
              _mm256_add_epi32(res_32b_hi, offset_const);

          const __m256i comp_avg_res_hi = highbd_comp_avg(
              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m256i round_result_lo = highbd_convolve_rounding(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m256i round_result_hi = highbd_convolve_rounding(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_16b =
              _mm256_packus_epi32(round_result_lo, round_result_hi);
          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

          _mm256_store_si256((__m256i *)(&dst0[i * dst_stride0 + j]), res_clip);
        } else {
          const __m256i res_unsigned_16b =
              _mm256_adds_epu16(res, offset_const_16b);

          _mm256_store_si256((__m256i *)(&dst[i * dst_stride + j]),
                             res_unsigned_16b);
        }
      }
    }
  } else if (!(w % 4)) {
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_row_0 =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j + src_stride]));
        // since not all compilers yet support _mm256_set_m128i()
        const __m256i src_10 = _mm256_insertf128_si256(
            _mm256_castsi128_si256(src_row_0), src_row_1, 1);

        const __m256i res = _mm256_sll_epi16(src_10, left_shift);

        if (w - j < 8) {
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

            const __m256i res_32b = _mm256_unpacklo_epi16(res, zero);
            const __m256i res_unsigned_lo =
                _mm256_add_epi32(res_32b, offset_const);

            const __m256i comp_avg_res = highbd_comp_avg(
                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i round_result = highbd_convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result, round_result);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            const __m256i res_unsigned_16b =
                _mm256_adds_epu16(res, offset_const_16b);

            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);

            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                             res_1);
          }
        } else {
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

            const __m256i res_32b_lo = _mm256_unpacklo_epi16(res, zero);
            const __m256i res_unsigned_lo =
                _mm256_add_epi32(res_32b_lo, offset_const);

            const __m256i comp_avg_res_lo = highbd_comp_avg(
                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i res_32b_hi = _mm256_unpackhi_epi16(res, zero);
            const __m256i res_unsigned_hi =
                _mm256_add_epi32(res_32b_hi, offset_const);

            const __m256i comp_avg_res_hi = highbd_comp_avg(
                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i round_result_lo =
                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                         &rounding_const, rounding_shift);
            const __m256i round_result_hi =
                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                         &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result_lo, round_result_hi);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_store_si128(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            const __m256i res_unsigned_16b =
                _mm256_adds_epu16(res, offset_const_16b);
            const __m128i res_0 = _mm256_castsi256_si128(res_unsigned_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_unsigned_16b, 1);

            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }
      }
    }
  }
}

void av1_highbd_jnt_convolve_2d_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  DECLARE_ALIGNED(32, int16_t, im_block[(MAX_SB_SIZE + MAX_FILTER_TAP) * 8]);
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  int im_h = h + filter_params_y->taps - 1;
  int im_stride = 8;
  int i, j;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride - fo_horiz;

  // Check that, even with 12-bit input, the intermediate values will fit
  // into an unsigned 16-bit intermediate array.
  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);

  __m256i s[8], coeffs_y[4], coeffs_x[4];
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i zero = _mm256_setzero_si256();

  const __m256i round_const_x = _mm256_set1_epi32(
      ((1 << conv_params->round_0) >> 1) + (1 << (bd + FILTER_BITS - 1)));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);

  const __m256i round_const_y = _mm256_set1_epi32(
      ((1 << conv_params->round_1) >> 1) -
      (1 << (bd + 2 * FILTER_BITS - conv_params->round_0 - 1)));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);

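  // 'offset' biases the compound intermediate values so that they stay
  // non-negative and fit in the unsigned 16-bit conv_params->dst buffer;
  // highbd_convolve_rounding() subtracts the same bias again before the
  // final rounding and clamp to the bit depth.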
  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);

  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
    {
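      // Two input rows are filtered per iteration, one per 128-bit lane.
      // _mm256_alignr_epi8 shifts the window by 0/4/8/12 bytes for the even
      // output pixels and by 2/6/10/14 bytes for the odd ones before
      // convolve() applies the x-filter coefficients.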
      for (i = 0; i < im_h; i += 2) {
        const __m256i row0 =
            _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
        __m256i row1 = _mm256_set1_epi16(0);
        if (i + 1 < im_h)
          row1 =
              _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

        const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
        const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

        // even pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 0);
        s[1] = _mm256_alignr_epi8(r1, r0, 4);
        s[2] = _mm256_alignr_epi8(r1, r0, 8);
        s[3] = _mm256_alignr_epi8(r1, r0, 12);

        __m256i res_even = convolve(s, coeffs_x);
        res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                    round_shift_x);

        // odd pixels
        s[0] = _mm256_alignr_epi8(r1, r0, 2);
        s[1] = _mm256_alignr_epi8(r1, r0, 6);
        s[2] = _mm256_alignr_epi8(r1, r0, 10);
        s[3] = _mm256_alignr_epi8(r1, r0, 14);

        __m256i res_odd = convolve(s, coeffs_x);
        res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                   round_shift_x);

        __m256i res_even1 = _mm256_packs_epi32(res_even, res_even);
        __m256i res_odd1 = _mm256_packs_epi32(res_odd, res_odd);
        __m256i res = _mm256_unpacklo_epi16(res_even1, res_odd1);

        _mm256_store_si256((__m256i *)&im_block[i * im_stride], res);
      }
    }

    /* Vertical filter */
    {
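      // The vertical pass keeps an eight-row window of the intermediate block
      // in s[]: s[0..3] hold the interleaved low halves of consecutive row
      // pairs and s[4..7] the high halves. Each iteration emits two output
      // rows and then slides the window down by two rows.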
      __m256i s0 = _mm256_loadu_si256((__m256i *)(im_block + 0 * im_stride));
      __m256i s1 = _mm256_loadu_si256((__m256i *)(im_block + 1 * im_stride));
      __m256i s2 = _mm256_loadu_si256((__m256i *)(im_block + 2 * im_stride));
      __m256i s3 = _mm256_loadu_si256((__m256i *)(im_block + 3 * im_stride));
      __m256i s4 = _mm256_loadu_si256((__m256i *)(im_block + 4 * im_stride));
      __m256i s5 = _mm256_loadu_si256((__m256i *)(im_block + 5 * im_stride));

      s[0] = _mm256_unpacklo_epi16(s0, s1);
      s[1] = _mm256_unpacklo_epi16(s2, s3);
      s[2] = _mm256_unpacklo_epi16(s4, s5);

      s[4] = _mm256_unpackhi_epi16(s0, s1);
      s[5] = _mm256_unpackhi_epi16(s2, s3);
      s[6] = _mm256_unpackhi_epi16(s4, s5);

      for (i = 0; i < h; i += 2) {
        const int16_t *data = &im_block[i * im_stride];

        const __m256i s6 =
            _mm256_loadu_si256((__m256i *)(data + 6 * im_stride));
        const __m256i s7 =
            _mm256_loadu_si256((__m256i *)(data + 7 * im_stride));

        s[3] = _mm256_unpacklo_epi16(s6, s7);
        s[7] = _mm256_unpackhi_epi16(s6, s7);

        const __m256i res_a = convolve(s, coeffs_y);

        const __m256i res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a, round_const_y), round_shift_y);

        const __m256i res_unsigned_lo =
            _mm256_add_epi32(res_a_round, offset_const);

        if (w - j < 8) {
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

            const __m256i comp_avg_res = highbd_comp_avg(
                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i round_result = highbd_convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result, round_result);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                             res_1);
          }
        } else {
          const __m256i res_b = convolve(s + 4, coeffs_y);
          const __m256i res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b, round_const_y), round_shift_y);

          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);

          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

            const __m256i comp_avg_res_lo = highbd_comp_avg(
                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
            const __m256i comp_avg_res_hi = highbd_comp_avg(
                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i round_result_lo =
                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                         &rounding_const, rounding_shift);
            const __m256i round_result_hi =
                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                         &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result_lo, round_result_hi);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_store_si128(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }

        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}

void av1_highbd_jnt_convolve_x_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  const int fo_horiz = filter_params_x->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_horiz;
  const int bits = FILTER_BITS - conv_params->round_1;
  (void)filter_params_y;
  (void)subpel_y_q4;

  int i, j;
  __m256i s[4], coeffs_x[4];

  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i zero = _mm256_setzero_si256();

  const __m256i round_const_x =
      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits >= 0);
  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);

  for (j = 0; j < w; j += 8) {
    /* Horizontal filter */
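    // After the round_0 shift the horizontal result is shifted back up by
    // 'bits' (FILTER_BITS - round_1) so its scale matches the other jnt
    // convolve paths before the compound offset is added.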
    for (i = 0; i < h; i += 2) {
      const __m256i row0 =
          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
      __m256i row1 =
          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);

      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);

      // even pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 0);
      s[1] = _mm256_alignr_epi8(r1, r0, 4);
      s[2] = _mm256_alignr_epi8(r1, r0, 8);
      s[3] = _mm256_alignr_epi8(r1, r0, 12);

      __m256i res_even = convolve(s, coeffs_x);
      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
                                  round_shift_x);

      // odd pixels
      s[0] = _mm256_alignr_epi8(r1, r0, 2);
      s[1] = _mm256_alignr_epi8(r1, r0, 6);
      s[2] = _mm256_alignr_epi8(r1, r0, 10);
      s[3] = _mm256_alignr_epi8(r1, r0, 14);

      __m256i res_odd = convolve(s, coeffs_x);
      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
                                 round_shift_x);

      res_even = _mm256_sll_epi32(res_even, round_shift_bits);
      res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);

      __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);

      __m256i res_unsigned_lo = _mm256_add_epi32(res1, offset_const);

      if (w - j < 8) {
        if (do_average) {
          const __m256i data_0 = _mm256_castsi128_si256(
              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
          const __m256i data_01 =
              _mm256_permute2x128_si256(data_0, data_1, 0x20);

          const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

          const __m256i comp_avg_res = highbd_comp_avg(
              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

          const __m256i round_result = highbd_convolve_rounding(
              &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_16b =
              _mm256_packus_epi32(round_result, round_result);
          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
          _mm_storel_epi64(
              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
        } else {
          __m256i res_16b =
              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                           res_1);
        }
      } else {
        __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
        __m256i res_unsigned_hi = _mm256_add_epi32(res2, offset_const);

        if (do_average) {
          const __m256i data_0 = _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
          const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
          const __m256i data_01 =
              _mm256_permute2x128_si256(data_0, data_1, 0x20);

          const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
          const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

          const __m256i comp_avg_res_lo = highbd_comp_avg(
              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
          const __m256i comp_avg_res_hi = highbd_comp_avg(
              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m256i round_result_lo = highbd_convolve_rounding(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m256i round_result_hi = highbd_convolve_rounding(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m256i res_16b =
              _mm256_packus_epi32(round_result_lo, round_result_hi);
          const __m256i res_clip = _mm256_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_0 = _mm256_castsi256_si128(res_clip);
          const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]),
                          res_1);
        } else {
          __m256i res_16b =
              _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
          const __m128i res_0 = _mm256_castsi256_si128(res_16b);
          const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                          res_1);
        }
      }
    }
  }
}

void av1_highbd_jnt_convolve_y_avx2(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  const int fo_vert = filter_params_y->taps / 2 - 1;
  const uint16_t *const src_ptr = src - fo_vert * src_stride;
  const int bits = FILTER_BITS - conv_params->round_0;
  (void)filter_params_x;
  (void)subpel_x_q4;

  assert(bits >= 0);
  int i, j;
  __m256i s[8], coeffs_y[4];
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;

  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m256i wt0 = _mm256_set1_epi32(w0);
  const __m256i wt1 = _mm256_set1_epi32(w1);
  const __m256i round_const_y =
      _mm256_set1_epi32(((1 << conv_params->round_1) >> 1));
  const __m128i round_shift_y = _mm_cvtsi32_si128(conv_params->round_1);
  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m256i offset_const = _mm256_set1_epi32(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m256i rounding_const = _mm256_set1_epi32((1 << rounding_shift) >> 1);
  const __m256i clip_pixel_to_bd =
      _mm256_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
  const __m256i zero = _mm256_setzero_si256();

  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);

  for (j = 0; j < w; j += 8) {
    const uint16_t *data = &src_ptr[j];
    /* Vertical filter */
    {
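      // Each sNM register holds eight pixels of source row N in its low
      // 128-bit lane and the corresponding pixels of row M in its high lane,
      // so one convolve() call produces results for two output rows at once.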
      __m256i src6;
      __m256i s01 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 0 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          0x20);
      __m256i s12 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 1 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          0x20);
      __m256i s23 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 2 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          0x20);
      __m256i s34 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 3 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          0x20);
      __m256i s45 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 4 * src_stride))),
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          0x20);
      src6 = _mm256_castsi128_si256(
          _mm_loadu_si128((__m128i *)(data + 6 * src_stride)));
      __m256i s56 = _mm256_permute2x128_si256(
          _mm256_castsi128_si256(
              _mm_loadu_si128((__m128i *)(data + 5 * src_stride))),
          src6, 0x20);

      s[0] = _mm256_unpacklo_epi16(s01, s12);
      s[1] = _mm256_unpacklo_epi16(s23, s34);
      s[2] = _mm256_unpacklo_epi16(s45, s56);

      s[4] = _mm256_unpackhi_epi16(s01, s12);
      s[5] = _mm256_unpackhi_epi16(s23, s34);
      s[6] = _mm256_unpackhi_epi16(s45, s56);

      for (i = 0; i < h; i += 2) {
        data = &src_ptr[i * src_stride + j];

        const __m256i s67 = _mm256_permute2x128_si256(
            src6,
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            0x20);

        src6 = _mm256_castsi128_si256(
            _mm_loadu_si128((__m128i *)(data + 8 * src_stride)));

        const __m256i s78 = _mm256_permute2x128_si256(
            _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(data + 7 * src_stride))),
            src6, 0x20);

        s[3] = _mm256_unpacklo_epi16(s67, s78);
        s[7] = _mm256_unpackhi_epi16(s67, s78);

        const __m256i res_a = convolve(s, coeffs_y);

        __m256i res_a_round = _mm256_sll_epi32(res_a, round_shift_bits);
        res_a_round = _mm256_sra_epi32(
            _mm256_add_epi32(res_a_round, round_const_y), round_shift_y);

        __m256i res_unsigned_lo = _mm256_add_epi32(res_a_round, offset_const);

        if (w - j < 8) {
          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadl_epi64(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0 = _mm256_unpacklo_epi16(data_01, zero);

            const __m256i comp_avg_res = highbd_comp_avg(
                &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i round_result = highbd_convolve_rounding(
                &comp_avg_res, &offset_const, &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result, round_result);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_storel_epi64(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_lo);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                             res_1);
          }
        } else {
          const __m256i res_b = convolve(s + 4, coeffs_y);
          __m256i res_b_round = _mm256_sll_epi32(res_b, round_shift_bits);
          res_b_round = _mm256_sra_epi32(
              _mm256_add_epi32(res_b_round, round_const_y), round_shift_y);

          __m256i res_unsigned_hi = _mm256_add_epi32(res_b_round, offset_const);

          if (do_average) {
            const __m256i data_0 = _mm256_castsi128_si256(
                _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j])));
            const __m256i data_1 = _mm256_castsi128_si256(_mm_loadu_si128(
                (__m128i *)(&dst[i * dst_stride + j + dst_stride])));
            const __m256i data_01 =
                _mm256_permute2x128_si256(data_0, data_1, 0x20);

            const __m256i data_ref_0_lo = _mm256_unpacklo_epi16(data_01, zero);
            const __m256i data_ref_0_hi = _mm256_unpackhi_epi16(data_01, zero);

            const __m256i comp_avg_res_lo = highbd_comp_avg(
                &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
            const __m256i comp_avg_res_hi = highbd_comp_avg(
                &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

            const __m256i round_result_lo =
                highbd_convolve_rounding(&comp_avg_res_lo, &offset_const,
                                         &rounding_const, rounding_shift);
            const __m256i round_result_hi =
                highbd_convolve_rounding(&comp_avg_res_hi, &offset_const,
                                         &rounding_const, rounding_shift);

            const __m256i res_16b =
                _mm256_packus_epi32(round_result_lo, round_result_hi);
            const __m256i res_clip =
                _mm256_min_epi16(res_16b, clip_pixel_to_bd);

            const __m128i res_0 = _mm256_castsi256_si128(res_clip);
            const __m128i res_1 = _mm256_extracti128_si256(res_clip, 1);

            _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_0);
            _mm_store_si128(
                (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
          } else {
            __m256i res_16b =
                _mm256_packus_epi32(res_unsigned_lo, res_unsigned_hi);
            const __m128i res_0 = _mm256_castsi256_si128(res_16b);
            const __m128i res_1 = _mm256_extracti128_si256(res_16b, 1);

            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]), res_0);
            _mm_store_si128((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                            res_1);
          }
        }
        s[0] = s[1];
        s[1] = s[2];
        s[2] = s[3];

        s[4] = s[5];
        s[5] = s[6];
        s[6] = s[7];
      }
    }
  }
}