/*
 * Copyright (c) 2021, Alliance for Open Media. All rights reserved
 *
 * This source code is subject to the terms of the BSD 3-Clause Clear License
 * and the Alliance for Open Media Patent License 1.0. If the BSD 3-Clause Clear
 * License was not distributed with this source code in the LICENSE file, you
 * can obtain it at aomedia.org/license/software-license/bsd-3-c-c/. If the
 * Alliance for Open Media Patent License 1.0 was not distributed with this
 * source code in the PATENTS file, you can obtain it at
 * aomedia.org/license/patent-license/.
 */

#include <immintrin.h>  // AVX2

#include "config/aom_dsp_rtcd.h"

#include "aom_ports/mem.h"

/* clang-format off */
DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
  2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
};
/* clang-format on */

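// Each 32-byte row of bilinear_filters_avx2 holds one sub-pel filter phase as
// interleaved byte pairs (16 - 2 * k, 2 * k) for phases k = 0..7; callers
// index it with (offset << 5). FILTER_SRC below applies such a pair to
// adjacent samples a and b as (a * f0 + b * f1 + 8) >> 4, i.e. bilinear
// interpolation with rounding.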
#define FILTER_SRC(filter)                               \
  /* filter the source */                                \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
                                                         \
  /* add 8 to source */                                  \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8);        \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8);        \
                                                         \
  /* divide source by 16 */                              \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4);         \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);

#define MERGE_WITH_SRC(src_reg, reg)               \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);

#define LOAD_SRC_DST                                    \
  /* load source and destination */                     \
  src_reg = _mm256_loadu_si256((__m256i const *)(src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *)(dst));

#define AVG_NEXT_SRC(src_reg, size_stride)                                 \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  /* average between current and next stride source */                    \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC(src_reg, size_stride)                               \
  src_next_reg = _mm256_loadu_si256((__m256i const *)(src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define CALC_SUM_SSE_INSIDE_LOOP                          \
  /* expand each byte to 2 bytes */                       \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg);   \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg);   \
  /* source - dest */                                     \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo);  \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi);  \
  /* calculate sum */                                     \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo);        \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi);        \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  /* calculate sse */                                     \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo);        \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);

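// CALC_SUM_SSE_INSIDE_LOOP accumulates the (src - dst) differences in sixteen
// 16-bit lanes of sum_reg and their squares in eight 32-bit lanes of sse_reg.
// CALC_SUM_AND_SSE then sign-extends the 16-bit sums (using the compare mask
// res_cmp), reduces both registers horizontally across lanes, stores the
// squared error to *sse and leaves the signed difference total in sum.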
// final calculation to sum and sse
#define CALC_SUM_AND_SSE                                                   \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg);                         \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8);                              \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp);                    \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp);                    \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi);                      \
                                                                           \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4);                              \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8);                              \
                                                                           \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi);                         \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  *((int *)sse) = _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) +     \
                  _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4);                              \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi);                         \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));

// Functions related to sub pixel variance width 16
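// These variants process two rows per iteration: the 16-byte row i is loaded
// into the low 128-bit lane and row i + 1 into the high lane of one 256-bit
// register, so the same 32-wide arithmetic above covers a 16x2 block.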
#define LOAD_SRC_DST_INSERT(src_stride, dst_stride)              \
  /* load 2 rows of source and destination and insert */         \
  src_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);        \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

#define AVG_NEXT_SRC_INSERT(src_reg, size_stride)                              \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1);              \
  /* average between current and next stride source */                        \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);

#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride)                            \
  src_next_reg = _mm256_inserti128_si256(                                      \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1);      \
  MERGE_WITH_SRC(src_reg, src_next_reg)

#define LOAD_SRC_NEXT_BYTE_INSERT                                    \
  /* load source and another source from next row */                 \
  src_reg = _mm256_inserti128_si256(                                 \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),     \
      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);            \
  /* load source and next row source from 1 byte onwards */          \
  src_next_reg = _mm256_inserti128_si256(                            \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);

#define LOAD_DST_INSERT                                          \
  dst_reg = _mm256_inserti128_si256(                             \
      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);

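// The 128-bit helpers below handle the last two rows of the width-16 paths
// that filter vertically: only one extra source row (the row just past the
// block) is still needed there, so a single 16-byte load replaces the usual
// two-row insert load.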
#define LOAD_SRC_MERGE_128BIT(filter)                        \
  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));     \
  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);  \
  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);  \
  __m128i filter_128bit = _mm256_castsi256_si128(filter);    \
  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);

#define FILTER_SRC_128BIT(filter)             \
  /* filter the source */                     \
  src_lo = _mm_maddubs_epi16(src_lo, filter); \
  src_hi = _mm_maddubs_epi16(src_hi, filter); \
                                              \
  /* add 8 to source */                       \
  src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
  src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
                                              \
  /* divide source by 16 */                   \
  src_lo = _mm_srai_epi16(src_lo, 4);         \
  src_hi = _mm_srai_epi16(src_hi, 4);

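// Computes the sum of differences and the sum of squared differences between
// a bilinearly interpolated 32-wide source block and the reference dst.
// x_offset and y_offset select the sub-pel phase (0 = full-pel, 4 = half-pel,
// anything else indexes bilinear_filters_avx2); the squared error is written
// to *sse and the signed difference total is returned so the caller can form
// the variance.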
unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 4 and y_offset = 0
  } else if (x_offset == 4) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 4 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 4 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 4
    } else if (y_offset == 4) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}

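// Same as above for 16-wide blocks: two rows are packed into the low and high
// 128-bit lanes of each 256-bit register, so every loop iteration below
// consumes two source and two reference rows.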
unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
                                             int x_offset, int y_offset,
                                             const uint8_t *dst, int dst_stride,
                                             int height, unsigned int *sse) {
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 0 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg;
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        AVG_NEXT_SRC_INSERT(src_reg, src_stride)
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        MERGE_NEXT_SRC_INSERT(src_reg, src_stride)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
    }
    // x_offset = 4 and y_offset = 0
  } else if (x_offset == 4) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        /* average the source with the source shifted by one byte */
        src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = 4 and y_offset = 4
    } else if (y_offset == 4) {
      __m256i src_next_reg, src_avg, src_temp;
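      // Each iteration keeps the horizontally averaged rows (i, i + 1) in
      // src_avg; the cross-lane permute builds (i + 1, i + 2) from the next
      // pair so one vertical average yields the two half-pel output rows.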
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
        src_temp = _mm256_avg_epu8(src_avg, src_temp);
        LOAD_DST_INSERT
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_temp, zero_reg)
        // save current source average
        src_avg = src_next_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride << 1;
        src += src_stride << 1;
      }
      // last 2 rows processing happens here
      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
      src_next_reg = _mm256_permute2x128_si256(
          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
      LOAD_DST_INSERT
      src_avg = _mm256_avg_epu8(src_avg, src_next_reg);
      MERGE_WITH_SRC(src_avg, zero_reg)
      CALC_SUM_SSE_INSIDE_LOOP
    } else {
      // x_offset = 4 and y_offset = bilin interpolation
      __m256i filter, pw8, src_next_reg, src_avg, src_temp;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_avg, src_temp)
        // save current source average
        src_avg = src_next_reg;
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride << 1;
        src += src_stride << 1;
      }
      // last 2 rows processing happens here
      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
      src_next_reg = _mm256_permute2x128_si256(
          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
      LOAD_DST_INSERT
      MERGE_WITH_SRC(src_avg, src_next_reg)
      FILTER_SRC(filter)
      CALC_SUM_SSE_INSIDE_LOOP
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i += 2) {
        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
        MERGE_NEXT_SRC_INSERT(src_reg, 1)
        FILTER_SRC(filter)
        CALC_SUM_SSE_INSIDE_LOOP
        src += (src_stride << 1);
        dst += (dst_stride << 1);
      }
      // x_offset = bilin interpolation and y_offset = 4
    } else if (y_offset == 4) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      MERGE_WITH_SRC(src_reg, src_next_reg)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_reg, src_next_reg)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
        // average the previous packed rows with the current ones
        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src_pack = src_reg;
        src += src_stride << 1;
        dst += dst_stride << 1;
      }
      // last 2 rows processing happens here
      LOAD_SRC_MERGE_128BIT(filter)
      LOAD_DST_INSERT
      FILTER_SRC_128BIT(filter_128bit)
      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
      src_next_reg = _mm256_permute2x128_si256(
          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
      // average the previous packed rows with the current ones
      src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
      MERGE_WITH_SRC(src_pack, zero_reg)
      CALC_SUM_SSE_INSIDE_LOOP
    } else {
      // x_offset = bilin interpolation and y_offset = bilin interpolation
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load and insert source and next row source
      LOAD_SRC_NEXT_BYTE_INSERT
      MERGE_WITH_SRC(src_reg, src_next_reg)
      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      src += src_stride << 1;
      for (i = 0; i < height - 2; i += 2) {
        LOAD_SRC_NEXT_BYTE_INSERT
        LOAD_DST_INSERT
        MERGE_WITH_SRC(src_reg, src_next_reg)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
        // merge the previous packed rows with the current ones
        MERGE_WITH_SRC(src_pack, src_next_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride << 1;
        dst += dst_stride << 1;
      }
      // last 2 rows processing happens here
      LOAD_SRC_MERGE_128BIT(xfilter)
      LOAD_DST_INSERT
      FILTER_SRC_128BIT(filter_128bit)
      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
      src_next_reg = _mm256_permute2x128_si256(
          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
      MERGE_WITH_SRC(src_pack, src_next_reg)
      FILTER_SRC(yfilter)
      CALC_SUM_SSE_INSIDE_LOOP
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}

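// Compound-prediction variant: the sub-pel filtered prediction is additionally
// averaged with the second predictor sec (advancing by sec_stride per row)
// before the difference against dst is accumulated.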
unsigned int aom_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse) {
  __m256i sec_reg;
  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
  __m256i zero_reg;
  int i, sum;
  sum_reg = _mm256_set1_epi16(0);
  sse_reg = _mm256_set1_epi16(0);
  zero_reg = _mm256_set1_epi16(0);

  // x_offset = 0 and y_offset = 0
  if (x_offset == 0) {
    if (y_offset == 0) {
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    } else if (y_offset == 8) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, src_stride)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 0 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg;

      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, src_stride)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
    }
    // x_offset = 8 and y_offset = 0
  } else if (x_offset == 8) {
    if (y_offset == 0) {
      __m256i src_next_reg;
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_reg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = 8
    } else if (y_offset == 8) {
      __m256i src_next_reg, src_avg;
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        // average the previous and current row averages
        src_avg = _mm256_avg_epu8(src_avg, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        sec += sec_stride;
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = 8 and y_offset = bilin interpolation
    } else {
      __m256i filter, pw8, src_next_reg, src_avg;
      y_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      AVG_NEXT_SRC(src_reg, 1)
      for (i = 0; i < height; i++) {
        // save current source average
        src_avg = src_reg;
        src += src_stride;
        LOAD_SRC_DST
        AVG_NEXT_SRC(src_reg, 1)
        MERGE_WITH_SRC(src_avg, src_reg)
        FILTER_SRC(filter)
        src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_avg = _mm256_avg_epu8(src_avg, sec_reg);
        // expand each byte to 2 bytes
        MERGE_WITH_SRC(src_avg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
    // x_offset = bilin interpolation and y_offset = 0
  } else {
    if (y_offset == 0) {
      __m256i filter, pw8, src_next_reg;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      for (i = 0; i < height; i++) {
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_reg = _mm256_avg_epu8(src_reg, sec_reg);
        MERGE_WITH_SRC(src_reg, zero_reg)
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        src += src_stride;
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = 8
    } else if (y_offset == 8) {
      __m256i filter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      filter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      pw8 = _mm256_set1_epi16(8);
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)
      FILTER_SRC(filter)
      // pack the 16-bit results back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(filter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // average the previous packed row with the current one
        src_pack = _mm256_avg_epu8(src_pack, src_reg);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        sec += sec_stride;
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
      // x_offset = bilin interpolation and y_offset = bilin interpolation
    } else {
      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
      x_offset <<= 5;
      xfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + x_offset));
      y_offset <<= 5;
      yfilter = _mm256_load_si256(
          (__m256i const *)(bilinear_filters_avx2 + y_offset));
      pw8 = _mm256_set1_epi16(8);
      // load the source and a second source offset by one byte
      src_reg = _mm256_loadu_si256((__m256i const *)(src));
      MERGE_NEXT_SRC(src_reg, 1)

      FILTER_SRC(xfilter)
      // pack the 16-bit results back to 8 bits in the low and high lanes
      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
      for (i = 0; i < height; i++) {
        src += src_stride;
        LOAD_SRC_DST
        MERGE_NEXT_SRC(src_reg, 1)
        FILTER_SRC(xfilter)
        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        // merge the previous packed row with the current one
        MERGE_WITH_SRC(src_pack, src_reg)
        // filter the source
        FILTER_SRC(yfilter)
        src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
        sec_reg = _mm256_loadu_si256((__m256i const *)(sec));
        src_pack = _mm256_avg_epu8(src_pack, sec_reg);
        MERGE_WITH_SRC(src_pack, zero_reg)
        src_pack = src_reg;
        sec += sec_stride;
        CALC_SUM_SSE_INSIDE_LOOP
        dst += dst_stride;
      }
    }
  }
  CALC_SUM_AND_SSE
  _mm256_zeroupper();
  return sum;
}