/*
 * Copyright (c) 2016, Alliance for Open Media. All rights reserved.
 *
 * This source code is subject to the terms of the BSD 2 Clause License and
 * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
 * was not distributed with this source code in the LICENSE file, you can
 * obtain it at www.aomedia.org/license/software. If the Alliance for Open
 * Media Patent License 1.0 was not distributed with this source code in the
 * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
 */

#include <tmmintrin.h>

#include "config/aom_dsp_rtcd.h"

#include "aom_dsp/aom_filter.h"
#include "aom_dsp/x86/convolve.h"
#include "aom_dsp/x86/convolve_sse2.h"
#include "aom_dsp/x86/convolve_ssse3.h"
#include "aom_dsp/x86/mem_sse2.h"
#include "aom_dsp/x86/transpose_sse2.h"
#include "aom_mem/aom_mem.h"
#include "aom_ports/mem.h"
#include "aom_ports/emmintrin_compat.h"

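// Shuffle-control masks for the 4-tap horizontal kernels below. Each 32-byte
// entry gathers pairs of adjacent source bytes starting at offset 0, 2, 4 and
// 6 respectively, and consists of two identical 16-byte halves (presumably so
// the table can also be shared with 256-bit kernels; only the low 128 bits
// are loaded here). This file only uses the entries at offsets 32 and 64.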
DECLARE_ALIGNED(32, static const uint8_t, filt_h4[]) = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 0, 1, 1,
  2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 2, 3, 3, 4, 4, 5,
  5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 2, 3, 3, 4, 4, 5, 5, 6, 6,
  7, 7, 8, 8, 9, 9, 10, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
  10, 11, 11, 12, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
  12, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 6, 7,
  7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};

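// Shuffle-control mask for aom_filter_block1d4_h4_ssse3(): the n-th 32-bit
// lane gathers the four consecutive source bytes at offsets n + 2 .. n + 5,
// i.e. the bytes that line up with the middle four taps of the 8-tap filter
// for the n-th output pixel.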
DECLARE_ALIGNED(32, static const uint8_t, filtd4[]) = {
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
  2, 3, 4, 5, 3, 4, 5, 6, 4, 5, 6, 7, 5, 6, 7, 8,
};

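// The *_h4/*_v4 kernels below apply only the middle four taps (indices 2..5)
// of the 8-tap kernel, so they implicitly assume the outer taps are zero.
// The 16-bit taps are shifted right by 1 before being packed to signed bytes,
// presumably so the _mm_maddubs_epi16() products cannot saturate; the lost
// bit is compensated for by rounding with 32 and shifting by 6 rather than
// the usual FILTER_BITS (7).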
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load 16 bytes of the source row
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // add the rounding offset and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16 bit value to 8 bits; only the low 4 bytes are kept
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((int *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32;
  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
      srcReg6, srcReg56;
  __m128i srcReg23_34_lo, srcReg45_56_lo;
  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
  __m128i resReglo, resReghi;
  __m128i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);

  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);

    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);

    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());

    // add the rounding offset and shift each 16 bit value right by 6 bits
    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm_srai_epi16(resReglo, 6);
    resReghi = _mm_srai_epi16(resReghi, 6);

    // shrink each 16 bit value to 8 bits; the low lane of each register
    // contains one row of convolve results
    resReglo = _mm_packus_epi16(resReglo, resReglo);
    resReghi = _mm_packus_epi16(resReghi, resReghi);

    src_ptr += src_stride;

    *((int *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    *((int *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4 = srcReg6;
  }
}

static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16 bit value to 8 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d8_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23, srcReg34, srcReg45, srcReg56;
  __m128i resReg23, resReg34, resReg45, resReg56;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45 = _mm_unpacklo_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56 = _mm_unpacklo_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23 = _mm_maddubs_epi16(srcReg23, secondFilters);
    resReg34 = _mm_maddubs_epi16(srcReg34, secondFilters);
    resReg45 = _mm_maddubs_epi16(srcReg45, thirdFilters);
    resReg56 = _mm_maddubs_epi16(srcReg56, thirdFilters);

    // add and saturate the results together
    resReg23_45 = _mm_adds_epi16(resReg23, resReg45);
    resReg34_56 = _mm_adds_epi16(resReg34, resReg56);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    resReg23_45 = _mm_adds_epi16(resReg23_45, addFilterReg32);
    resReg34_56 = _mm_adds_epi16(resReg34_56, addFilterReg32);
    resReg23_45 = _mm_srai_epi16(resReg23_45, 6);
    resReg34_56 = _mm_srai_epi16(resReg34_56, 6);

    // shrink each 16 bit value to 8 bits; the low lane of each register
    // contains one row of convolve results
    resReg23_45 = _mm_packus_epi16(resReg23_45, _mm_setzero_si128());
    resReg34_56 = _mm_packus_epi16(resReg34_56, _mm_setzero_si128());

    src_ptr += src_stride;

    _mm_storel_epi64((__m128i *)output_ptr, (resReg23_45));
    _mm_storel_epi64((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg23 = srcReg45;
    srcReg34 = srcReg56;
    srcReg4 = srcReg6;
  }
}

static void aom_filter_block1d16_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1, srcReg32b2;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // load the next 16 bytes, 8 bytes past the first load
    // (part of the data overlaps the earlier read)
    srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8));

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6);

    // shrink each 16 bit value to 8 bits; the low lane contains the first
    // eight convolve results and the high lane contains the second eight
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);

    src_ptr += src_pixels_per_line;

    _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}

static void aom_filter_block1d16_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Convert the 16-bit (short) filter taps to 8 bits (byte) and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  // keep consecutive rows in the same 128-bit register
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);

    // add and saturate the results together
    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);

    // add the rounding offset and shift each 16 bit value right by 6 bits
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);

    // shrink each 16 bit value to 8 bits; each register holds one full row
    // of convolve results
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);

    src_ptr += src_stride;

    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for the next iteration
    srcReg23_lo = srcReg45_lo;
    srcReg34_lo = srcReg56_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}

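// The helpers below implement aom_scaled_2d_ssse3(). Unlike the kernels
// above, which apply one filter to the whole block, they apply the full
// 8-tap filter at a per-pixel fractional position: the horizontal pass
// transposes 8x8 (or 4x4) tiles so that convolve8_8_ssse3() can be reused
// for both directions, and whole-pixel positions fall back to a plain copy.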
static INLINE __m128i shuffle_filter_convolve8_8_ssse3(
    const __m128i *const s, const int16_t *const filter) {
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);
  return convolve8_8_ssse3(s, f);
}

static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const x_filter) {
  __m128i s[8], ss[4], temp;

  load_8bit_8x8(src, src_stride, s);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
  // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
  // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
  transpose_16bit_4x8(s, ss);
  temp = shuffle_filter_convolve8_8_ssse3(ss, x_filter);
  // shrink each 16 bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void transpose8x8_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i s[8];

  load_8bit_8x8(src, src_stride, s);
  transpose_8bit_8x8(s, s);
  store_8bit_8x8(s, dst, dst_stride);
}

static void scaledconvolve_horiz_w8(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = h + (8 - (h & 0x7));

  do {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 8) {
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w8_ssse3(src_x, src_stride, temp + (z * 8), x_filter);
        } else {
          int i;
          // whole-pixel position: copy the center pixel; the +3 undoes the
          // src -= SUBPEL_TAPS / 2 - 1 adjustment above
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      transpose8x8_to_dst(temp, 8, dst + x, dst_stride);
    }

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}

static void filter_horiz_w4_ssse3(const uint8_t *const src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter) {
  __m128i s[4];
  __m128i temp;

  load_8bit_8x4(src, src_stride, s);
  transpose_16bit_4x4(s, s);

  temp = shuffle_filter_convolve8_8_ssse3(s, filter);
  // shrink each 16 bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void transpose4x4_to_dst(const uint8_t *const src,
                                const ptrdiff_t src_stride, uint8_t *const dst,
                                const ptrdiff_t dst_stride) {
  __m128i s[4];

  load_8bit_4x4(src, src_stride, s);
  s[0] = transpose_8bit_4x4(s);
  s[1] = _mm_srli_si128(s[0], 4);
  s[2] = _mm_srli_si128(s[0], 8);
  s[3] = _mm_srli_si128(s[0], 12);
  store_8bit_4x4(s, dst, dst_stride);
}

static void scaledconvolve_horiz_w4(const uint8_t *src,
                                    const ptrdiff_t src_stride, uint8_t *dst,
                                    const ptrdiff_t dst_stride,
                                    const InterpKernel *const x_filters,
                                    const int x0_q4, const int x_step_q4,
                                    const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  for (y = 0; y < h; y += 4) {
    int x_q4 = x0_q4;
    for (x = 0; x < w; x += 4) {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
        if (x_q4 & SUBPEL_MASK) {
          filter_horiz_w4_ssse3(src_x, src_stride, temp + (z * 4), x_filter);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      transpose4x4_to_dst(temp, 4, dst + x, dst_stride);
    }

    src += src_stride * 4;
    dst += dst_stride * 4;
  }
}

static __m128i filter_vert_kernel(const __m128i *const s,
                                  const int16_t *const filter) {
  __m128i ss[4];
  __m128i temp;

  // 00 10 01 11 02 12 03 13
  ss[0] = _mm_unpacklo_epi8(s[0], s[1]);
  // 20 30 21 31 22 32 23 33
  ss[1] = _mm_unpacklo_epi8(s[2], s[3]);
  // 40 50 41 51 42 52 43 53
  ss[2] = _mm_unpacklo_epi8(s[4], s[5]);
  // 60 70 61 71 62 72 63 73
  ss[3] = _mm_unpacklo_epi8(s[6], s[7]);

  temp = shuffle_filter_convolve8_8_ssse3(ss, filter);
  // shrink each 16 bit value to 8 bits
  return _mm_packus_epi16(temp, temp);
}

static void filter_vert_w4_ssse3(const uint8_t *const src,
                                 const ptrdiff_t src_stride, uint8_t *const dst,
                                 const int16_t *const filter) {
  __m128i s[8];
  __m128i temp;

  load_8bit_4x8(src, src_stride, s);
  temp = filter_vert_kernel(s, filter);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}

static void scaledconvolve_vert_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];

    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w4_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }

    y_q4 += y_step_q4;
  }
}

static void filter_vert_w8_ssse3(const uint8_t *const src,
                                 const ptrdiff_t src_stride, uint8_t *const dst,
                                 const int16_t *const filter) {
  __m128i s[8], temp;

  load_8bit_8x8(src, src_stride, s);
  temp = filter_vert_kernel(s, filter);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}

static void scaledconvolve_vert_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w8_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

static void filter_vert_w16_ssse3(const uint8_t *src,
                                  const ptrdiff_t src_stride,
                                  uint8_t *const dst,
                                  const int16_t *const filter, const int w) {
  int i;
  __m128i f[4];
  shuffle_filter_ssse3(filter, f);

  for (i = 0; i < w; i += 16) {
    __m128i s[8], s_lo[4], s_hi[4], temp_lo, temp_hi;

    loadu_8bit_16x8(src, src_stride, s);

    // interleave the rows of source pixels
    s_lo[0] = _mm_unpacklo_epi8(s[0], s[1]);
    s_hi[0] = _mm_unpackhi_epi8(s[0], s[1]);
    s_lo[1] = _mm_unpacklo_epi8(s[2], s[3]);
    s_hi[1] = _mm_unpackhi_epi8(s[2], s[3]);
    s_lo[2] = _mm_unpacklo_epi8(s[4], s[5]);
    s_hi[2] = _mm_unpackhi_epi8(s[4], s[5]);
    s_lo[3] = _mm_unpacklo_epi8(s[6], s[7]);
    s_hi[3] = _mm_unpackhi_epi8(s[6], s[7]);
    temp_lo = convolve8_8_ssse3(s_lo, f);
    temp_hi = convolve8_8_ssse3(s_hi, f);

    // shrink each 16 bit value to 8 bits; the low lane contains the first
    // eight convolve results and the high lane contains the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}

static void scaledconvolve_vert_w16(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *const dst,
    const ptrdiff_t dst_stride, const InterpKernel *const y_filters,
    const int y0_q4, const int y_step_q4, const int w, const int h) {
  int y;
  int y_q4 = y0_q4;

  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
  for (y = 0; y < h; ++y) {
    const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
    const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
    if (y_q4 & SUBPEL_MASK) {
      filter_vert_w16_ssse3(src_y, src_stride, &dst[y * dst_stride], y_filter,
                            w);
    } else {
      memcpy(&dst[y * dst_stride], &src_y[3 * src_stride], w);
    }
    y_q4 += y_step_q4;
  }
}

void aom_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                         int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  // (1) Interpolate horizontally into an intermediate buffer, temp.
  // (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  // original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When called from the frame scaling function, the smallest scaling factor
  // is x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
  // is still big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}

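// The 8-tap (*_v8/*_h8) and bilinear (*_v2/*_h2) kernels below are declared
// here and defined elsewhere (likely in the SSSE3 assembly sources).
// FUN_CONV_1D builds aom_convolve8_horiz_ssse3() and aom_convolve8_vert_ssse3()
// on top of these kernels and the intrinsics above.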
filter8_1dfunction aom_filter_block1d16_v8_ssse3;
filter8_1dfunction aom_filter_block1d16_h8_ssse3;
filter8_1dfunction aom_filter_block1d8_v8_ssse3;
filter8_1dfunction aom_filter_block1d8_h8_ssse3;
filter8_1dfunction aom_filter_block1d4_v8_ssse3;
filter8_1dfunction aom_filter_block1d4_h8_ssse3;

filter8_1dfunction aom_filter_block1d16_v2_ssse3;
filter8_1dfunction aom_filter_block1d16_h2_ssse3;
filter8_1dfunction aom_filter_block1d8_v2_ssse3;
filter8_1dfunction aom_filter_block1d8_h2_ssse3;
filter8_1dfunction aom_filter_block1d4_v2_ssse3;
filter8_1dfunction aom_filter_block1d4_h2_ssse3;

// void aom_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                                uint8_t *dst, ptrdiff_t dst_stride,
//                                const int16_t *filter_x, int x_step_q4,
//                                const int16_t *filter_y, int y_step_q4,
//                                int w, int h);
// void aom_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
//                               uint8_t *dst, ptrdiff_t dst_stride,
//                               const int16_t *filter_x, int x_step_q4,
//                               const int16_t *filter_y, int y_step_q4,
//                               int w, int h);
FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3)
FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3)